# 1. Load in Cleaned Article Text

In [1]:
import json
import pandas as pd
from unidecode import unidecode

# Read the cleaned-article JSON (a mapping of article id -> article record).
with open("/Users/taylor_bolt/Desktop/ASKE_MULTIVAC/cleanedArticles-03042019.json", 'r', encoding='utf-8') as jf:
    src_data = json.load(jf)

# Keep the body text of every article that has one.
texts = [article['text'] for article in src_data.values() if article['text'] is not None]

# Transliterate everything to ASCII with unidecode: non-ASCII characters
# mess up the R GloVe engine. There is probably a more sophisticated way to
# bridge that gap, but this is the quick and dirty solution.
texts_df = pd.DataFrame({'text': pd.Series(texts).apply(unidecode)})

# The raw structures are no longer needed; free the memory.
del texts, src_data

# 2. Import Domain-General and Domain-Specific Models

In [2]:
import pandas as pd
import numpy as np
import pickle

## Load in the Domain-Specific Glove Embedding Model computed using the 'trainEmbedding' R script
# The first CSV column holds the vocabulary words and becomes the index.
# keep_default_na=False stops pandas from parsing tokens such as "null", "nan"
# or "NA" as float NaN, which would silently drop those words from the
# vocabulary. Side effect: a genuinely missing numeric cell now stays a string
# and makes its column non-numeric, so a damaged file fails loudly downstream.
df = pd.read_csv('/Users/taylor_bolt/Desktop/ASKE_MULTIVAC/Domain_Adapted_GloVe.csv',index_col=0,keep_default_na=False)
# Row labels (words) and the matching embedding matrix, in file order.
domain_spec_vocab = list(df.index)
DS_embeddings = df.values


## Load in Stanford's 'Common Crawl' Domain-General Glove Embedding Model
# Only pull out the words that are contained in our corpus
# * This can take a while (~30min) - could use some optimization * 
# * For now, we just load in a previously saved 'model subset' *
def loadGloveModel(gloveFile, vocab=None):
    """Read a GloVe text file and keep only the vectors for words in a vocabulary.

    Each non-empty line is expected to be ``word v1 v2 ... vn`` separated by
    whitespace.

    Parameters
    ----------
    gloveFile : str
        Path to the GloVe text file (read as UTF-8).
    vocab : iterable of str, optional
        Words to keep. When omitted, the notebook-level ``domain_spec_vocab``
        is used, which matches the original single-argument behaviour.

    Returns
    -------
    dict
        Maps each kept word to a 1-D ``numpy`` float array. If a word occurs
        on several lines, the last occurrence wins.
    """
    # A set gives O(1) membership tests; scanning a list for every line of a
    # multi-gigabyte file is what made this function so slow.
    wanted = set(domain_spec_vocab if vocab is None else vocab)
    model = {}
    # The context manager guarantees the file handle is closed, even on error.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line: nothing to parse.
                continue
            word = splitLine[0]
            if word in wanted:
                model[word] = np.array([float(val) for val in splitLine[1:]])
    return model
# DG_embeddings = loadGloveModel('/Users/taylor_bolt/Desktop/ASKE_MULTIVAC/glove.42B.300d.txt')
# NOTE: unpickling can execute arbitrary code -- only load a subset file that
# you produced yourself (e.g. from the loadGloveModel output).
with open("/Users/taylor_bolt/Desktop/ASKE_MULTIVAC/model_subset_DG.p", "rb") as pf:
    DG_embeddings = pickle.load(pf)
domain_gen_vocab = list(DG_embeddings.keys())

## Post-processing to ensure rows match between the DG and DS embeddings
# Convert domain general (DG) embedding from dictionary to array
# (row i corresponds to domain_gen_vocab[i])
DG_embeddings = np.array([DG_embeddings[word] for word in domain_gen_vocab])


def _first_positions(words):
    """Map each word to the index of its first occurrence in ``words``."""
    positions = {}
    for idx, word in enumerate(words):
        positions.setdefault(word, idx)
    return positions


# Find the indices of matching words
both = set(domain_gen_vocab).intersection(domain_spec_vocab)
# Dictionary lookups give the same result as list.index() (first occurrence)
# without rescanning the whole vocabulary for every shared word.
gen_positions = _first_positions(domain_gen_vocab)
spec_positions = _first_positions(domain_spec_vocab)
indices_gen = [gen_positions[x] for x in both]
indices_spec = [spec_positions[x] for x in both]

# Reorder both embedding arrays so that row i of each refers to the same word.
# Rows follow the iteration order of `both`; set order for strings is not
# stable across interpreter sessions, so keep `list(both)` next to any saved
# result that must be mapped back to words.
DS_embeddings = DS_embeddings[indices_spec,:]
DG_embeddings = DG_embeddings[indices_gen,:]



# 3. Run CCA between Domain-General and Domain-Specific Embeddings

In [14]:
def domain_adapted_CCA(DG_embed,DS_embed,NC=100):
    """Combine two row-aligned embedding matrices with canonical correlation analysis.

    Parameters
    ----------
    DG_embed : array-like, shape (n_words, n_dims_general)
        Domain-general embeddings, one row per word.
    DS_embed : array-like, shape (n_words, n_dims_specific)
        Domain-specific embeddings; row i must describe the same word as
        row i of ``DG_embed``.
    NC : int, default 100
        Number of canonical components to keep.

    Returns
    -------
    cca : sklearn.cross_decomposition.CCA
        The fitted CCA model.
    DA_embeddings : ndarray, shape (n_words, NC)
        Domain-adapted embeddings: the mean of the two canonical projections.
    """
    from scipy.stats import zscore
    from sklearn.cross_decomposition import CCA
    # z-score every column (embedding dimension) across words; no transpose is applied
    DG_embed_norm = zscore(DG_embed)
    print(DG_embed_norm.shape)
    DS_embed_norm = zscore(DS_embed)
    print(DS_embed_norm.shape)
    # Initialize and fit the CCA model
    cca = CCA(n_components=NC)
    cca.fit(DG_embed_norm,DS_embed_norm)

    # Project the training data with the model fitted *here*. The previous code
    # read the notebook-global `cca_res`, which does not exist on a fresh kernel
    # and otherwise holds a stale fit. transform() also avoids the
    # x_scores_/y_scores_ attributes that newer scikit-learn releases removed.
    x_scores, y_scores = cca.transform(DG_embed_norm, DS_embed_norm)
    DA_embeddings = (x_scores + y_scores)/2
    return cca, DA_embeddings

# Fit CCA on the row-aligned embeddings, keeping 10 canonical components
# (the function's own default is 100).
cca_res, DA_embeddings = domain_adapted_CCA(
    DG_embed=DG_embeddings,
    DS_embed=DS_embeddings,
    NC=10,
)

(48536, 300)
(48536, 300)
