# 1. Load in Cleaned Article Text

> Load the parsed article file (.json format) and output the article text as a dataframe.

In [2]:
import json
import pandas as pd
from unidecode import unidecode

# Read the parsed-article JSON from disk
with open("/Users/taylor_bolt/Desktop/ASKE_MULTIVAC/cleanedArticles-03042019.json", 'r', encoding='utf-8') as jf:
    src_data = json.load(jf)

# Collect the 'text' entry of every article record that has one
texts = [record['text'] for record in src_data.values() if record['text'] is not None]

# Non-ASCII characters mess up the R GloVe engine, so every text is
# transliterated with "unidecode" first. There is probably a more
# sophisticated way to bridge that gap; this is the quick and dirty one.
texts_df = pd.DataFrame({'text': pd.Series(texts).apply(unidecode)})

# The raw inputs are no longer needed once the dataframe exists
del texts, src_data



# 2. Create Domain-Specific GloVe Embedding Model

> Create the domain-specific (DS) GloVe embedding model from the parsed journal article text. The model is trained by the 'trainEmbeddings.R' script, which is called from Python using rpy2. Training takes a while, so for quickness we load in the results of a previous run. Given the size of the vocabulary, this process uses a lot of memory; we recommend running it on a computer with at least 16GB of RAM. Note that the later sections require `DS_embeddings` and `domain_spec_vocab` to be defined before they are run.

In [4]:
# from rpy2.robjects import r, pandas2ri, numpy2ri

# # Source all the functions contained in the 'trainEmbeddings' R file
# r("source('trainEmbeddings.R')")
# # Call the main GloveEmbedding function from the R script
# trainEmbeddings_R = r("trainEmbeddings")
# # Train DS GloVe Embedding model and output as a Numpy Matrix
# pandas2ri.activate()
# DS_embeddings_R = trainEmbeddings_R(texts_df)
# del texts_df
# pandas2ri.deactivate()
# DS_embeddings = numpy2ri.rpy2py(DS_embeddings_R[0])
# # Get DS GloVe vocabulary
# domain_spec_vocab = list(DS_embeddings_R[1])
# del DS_embeddings_R


# 3. Import Domain-General GloVe Model

> Load in the domain-general (DG) GloVe embedding model from the 'Common Crawl' pre-trained model from Stanford (https://nlp.stanford.edu/projects/glove/). This is saved as a .txt on disk. This takes a while to load (~30min). Thus, repeatedly loading this in is fairly inefficient. Rather, I have previously run this script and saved the relevant parts of the model to a pickle file on disk. However, I provide the code commented out below to load in the data if necessary (most relevant is the loadGloveModel function).

In [5]:
import pandas as pd
import numpy as np
import pickle

## Load in Stanford's 'Common Crawl' Domain-General Glove Embedding Model
# Only pull out the words that are contained in our corpus
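# * This can take a while (~30min). A likely bottleneck is the `word in domain_spec_vocab`
#   test below, which scans a list for every line of the GloVe file; converting the
#   vocabulary to a set first should speed this up considerably (untested). *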

# def loadGloveModel(gloveFile):
#     f = open(gloveFile,'r')
#     model = {}
#     for line in f:
#         splitLine = line.split()
#         word = splitLine[0]
#         if word in domain_spec_vocab:
#             #print(word)
#             embedding = np.array([float(val) for val in splitLine[1:]])
#             model[word] = embedding
#     return model
# DG_embeddings = loadGloveModel('/Users/taylor_bolt/Desktop/ASKE_MULTIVAC/glove.42B.300d.txt')

# * For now, we just load in a previously saved 'model subset' *
# The file is opened in a `with` block so the handle is closed deterministically
# once the pickle has been read, instead of relying on garbage collection.
# NOTE: unpickling can execute arbitrary code - only load a file you produced yourself.
with open("/Users/taylor_bolt/Desktop/ASKE_MULTIVAC/model_subset_DG.p", "rb") as dg_file:
    DG_embeddings = pickle.load(dg_file)
# Vocabulary in the same order as the entries of the embedding dictionary
domain_gen_vocab = list(DG_embeddings.keys())


# 4. Run CCA b/w Domain-General and Domain-Specific GloVe Embeddings

> Create the domain-adapted (DA) GloVe embeddings by running a CCA on the tokens that are shared between the DG and DS vocabularies. The vector for each token of the DA GloVe embedding model is the equally weighted average of its two canonical score vectors (N = 100 components) from the CCA analysis.

In [6]:
from scipy.stats import zscore 
from sklearn.cross_decomposition import CCA

## Processing to ensure rows match between the DG and DS embeddings
# Convert domain general (DG) embedding from dictionary to array (one row per
# token, in dictionary order). The isinstance guard keeps this cell re-runnable:
# on a second run DG_embeddings is already an array and has no .values().
if isinstance(DG_embeddings, dict):
    DG_embeddings = np.array(list(DG_embeddings.values()))


def _first_positions(vocab):
    """Map each token to the index of its first occurrence in `vocab`.

    Gives the same index as `vocab.index(token)`, but one dictionary lookup
    per token replaces a linear scan of the list, so aligning two vocabularies
    no longer costs quadratic time.
    """
    positions = {}
    for position, token in enumerate(vocab):
        positions.setdefault(token, position)
    return positions


gen_position = _first_positions(domain_gen_vocab)
spec_position = _first_positions(domain_spec_vocab)

# Find the indices of matching words. Both index lists are built by iterating
# the same set, so entry k of each list refers to the same token.
both = set(domain_gen_vocab).intersection(domain_spec_vocab)
indices_gen = [gen_position[x] for x in both]
indices_spec = [spec_position[x] for x in both]
# DS tokens with no DG counterpart, kept in DS vocabulary order
indices_spec_notDG = [spec_position[x] for x in domain_spec_vocab if x not in both]

# Sort and subset domain specific (DS) array to match indices of DG array
DS_embeddings_subset = DS_embeddings[indices_spec,:].copy()
DG_embeddings_subset = DG_embeddings[indices_gen,:].copy()


def domain_adapted_CCA(DG_embed,DS_embed,NC=100):
    """Fit a CCA between two row-aligned embedding matrices and average the scores.

    Parameters
    ----------
    DG_embed : array-like
        Domain-general embeddings, one row per token.
    DS_embed : array-like
        Domain-specific embeddings; row k must describe the same token as
        row k of `DG_embed`.
    NC : int, default 100
        Number of canonical components to extract.

    Returns
    -------
    cca : sklearn.cross_decomposition.CCA
        The fitted CCA model.
    DA_embeddings : ndarray
        Element-wise mean of the DG and DS canonical scores, one row per
        token and one column per canonical component.

    Notes
    -----
    The shape of each z-scored input is printed as a sanity check.
    """
    # Z-score (scipy's default standardises along axis 0, i.e. per column)
    DG_embed_norm = zscore(DG_embed)
    print(DG_embed_norm.shape)
    DS_embed_norm = zscore(DS_embed)
    print(DS_embed_norm.shape)
    # Initialize CCA Model
    cca = CCA(n_components=NC)
    cca.fit(DG_embed_norm,DS_embed_norm)

    # The fitted `x_scores_` / `y_scores_` attributes were deprecated in
    # scikit-learn 0.24 and may be missing from CCA in later releases.
    # Transforming the training data is the documented replacement and yields
    # the same scores (up to floating-point error).
    x_scores, y_scores = cca.transform(DG_embed_norm, DS_embed_norm)

    DA_embeddings = (x_scores + y_scores)/2
    return cca, DA_embeddings

cca_res, DA_embeddings = domain_adapted_CCA(DG_embeddings_subset,DS_embeddings_subset,NC=100)

(48534, 300)
(48534, 300)




# 5. Project Left-Out Domain-Specific Tokens on to Domain-Adapted Space

> The tokens of the DS embedding model that are left out of the intersection between the DS and DG embedding model are projected into the 100-dimensional canonical vector space from the CCA analysis (via matrix multiplication) and appended to the DA embedding vectors (created above). Your final output is 'DA_embeddings_final'.

In [19]:
# Rows of the DS embedding matrix for tokens that have no DG counterpart
DS_embeddings_notinDG = DS_embeddings[indices_spec_notDG,:]
# NOTE(review): these tokens are z-scored with their own column statistics, not
# with the statistics of the tokens the CCA was fitted on - confirm this is intended.
DS_embeddings_notinDG_norm = zscore(DS_embeddings_notinDG)

# Project with `y_rotations_`, the matrix scikit-learn itself uses to turn Y
# into canonical scores. `y_weights_` only agrees with it on the first
# component (later weights are estimated on deflated data), so projecting with
# the weights would place these tokens in a different space from the DA
# embeddings they are appended to.
DA_notinDG_embeddings = cca_res.y_rotations_.T @ DS_embeddings_notinDG_norm.T
# Stack the shared-token rows first, then the projected left-out tokens
DA_embeddings_final = np.concatenate([DA_embeddings, DA_notinDG_embeddings.T], axis=0)