# 1. Load in Cleaned Article Text

> Load the parsed article files (.json format) and output their text as a dataframe

In [1]:
import json
import os
import pandas as pd
from unidecode import unidecode

# Work from the project root so the relative 'data/' and 'src/' paths resolve.
# The original author's checkout remains the default; set the MULTIVAC_ROOT
# environment variable to run the notebook from a different location.
os.chdir(os.environ.get('MULTIVAC_ROOT',
                        '/Users/taylor_bolt/PycharmProjects/multivac/multivac'))

with open("data/output_20195727_095737.json", 'r', encoding='utf-8') as jf:
    src_data = json.load(jf)

# Each record is a dict; the value stored under its first key holds the
# article body in its 'text' field.
texts = [next(iter(art.values()))['text'] for art in src_data]

# The "unidecode" step simplifies non-ASCII chars which
# mess up the R GloVe engine. Probably a more sophisticated way to
# bridge that gap but this is the quick and dirty solution
texts_df = pd.DataFrame({'text': pd.Series(texts).apply(unidecode)})

# Free the raw inputs; only texts_df is needed by the following cells.
del texts, src_data



# 2. Create Domain-Specific GloVe Embedding Model

> Create the domain-specific (DS) GloVe embedding model from the parsed journal article text. This is run from the 'trainEmbeddings.R' script, but called from Python using rpy2. This takes a while, so for quickness, we load in a previously run script. Given the size of the vocabulary, this process uses a lot of memory. We recommend running it on a computer with at least 16GB of RAM.

In [2]:
from rpy2.robjects import r, pandas2ri, numpy2ri

# Source all the functions contained in the 'trainEmbeddings' R file
r("source('src/data/trainEmbeddings.R')")
# Call the main GloveEmbedding function from the R script
trainEmbeddings_R = r("trainEmbeddings")
# Train DS GloVe Embedding model and output as a Numpy Matrix.
# pandas <-> R conversion is only needed to hand texts_df to R; the
# try/finally guarantees it is switched off again even if training fails,
# so a failed run does not leave the conversion active for the whole kernel.
pandas2ri.activate()
try:
    DS_embeddings_R = trainEmbeddings_R(texts_df)
finally:
    pandas2ri.deactivate()
# The article text is no longer needed once training has succeeded.
del texts_df
# First element of the R result: the embedding matrix
DS_embeddings = numpy2ri.rpy2py(DS_embeddings_R[0])
# Get DS GloVe vocabulary (second element of the R result)
domain_spec_vocab = list(DS_embeddings_R[1])
del DS_embeddings_R


R[write to console]: Loading required package: data.table

R[write to console]: Loading required package: dplyr

R[write to console]: 
Attaching package: ‘dplyr’


R[write to console]: The following objects are masked from ‘package:data.table’:

    between, first, last


R[write to console]: The following objects are masked from ‘package:stats’:

    filter, lag


R[write to console]: The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


R[write to console]: Loading required package: text2vec

R[write to console]: Loading required package: Rtsne

R[write to console]: Loading required package: quanteda

R[write to console]: Package version: 1.4.1

R[write to console]: Parallel computing: 2 of 12 threads used.

R[write to console]: See https://quanteda.io for tutorials and examples.

R[write to console]: 
Attaching package: ‘quanteda’


R[write to console]: The following object is masked from ‘package:utils’:

    View


R[write to console]: Lo

INFO [2019-10-09 13:07:24] 2019-10-09 13:07:24 - epoch 1, expected cost 0.0563
INFO [2019-10-09 13:07:28] 2019-10-09 13:07:28 - epoch 2, expected cost 0.0294
INFO [2019-10-09 13:07:32] 2019-10-09 13:07:32 - epoch 3, expected cost 0.0226
INFO [2019-10-09 13:07:36] 2019-10-09 13:07:36 - epoch 4, expected cost 0.0191
INFO [2019-10-09 13:07:39] 2019-10-09 13:07:39 - epoch 5, expected cost 0.0168
INFO [2019-10-09 13:07:43] 2019-10-09 13:07:43 - epoch 6, expected cost 0.0152
INFO [2019-10-09 13:07:47] 2019-10-09 13:07:47 - epoch 7, expected cost 0.0139
INFO [2019-10-09 13:07:51] 2019-10-09 13:07:51 - epoch 8, expected cost 0.0129
INFO [2019-10-09 13:07:54] 2019-10-09 13:07:54 - epoch 9, expected cost 0.0121
INFO [2019-10-09 13:07:58] 2019-10-09 13:07:58 - epoch 10, expected cost 0.0115
INFO [2019-10-09 13:08:02] 2019-10-09 13:08:02 - epoch 11, expected cost 0.0109
INFO [2019-10-09 13:08:06] 2019-10-09 13:08:06 - epoch 12, expected cost 0.0104
INFO [2019-10-09 13:08:09] 2019-10-09 13:08:09 - 

# 3. Import Domain-General GloVe Model

> Load in the domain-general (DG) GloVe embedding model from the 'Common Crawl' pre-trained model from Stanford (https://nlp.stanford.edu/projects/glove/). This is saved as a .txt on disk. This takes a while to load (~30min). Thus, repeatedly loading this in is fairly inefficient. Rather, I have previously run this script and saved the relevant parts of the model to a pickle file on disk. However, I provide the code commented out below to load in the data if necessary (most relevant is the loadGloveModel function).

In [36]:
import pandas as pd
import numpy as np
import pickle

## Load in Stanford's 'Common Crawl' Domain-General Glove Embedding Model
# Only pull out the words that are contained in our corpus
# * This can take a while (~30min) - could use some optimization * 

def loadGloveModel(gloveFile, vocab=None):
    """Read a GloVe text file and keep only the vectors of wanted tokens.

    Each non-empty line of the file is split on whitespace: the first field
    is the token, the remaining fields are the vector components.

    Parameters
    ----------
    gloveFile : str
        Path to the GloVe embeddings in plain-text format.
    vocab : iterable of str, optional
        Tokens to keep. When omitted, the notebook-level
        ``domain_spec_vocab`` is used (the original behaviour).

    Returns
    -------
    dict
        Maps each retained token to a 1-D float ``numpy`` array.
    """
    if vocab is None:
        vocab = domain_spec_vocab
    # A set gives constant-time membership tests; scanning a list for every
    # line of a multi-million-line file is what made loading so slow.
    wanted = set(vocab)
    model = {}
    # 'with' closes the file even on error; the encoding is explicit so the
    # result does not depend on the platform default.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Skip blank lines instead of failing on splitLine[0]
                continue
            word = splitLine[0]
            if word in wanted:
                model[word] = np.array([float(val) for val in splitLine[1:]])
    return model
# Build the domain-general (DG) lookup: a dict mapping each retained token to its vector
DG_embeddings = loadGloveModel('data/glove.42B.300d.txt')


# 4. Run CCA b/w Domain-General and Domain-Specific GloVe Embeddings

> Create the domain-adapted (DA) GloVe embeddings by running a CCA on the tokens that are shared between the DG and DS vocabularies. The vector for each token of the DA GloVe embedding model is an equally weighted average of its DG and DS canonical scores (N = 100 components) from the CCA analysis.

In [37]:
from scipy.stats import zscore 
from sklearn.cross_decomposition import CCA

## Processing to ensure rows match between the DG and DS embeddings
# Convert domain general (DG) embedding from dictionary to array.
# Guarded so the cell can be re-run: after the first run DG_embeddings is
# already an array (and domain_gen_vocab already exists in the kernel).
if isinstance(DG_embeddings, dict):
    domain_gen_vocab = list(DG_embeddings)
    DG_embeddings = np.array(list(DG_embeddings.values()))

# Token -> first row position, so lookups are O(1) instead of list.index scans
gen_position = {}
for pos, token in enumerate(domain_gen_vocab):
    gen_position.setdefault(token, pos)
spec_position = {}
for pos, token in enumerate(domain_spec_vocab):
    spec_position.setdefault(token, pos)

# Find the indices of matching words.
# The shared tokens are walked in DS-vocabulary order rather than set order,
# so the row order of the output is the same on every run.
both = set(domain_gen_vocab).intersection(domain_spec_vocab)
shared_tokens = [token for token in spec_position if token in both]
indices_gen = [gen_position[token] for token in shared_tokens]
indices_spec = [spec_position[token] for token in shared_tokens]
indices_spec_notDG = [spec_position[token] for token in domain_spec_vocab if token not in both]

# Sort and subset domain specific (DS) array to match indices of DG array
DS_embeddings_subset = DS_embeddings[indices_spec,:].copy()
DG_embeddings_subset = DG_embeddings[indices_gen,:].copy()


def domain_adapted_CCA(DG_embed,DS_embed,NC=100):
    """Fit a CCA between two row-aligned embedding matrices.

    Both inputs are z-scored (their shapes are printed), a CCA with ``NC``
    components is fit with the DG matrix as X and the DS matrix as Y, and
    the two sets of training scores are averaged element-wise.

    Returns the fitted CCA object and the averaged score matrix.
    """
    standardized = []
    for embed in (DG_embed, DS_embed):
        embed_norm = zscore(embed)
        print(embed_norm.shape)
        standardized.append(embed_norm)
    DG_embed_norm, DS_embed_norm = standardized

    # Build the model first, then fit it on the standardized pair
    cca = CCA(n_components=NC)
    cca.fit(DG_embed_norm, DS_embed_norm)

    # Mean of the X-side and Y-side scores of the training tokens
    score_sum = cca.x_scores_ + cca.y_scores_
    DA_embeddings = score_sum / 2
    return cca, DA_embeddings

cca_res, DA_embeddings = domain_adapted_CCA(DG_embeddings_subset,DS_embeddings_subset,NC=100)

(11075, 300)
(11075, 300)




# 5. Project Left-Out Domain-Specific Tokens on to Domain-Adapted Space

> The tokens of the DS embedding model that are left out of the intersection between the DS and DG embedding model are projected into the 100-dimensional canonical vector space from the CCA analysis (via matrix multiplication) and appended to the DA embedding vectors (created above). Your final output is 'DA_embeddings_final'.

In [40]:
# DS vectors of the tokens that have no DG counterpart
DS_embeddings_notinDG = DS_embeddings[indices_spec_notDG,:]
# Standardize with the column statistics of the shared-token DS matrix the
# CCA was fit on (population std, as scipy's zscore uses by default), so the
# left-out tokens are on the same scale as the training rows. Z-scoring them
# against their own mean/std would shift them relative to the fitted space.
DS_fit_mean = DS_embeddings_subset.mean(axis=0)
DS_fit_std = DS_embeddings_subset.std(axis=0)
DS_embeddings_notinDG_norm = (DS_embeddings_notinDG - DS_fit_mean) / DS_fit_std

# y_rotations_ is the matrix that maps (un-deflated) Y data to Y scores;
# y_weights_ only applies to the deflated matrices used during fitting.
# NOTE(review): CCA's own internal re-scaling of the already z-scored data is
# not reproduced here; it should be close to a no-op -- confirm if exactness matters.
DA_notinDG_embeddings = cca_res.y_rotations_.T @ DS_embeddings_notinDG_norm.T
DA_embeddings_final = np.append(DA_embeddings,DA_notinDG_embeddings.T,axis=0)

# Vocabulary in the same row order as DA_embeddings_final:
# shared tokens first, then the projected left-out tokens
vocab_final_indx = indices_spec + indices_spec_notDG
vocab_final = [domain_spec_vocab[i] for i in vocab_final_indx]
assert len(vocab_final) == DA_embeddings_final.shape[0], "vocab / embedding row mismatch"

In [44]:
import pickle

# Bundle the final domain-adapted matrix with its row-aligned vocabulary
glove_embeddings = {'embeddings': DA_embeddings_final, 'vocab': vocab_final}
# 'with' makes sure the file is flushed and closed once the dump finishes
with open('data/DA_glove_embeddings.pickle', 'wb') as pickle_file:
    pickle.dump(glove_embeddings, pickle_file)