# 1. Parse Arxiv PDFs

Below parses the pdfs into text, while attempting to remove the title and reference pages. For some pdfs, the title and reference pages cannot be removed.

In [None]:
def parse_all_pdfs(pdf_dir):
    import slate
    import sys
    import glob, os
    import logging
    logging.propagate = False 
    logging.getLogger().setLevel(logging.ERROR)
    
    os.chdir(pdf_dir)
    text_all=[]
    files = glob.glob("*.pdf")
    for file in glob.glob("*.pdf"):
        print(file)
        try:
            with open(pdf_dir + '/' + file,'rb') as f:
                doc = slate.PDF(f)
        except:
            print('reading of ' + file + ' failed')
            continue
        doc = ' '.join([' '.join(x.split()) for x in doc])
        ## Try to get rid of irrelevant parts of the text (Title Page, References, Appendix)
        # Remove title page
        text_split = doc.lower().split(' abstract ')
        if len(text_split)>1: 
            text_no_title = ' '.join(text_split[1:])
        else: # if no abstract text, try to get introduction onward
            text_split = doc.lower().split(' introduction ')
    
            if len(text_split)>1:
                text_no_title = ' '.join(text_split[1:])
            else: # if still can't get any split, just give up and take the title page onward
                text_no_title = text_split[0]
                print("can't remove title page :(")
                
        # Remove reference page
        text_split = text_no_title.lower().split(' acknowledgements ')
        if len(text_split)>1:
            text_no_title_ref = ' '.join(text_split[:-1])
        else: # sometimes 'acknowledgements' is spelled 'acknowledgments' without the 'e'
            text_split = text_no_title.lower().split(' acknowledgments ')
            
            if len(text_split)>1: 
                text_no_title_ref = ' '.join(text_split[:-1])
            else: # If no acknowledgements section try to get references
                text_split = text_no_title.lower().split(' references ')
                
                if len(text_split)>1:
                    text_no_title_ref = ' '.join(text_split[:-1])
                else: # if still can't get any split, just give up and take reference pages
                    text_no_title_ref = text_split[0]
                    print("can't remove reference pages :(")
        
        text_all.append(text_no_title_ref)
        
    return text_all

            
# Parse all arxiv pdf articles into text
text_all = parse_all_pdfs('/Users/taylor_bolt/Desktop/ASKE_MULTIVAC/arxiv')   

# 2. Tokenizing and Pre-processing of PDFs

Below use SpaCy to preprocess each pdf text. Note, we exclude 'uninformative' parts of speech - punctuation, particles, numbers, etc.

In [1]:
def spacy_preprocess(text):
    import spacy
    nlp = spacy.load('en_core_web_sm',disable=['parser','ner'])
    doc_final = []
    select_pos = ['ADJ','NOUN','ADV','VERB','ADP','PROPN']

    # Loop through all reviews, lemmatize words and select parts of speech with SpaCy
    for i in text:
        doc_new = nlp(i)
        doc_new = [token.lower_ for token in doc_new if token.pos_ in select_pos if len(token)<20 if token.is_alpha]
        doc_final.append(doc_new)
    return doc_final

# Load all text saved in pickle object and apply spacy preprocessing
import pickle
pickle_in = open("/Users/taylor_bolt/Desktop/ASKE_MULTIVAC/GloVe_CCA/arxiv_text_all.pickle","rb")
text_all = pickle.load(pickle_in)
text_prep = spacy_preprocess(text_all)



# 3. Create Word Co-Occurence Matrix

The input to the GloVe Embedding Model requires a co-occurence matrix. We essentially slide a fixed window size (n = 10 words) across each parse pdf, and count the co-occurences between all the words of our vocabulary.

Note, I had to limit the vocabulary to 20,000 words to get the GloVe embedding model to converge in a reasonable time (the entire vocab is about 40,000 words).

In [None]:
def co_occur_matrix(text,window_size=10,overlap=9,N_sub=20000):
    from sklearn.feature_extraction.text import CountVectorizer
    import numpy as np
    import random
    # Create dummy function to 'fool' CountVectorizer into not tokenizing an already tokenized list
    def dummy_fun(text):
          return text
    random.seed(1) # make sure we get the same random numbers again for debugging
    # Find vocabulary of words from all text
    vocab = list(set([y for x in text for y in x]))
    # Have to downsample vocabulary for GloVe estimation
    rand_indx = random.sample(list(range(len(vocab))),N_sub)
    vocab_subset = [vocab[i] for i in rand_indx]
    # Initialize count model
    count_model = CountVectorizer(analyzer="word",tokenizer=dummy_fun,preprocessor=dummy_fun,
                                  vocabulary=vocab_subset,token_pattern=None) 

    # Loop through all documents provided
    for loop_indx,doc in enumerate(text):
        #print(loop_indx)
        # If this is the first document, initialize the word-co-occurence 
        if loop_indx==0:
            # Get all windows of specifized size from document
            windows_all = [doc[i:i+overlap] for i in range(0, len(doc), window_size-overlap)]
            X = count_model.fit_transform(windows_all)
            # Set all occurences to 1, just in case the same word appears more than once in a window
            X[X>0] = 1
            # Compute Co-occurence matrix w/ matrix multiplication
            Xc = (X.T * X)
        # After the first document, add each document co-occurence to overall co-occurence matrix
        else: 
            # Get all windows of specifized size from document
            windows_all = [doc[i:i+overlap] for i in range(0, len(doc), window_size-overlap)]
            X = count_model.fit_transform(windows_all)
            # Set all occurences to 1, just in case the same word appears more than once in a window
            X[X>0] = 1
            # Compute Co-occurence matrix w/ matrix multiplication
            Xc_temp = (X.T * X)
            Xc = Xc + Xc_temp
    
    return Xc, vocab_subset
            
Xc, vocab = co_occur_matrix(text_prep)

    

# 4. Run GloVe Model on Pre-processed Text

We estimate 300 dimensions in the GloVe model.

In [None]:
from mittens import GloVe
Xc_array = Xc.toarray()
glove_model = GloVe(n=300, max_iter=200)  # 300 is the embedding dimension
embeddings = glove_model.fit(Xc_array)

# 5. Load in Pretrained GloVe Model

To combine the pretrained Glove Model and our domain specific Glove Model we need to find the intersection between the two vocabularies. We load in that vocabulary of the pretrained GloVe model that is also in our domain specific model

In [None]:
import numpy as np
def loadGloveModel(gloveFile):
    f = open(gloveFile,'r')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        if word in vocab:
            print(word)
            embedding = np.array([float(val) for val in splitLine[1:]])
            model[word] = embedding
    return model
model_subset = loadGloveModel('glove.42B.300d.txt')

# 6. CCA between Pretrained and Domain-specific GloVe Models

In [7]:
def domain_adapted_CCA(DG_embed,DS_embed,DS_vocab,NC=100):
    from scipy.stats import zscore
    import numpy as np
    from sklearn.cross_decomposition import CCA
    # Convert domain general (DG) embedding from dictionary to array
    DG_embed_new = np.array([DG_embed[i] for i in DG_embed.keys()])
    # Find the indices of matching words
    indx_match = []
    for i, x in enumerate(DG_embed.keys()):
        for j, y in enumerate(DS_vocab):
            if x == y:
                indx_match.append([i, j])
    # Sort domain specific (DS) embedding array to match indices of DG array
    sort_indx = np.array([i[1] for i in indx_match]).argsort()
    DS_embed_new = DS_embed[sort_indx,:]
    # Transpose both and z-score
    DG_embed_norm = zscore(DG_embed_new)
    print(DG_embed_norm.shape)
    DS_embed_norm = zscore(DS_embed_new)
    print(DS_embed_norm.shape)
    # Initialize CCA Model
    cca = CCA(n_components=NC)
    cca.fit(DG_embed_norm,DS_embed_norm)
    
    return cca

# # Load in previously estimated GloVe model on Arxiv Articles
# import pickle 
# pickle_in = open("DA_embedding_D300.dat","rb")
# pickle_output = pickle.load(pickle_in)
# DS_embed = pickle_output[0]
# DS_vocab = pickle_output[1]

# # Load in Pre-trained GloVe that was processed earlier
# import pickle 
# pickle_in = open("glove_pretrained_subset.pickle","rb")
# pickle_output = pickle.load(pickle_in)
# DG_embed = pickle_output

#cca_res = domain_adapted_CCA(DG_embed,DS_embed,DS_vocab,NC=10)

cca_res = domain_adapted_CCA(model_subset,embeddings,vocab,NC=10)

(14630, 300)
(14630, 300)




In [12]:
import numpy as np
np.corrcoef(cca_res.y_scores_[:,0],cca_res.x_scores_[:,0])

array([[1.        , 0.27892825],
       [0.27892825, 1.        ]])