In [1]:
#import functions

import os, yaml, json, re

#pre-defined functions
import functions

#new libraries
import pandas as pd
from sklearn.feature_extraction.text import (CountVectorizer, TfidfTransformer)
from sklearn.metrics.pairwise import cosine_similarity
#################################################
#set variables

settingsFile = "Memex_config.yml"
with open(settingsFile) as settingsStream: #context manager closes the config file once it is parsed
    settings = yaml.safe_load(settingsStream)
pathToMemex = settings["path_to_memex"] #root folder used by the cells below to locate and save files

In [2]:
#aggregating publications into corpus

ocrFiles = functions.dicOfRelevantFiles(pathToMemex, ".json") #use pre-defined function (creates a dictionary of citationKey:Path pairs)
citeKeys = list(ocrFiles.keys()) #list of citekeys

docList   = [] #cleaned text of each publication
docIdList = [] #citekeys, in the same order as docList

for citeKey in citeKeys: #one iteration per publication
    with open(ocrFiles[citeKey]) as ocrFile: #context manager closes each file right after it is read
        docData = json.load(ocrFile)

    docId = citeKey #the citekey doubles as the document id
    doc   = " ".join(docData.values()) #assumes every value in the JSON is a string - TODO confirm against the OCR step

    #raw strings keep \w, \W and \d as regex classes without invalid-escape warnings
    doc   = re.sub(r'(\w)-\n(\w)', r'\1\2', doc) #re-joins words hyphenated across a line break ("exam-\nple" -> "example")
    doc   = re.sub(r'\W+', ' ', doc) #replaces runs of non-word characters (punctuation, line breaks) with a space
    doc   = re.sub(r'\d+', ' ', doc) #replaces runs of digits with a space
    doc   = re.sub(r' +', ' ', doc) #collapses repeated spaces into one

    docList.append(doc)
    docIdList.append(docId)

In [3]:
#preview the corpus instead of printing every full text, which would flood the notebook output
print("documents in corpus:", len(docList))
for previewId, previewDoc in zip(docIdList[:3], docList[:3]): #first three documents only
    print(previewId, "|", len(previewDoc), "characters |", previewDoc[:200])



In [5]:
#vectorising the corpus with sklearn

#term counts, unigrams only:
#  min_df=5   -> ignore terms that occur in fewer than 5 documents
#  max_df=0.5 -> ignore terms that occur in more than half of the documents
vectorizer = CountVectorizer(
    ngram_range=(1, 1),
    min_df=5,
    max_df=0.5,
)
countVectorized = vectorizer.fit_transform(docList) #sparse document-term count matrix

#re-weight the raw counts as tf-idf
tfidfTransformer = TfidfTransformer(use_idf=True, smooth_idf=True)
vectorized = tfidfTransformer.fit_transform(countVectorized) #sparse tf-idf matrix (one row per document)

#pairwise cosine similarity between all documents
cosineMatrix = cosine_similarity(vectorized)

In [6]:
#inspect the sparse tf-idf matrix; each printed entry reads "(row index, column index)  tf-idf weight"
print(vectorized)

  (0, 5270)	0.010936323947681053
  (0, 5266)	0.010412049923747018
  (0, 5257)	0.010412049923747018
  (0, 5247)	0.008874826345825473
  (0, 5225)	0.007016004389837301
  (0, 5222)	0.009198980942768446
  (0, 5169)	0.007819974220054482
  (0, 5165)	0.009557317224967091
  (0, 5149)	0.008874826345825473
  (0, 5131)	0.008578896384351971
  (0, 5125)	0.015639948440108964
  (0, 5096)	0.040746180572468337
  (0, 5053)	0.008874826345825473
  (0, 5040)	0.0076004755437368504
  (0, 5038)	0.017749652691650946
  (0, 5032)	0.017157792768703942
  (0, 5030)	0.006841553379439362
  (0, 5027)	0.008874826345825473
  (0, 5011)	0.015200951087473701
  (0, 5010)	0.010936323947681053
  (0, 4988)	0.00651739878249639
  (0, 4985)	0.007819974220054482
  (0, 4981)	0.008578896384351971
  (0, 4976)	0.008054622360417934
  (0, 4972)	0.009957903107065933
  :	:
  (53, 1017)	0.006226937361580354
  (53, 949)	0.004586150625150604
  (53, 850)	0.003624803858423162
  (53, 802)	0.011402910326586434
  (53, 776)	0.009172301250301208
  (

In [8]:
#tf-idf matrix conversion

#get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
#when it exists and fall back to the old name on releases that predate it
if hasattr(vectorizer, "get_feature_names_out"):
    featureNames = vectorizer.get_feature_names_out()
else:
    featureNames = vectorizer.get_feature_names()

tfidfTable = pd.DataFrame(vectorized.toarray(), index=docIdList, columns=featureNames) #rows: citekeys, columns: terms
print(tfidfTable.head())
print("tfidfTable Shape: ", tfidfTable.shape)
tfidfTable = tfidfTable.transpose() #terms become rows, citekeys become columns
tfidfTableDic = tfidfTable.to_dict() #{citekey: {term: tf-idf weight}}

                          __   aa      aaai        ab  abilities  abraham  \
labbe_tool_2006     0.000000  0.0  0.000000  0.021873        0.0      0.0   
leroux_notion_2012  0.000000  0.0  0.000000  0.000000        0.0      0.0   
kraker_case_2011    0.011378  0.0  0.012781  0.000000        0.0      0.0   
kidd_new_2019       0.000000  0.0  0.000000  0.000000        0.0      0.0   
gold_debates_2019   0.000000  0.0  0.000000  0.000000        0.0      0.0   

                    abs   absence  absent  absolute  ...  zero  zhang  zhao  \
labbe_tool_2006     0.0  0.000000     0.0  0.060052  ...   0.0    0.0   0.0   
leroux_notion_2012  0.0  0.000000     0.0  0.000000  ...   0.0    0.0   0.0   
kraker_case_2011    0.0  0.000000     0.0  0.000000  ...   0.0    0.0   0.0   
kidd_new_2019       0.0  0.023019     0.0  0.000000  ...   0.0    0.0   0.0   
gold_debates_2019   0.0  0.000000     0.0  0.000000  ...   0.0    0.0   0.0   

                        zhou  zhu      zone   zu  zum  zweig  

In [9]:
#CosineSimilarity matrix conversion

#preview the raw matrix first; rows and columns still carry positional integer labels here
cosinePreview = pd.DataFrame(cosineMatrix)
print(cosinePreview.head())
print("cosineTable Shape: ", cosinePreview.shape)

#labelled table: citekeys on both axes
cosineTable = pd.DataFrame(cosineMatrix, index=docIdList, columns=docIdList)
cosineTableDic = cosineTable.to_dict() #{citekey: {citekey: cosine similarity}}

         0         1         2         3         4         5         6   \
0  1.000000  0.225373  0.026659  0.029159  0.051935  0.075595  0.023653   
1  0.225373  1.000000  0.044770  0.007456  0.053238  0.008812  0.003940   
2  0.026659  0.044770  1.000000  0.048836  0.051594  0.056738  0.012953   
3  0.029159  0.007456  0.048836  1.000000  0.066902  0.053294  0.016420   
4  0.051935  0.053238  0.051594  0.066902  1.000000  0.101972  0.016374   

         7         8         9   ...        44        45        46        47  \
0  0.053719  0.062995  0.075172  ...  0.403040  0.030568  0.084486  0.033176   
1  0.007490  0.007547  0.225944  ...  0.001905  0.013170  0.003554  0.007117   
2  0.027716  0.056617  0.016883  ...  0.020544  0.197619  0.029645  0.071304   
3  0.018858  0.031090  0.017210  ...  0.021882  0.057149  0.014112  0.044136   
4  0.020568  0.042924  0.042724  ...  0.020827  0.066160  0.023902  0.034510   

         48        49        50        51        52        53  
0  0

In [73]:
#filtering

def textAnalysis_filter(d, threshold=0.1, verbose=True):
    """Keep only the inner entries whose value is strictly greater than ``threshold``.

    Parameters
    ----------
    d : dict
        Two-level mapping of the form ``{outer_key: {inner_key: number}}``.
    threshold : float, optional
        Exclusive lower bound; an entry is kept only if ``value > threshold``.
        Defaults to 0.1.
    verbose : bool, optional
        When True (default), print ``<outer_key> ... processing`` for every
        outer key as a progress indicator.

    Returns
    -------
    dict
        A new dictionary with every outer key of ``d``, each mapped to a new
        inner dictionary holding only the entries above the threshold. An
        outer key with no surviving entries maps to an empty dictionary.
        ``d`` itself is not modified.
    """
    dic_filtered = {} #create empty dictionary
    for t_id, t_calc in d.items(): #loop through first layer of dic
        if verbose:
            print(t_id, "... processing") #just to see progress

        #filter the inner dictionary once; it does not need a loop of its own
        dic_filtered[t_id] = {k: v for k, v in t_calc.items() if v > threshold}

    return dic_filtered
    

    
#both filtered dictionaries are written to disk by the saving cell, so both have to be built here
#(with the tf-idf line commented out, a fresh kernel run fails there with a NameError)

#tf-idf table: keep weights above 0.04; filtered inline because this cut-off differs from the 0.1 used for cosine similarity
tfidfThreshold = 0.04
tfidfTableDic_filtered = {
    docKey: {term: weight for term, weight in termWeights.items() if weight > tfidfThreshold}
    for docKey, termWeights in tfidfTableDic.items()
}

cosineTableDic_filtered = textAnalysis_filter(cosineTableDic) #use threshold: 0.1
        


labbe_tool_2006 ... processing
leroux_notion_2012 ... processing
kraker_case_2011 ... processing
kidd_new_2019 ... processing
gold_debates_2019 ... processing
bhattacharyya_deep_2020 ... processing
bolukbasi_man_2016 ... processing
bojanowski_enriching_2017 ... processing
baroni_new_2006 ... processing
bahdanau_neural_2016 ... processing
barranha_derivative_2018 ... processing
best_surface_nodate ... processing
bender_climbing_2020 ... processing
rebelo_optical_2012 ... processing
reitz_hitchhikers_nodate ... processing
ribeiro_why_2016 ... processing
ridge_crowdsourcing_nodate ... processing
pennington_glove_2014 ... processing
devlin_bert_2019 ... processing
de_santis_crossing_2019 ... processing
drucker_is_2013 ... processing
davis_universal_nodate ... processing
collar_networks_2015 ... processing
conneau_supervised_2017 ... processing
nadeau_survey_2007 ... processing
nadeau_survey_2007-1 ... processing
noack_modularity_2009 ... processing
svensson_three_2020 ... processing
shneid

In [76]:
#write both filtered dictionaries to the memex folder, JSON-encoded (the files keep their .txt extension)

filteredOutputs = (
    ('/tfidfTableDic_filtered.txt', tfidfTableDic_filtered),
    ('/cosineTableDic_filtered.txt', cosineTableDic_filtered),
)

for fileSuffix, filteredDic in filteredOutputs:
    with open(pathToMemex + fileSuffix, 'w', encoding='utf8') as outFile:
        json.dump(filteredDic, outFile)