In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt
import tensorflow as tf
import json, gensim, sklearn, pickle, sys, re, os
import IPython.display as ipd

In [2]:
from gensim.parsing.preprocessing import preprocess_documents
from gensim.parsing.preprocessing import preprocess_string
from gensim.models.phrases import Phrases
from gensim.parsing.preprocessing import strip_tags, strip_short, strip_multiple_whitespaces, stem_text
from gensim.parsing.preprocessing import remove_stopwords
from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile
from gensim import utils
from scholarly import scholarly
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from IPython.display import Audio
sound_file = './Music/invalid_keypress.mp3'

In [3]:
datapd=pd.read_json("arxiv-metadata-oai-snapshot.json",  lines=True)

In [4]:
wave = np.sin(2*np.pi*200*np.arange(5000*2)/10000) 
play=Audio(wave, rate=50000, autoplay=True)

## Model 1 -- TF-IDF ## 
### We will first focus on just one subfield -- hep-th ###

In [5]:
datapd_hep_th = datapd.loc[datapd['categories'].str.contains('hep-th')]
hep_abstracts = datapd_hep_th[ list(datapd_hep_th.iloc[:,0:1]) + ['abstract'] + ['title']]

In [6]:
datapd[0:5]

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,..."
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,"[[Streinu, Ileana, ], [Theran, Louis, ]]"
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,"[[Pan, Hongjun, ]]"
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,"[[Callan, David, ]]"
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,"[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]"


In [7]:
hep_abstracts[0:5]

Unnamed: 0,id,abstract,title
14,704.0015,The pure spinor formulation of the ten-dimen...,Fermionic superstring loop amplitudes in the p...
17,704.0018,We give a prescription for how to compute th...,In quest of a generalized Callias index theorem
51,704.0052,We study space-time symmetries in scalar qua...,Quantum Field Theory on Curved Backgrounds. II...
63,704.0064,We capture the off-shell as well as the on-s...,Nilpotent symmetry invariance in the superfiel...
65,704.0066,Possible (algebraic) commutation relations i...,Lagrangian quantum field theory in momentum pi...


In [9]:
hep_corpus=hep_abstracts['abstract'].values
#processed_hep=preprocess_documents(hep_corpus)
processed_hep=[gensim.utils.simple_preprocess(gensim.parsing.preprocessing.remove_stopwords(doc),min_len=2) for doc in hep_corpus] 

In [10]:
processed_hep[0:1]

[['the',
  'pure',
  'spinor',
  'formulation',
  'ten',
  'dimensional',
  'superstring',
  'leads',
  'manifestly',
  'supersymmetric',
  'loop',
  'amplitudes',
  'expressed',
  'integrals',
  'pure',
  'spinor',
  'superspace',
  'this',
  'paper',
  'explores',
  'different',
  'methods',
  'evaluate',
  'integrals',
  'uses',
  'calculate',
  'kinematic',
  'factors',
  'one',
  'loop',
  'two',
  'loop',
  'massless',
  'four',
  'point',
  'amplitudes',
  'involving',
  'ramond',
  'states']]

In [11]:
hep_dictionary = gensim.corpora.Dictionary(processed_hep)
hep_bow_corpus = [hep_dictionary.doc2bow(text) for text in processed_hep]

In [12]:
print("Total number of papers in high energy physics theory (hep-th) : ",len(hep_bow_corpus))
print("Size of our dictionary : ", len(hep_dictionary))

Total number of papers in high energy physics theory (hep-th) :  140500
Size of our dictionary :  53081


In [13]:
hep_tfidf = gensim.models.TfidfModel(hep_bow_corpus, smartirs='npu')

In [14]:
index = gensim.similarities.MatrixSimilarity(hep_tfidf[hep_bow_corpus])

In [15]:
def search(string,number):
    new_doc = gensim.parsing.preprocessing.preprocess_string(string)
    new_vec = hep_dictionary.doc2bow(new_doc)
    vec_bow_tfidf = hep_tfidf[new_vec]
    sims = index[vec_bow_tfidf]
    for (s,i) in zip(sorted(enumerate(sims), key=lambda item: -item[1])[:number],range(1,number+1)):
        print( f"{i}) {datapd_hep_th['title'].iloc[s[0]]} by {datapd_hep_th['authors'].iloc[s[0]]} ({str(s[1])}). Arxiv: {datapd_hep_th['id'].iloc[s[0]]} \n")

In [16]:
search("ads cft entanglment islands information paradox",15)

1) Islands in Asymptotically Flat 2D Gravity by Thomas Hartman, Edgar Shaghoulian, Andrew Strominger (0.31529188). Arxiv: 2004.13857 

2) Pulling Out the Island with Modular Flow by Yiming Chen (0.27871448). Arxiv: 1912.02210 

3) Is the island universe model consistent with observations? by Yun-Song Piao (0.25776836). Arxiv: astro-ph/0506072 

4) Island in the Presence of Higher Derivative Terms by Mohsen Alishahiha, Amin Faraji Astaneh and Ali Naseh (0.2540105). Arxiv: 2005.08715 

5) Pure de Sitter space and the island moving back in time by Watse Sybesma (0.24061042). Arxiv: 2008.07994 

6) What the information paradox is {\it not} by Samir D. Mathur (0.2291399). Arxiv: 1108.0302 

7) Galilean Islands in Eternally Inflating Background by Zhi-Guo Liu, Yun-Song Piao (0.21806723). Arxiv: 1301.6833 

8) Deep learning and k-means clustering in heterotic string vacua with line
  bundles by Hajime Otsuka, Kenta Takemoto (0.21647947). Arxiv: 2003.11880 

9) Why Black Hole Information Loss 

In [17]:
def recommend(string,number):
    pointer3=hep_abstracts.loc[(hep_abstracts['id'].str.match(string))]
    queryvalue3=pointer3['abstract'].values
    processed_query3=preprocess_documents(queryvalue3)
    test_bow_corpus3 = [hep_dictionary.doc2bow(text) for text in processed_query3]
    vec_test_bow3=hep_tfidf[test_bow_corpus3]
    parameters={}
    for i in range(len(processed_query3)):
        parameters["sims" + str(i)] = index[vec_test_bow3[i]]
    values=parameters.values()
    for (s,j) in zip(sorted(enumerate(np.sqrt(sum(values))), key=lambda item: -item[1])[:number+len(processed_query3)],range(1,number+1+len(processed_query3))):
        if j<=len(processed_query3):
            continue
        print( f" \n {j-len(processed_query3)}) {datapd_hep_th['title'].iloc[s[0]]} by {datapd_hep_th['authors'].iloc[s[0]]} ({str(s[1])}). Arxiv: {datapd_hep_th['id'].iloc[s[0]]}")

In [18]:
recommend("1005.4690|1107.2116",15)

 
 1) Effective Holographic Theories for low-temperature condensed matter
  systems by C. Charmousis (1,2), B. Gout\'eraux (1), B. S. Kim (3,4), E. Kiritsis
  (4) and Rene Meyer (4) ((1) LPT Orsay, Univ. Paris-Sud, (2) LMPT, Univ.
  Tours, (3) IESL-FORTH, Greece, (4) CCTP, Univ. of Crete) (0.44348332). Arxiv: 1005.4690
 
 2) Holography of electrically and magnetically charged black branes by Zhenhua Zhou, Jian-Pin Wu and Yi Ling (0.43717888). Arxiv: 1807.07704
 
 3) Solution generating in 5D Einstein-Maxwell-dilaton gravity and
  derivation of dipole black ring solutions by Stoytcho S. Yazadjiev (0.4356358). Arxiv: hep-th/0604140
 
 4) Gravitating BIon and BIon black hole with dilaton by Takashi Tamaki, Takashi Torii (0.43228203). Arxiv: gr-qc/0004071
 
 5) Charged Dilatonic Black Holes and their Transport Properties by Blaise Gouteraux, Bom Soo Kim and Rene Meyer (0.42729348). Arxiv: 1102.4440
 
 6) SU(3) Einstein-Yang-Mills-Dilaton Sphalerons and Black Holes by Burkhard Kleihaus, Jut

## Model 2 -- Doc2Vec ##
### Again, just focusing on hep-th ###

In [265]:
def testmodel(model,phrase,number):
    new_doc = gensim.parsing.preprocessing.preprocess_string(phrase)
    test_doc_vector = model.infer_vector(new_doc)
    sims = model.docvecs.most_similar(positive = [test_doc_vector],topn=number)
    for (s,i) in zip(sims,range(1,number+1)):
        print( f"{i}) {datapd_hep_th['title'].iloc[s[0]]} by {datapd_hep_th['authors'].iloc[s[0]]} ({str(s[1])}). Arxiv: {datapd_hep_th['id'].iloc[s[0]]} \n") 

In [285]:
def recommend_doc2vec(model,string,number):
    pointer3=hep_abstracts.loc[(hep_abstracts['id'].str.match(string))]
    queryvalue3=pointer3['abstract'].values
    processed_query3=preprocess_documents(queryvalue3)
    temper=[[] for y in range(len(processed_query3))]
    for i in range(len(processed_query3)):
        temper[i]=model.infer_vector(processed_query3[i])
    sims = model.docvecs.most_similar(positive = temper,topn=1+number+len(processed_query3))
    for (s,j) in zip(sims,range(1,number+1+len(processed_query3))):
        if j<=len(processed_query3):
            continue
        print( f"{j-len(processed_query3)}) {datapd_hep_th['title'].iloc[s[0]]} by {datapd_hep_th['authors'].iloc[s[0]]} ({str(s[1])}). Arxiv: {datapd_hep_th['id'].iloc[s[0]]} \n") 

In [287]:
tagged_hep_corpus = [TaggedDocument(d, [i]) for i, d in enumerate(processed_hep)]

In [302]:
model_big_bow300_5_10 = Doc2Vec(tagged_hep_corpus, dm=0, vector_size=300, window=5, min_count=1, epochs=10, hs=1)

In [303]:
recommend_doc2vec(model_big_bow300_5_10,"1212.4103|1212.3616",10)

1) Conformal Field Theories in Fractional Dimensions by S. El-Showk, M. Paulos, D. Poland, S. Rychkov, D. Simmons-Duffin, A.
  Vichi (0.45039984583854675). Arxiv: 1309.5089 

2) Analytic bootstrap at large spin by Apratim Kaviraj, Kallol Sen and Aninda Sinha (0.4402136206626892). Arxiv: 1502.01437 

3) Bootstrapping the O(N) Vector Models by Filip Kos, David Poland, David Simmons-Duffin (0.4221642017364502). Arxiv: 1307.6856 

4) Higher Spin ANEC and the Space of CFTs by David Meltzer (0.42161595821380615). Arxiv: 1811.01913 

5) Fast Conformal Bootstrap and Constraints on 3d Gravity by Nima Afkhami-Jeddi, Thomas Hartman, and Amirhossein Tajdini (0.4142104685306549). Arxiv: 1903.06272 

6) Correlation functions in scalar field theory at large charge by Guillermo Arias-Tamargo, Diego Rodriguez-Gomez, Jorge G. Russo (0.41208416223526). Arxiv: 1912.01623 

7) Conformal Field Theories and Deep Inelastic Scattering by Zohar Komargodski, Manuela Kulaxizi, Andrei Parnachev, and Alexander
  Zh

In [306]:
testmodel(model_big_bow300_5_10,"sparse conformal field theories holgraphy",10)

1) Conformal Field Theory Correlators From sine-Gordon Model on AdS
  Spacetime by Sze-Shiang Feng and Mu-Lin Yan (0.49992844462394714). Arxiv: hep-th/0008179 

2) An Introduction to Conformal Field Theory by Matthias R Gaberdiel (0.4996393322944641). Arxiv: hep-th/9910156 

3) Fermionisation of a Two-Dimensional Free Massless Complex Scalar Field by Laure Gouba (IMSP, CIPMA, Rep. Benin), Gabriel Y.H. Avossevou (IMSP,
  CIPMA, Rep. Benin), Jan Govaerts (UCL, Louvain-la-Neuve, Belgium), M. Norbert
  Hounkonnou (IMSP, CIPMA, Rep. Benin) (0.49109622836112976). Arxiv: hep-th/0408024 

4) Operator Product Expansions and Consistency Relations in a O(N)
  Invariant Fermionic CFT for 2<d<4 by Anastasios C. Petkou (0.4671935439109802). Arxiv: hep-th/9602054 

5) Logarithmic conformal field theories and AdS correspondence by A.M. Ghezelbash. M. Khorrami, A. Aghamohammadi (0.4661291241645813). Arxiv: hep-th/9807034 

6) On the BCFT Description of Holes in the c=1 Matrix Model by Davide Gaiotto, N

## Testing different models ##

### We found that CBOW works much better than skipgram for a given windowsize, vector dimensions and epochs. ###

In [293]:
model_big_bow200_2_10 = pickle.load(open("Modelsfeb3/model_big_bow200_2_10", 'rb'))
model_big_bow200_2_20 = pickle.load(open("Modelsfeb3/model_big_bow200_2_20", 'rb'))
model_big_bow200_2_30 = pickle.load(open("Modelsfeb3/model_big_bow200_2_30", 'rb'))
model_big_bow200_3_10 = pickle.load(open("Modelsfeb3/model_big_bow200_3_10", 'rb'))
model_big_bow200_3_20 = pickle.load(open("Modelsfeb3/model_big_bow200_3_20", 'rb'))
model_big_bow200_3_30 = pickle.load(open("Modelsfeb3/model_big_bow200_3_30", 'rb'))
model_big_bow200_4_10 = pickle.load(open("Modelsfeb3/model_big_bow200_4_10", 'rb'))
model_big_bow200_4_20 = pickle.load(open("Modelsfeb3/model_big_bow200_4_20", 'rb'))

In [294]:
model_big_bow300_2_10 = pickle.load(open("Modelsfeb3/model_big_bow300_3_10", 'rb'))
model_big_bow300_2_20 = pickle.load(open("Modelsfeb3/model_big_bow300_3_20", 'rb'))
model_big_bow300_2_30 = pickle.load(open("Modelsfeb3/model_big_bow300_3_30", 'rb'))
model_big_bow300_3_10 = pickle.load(open("Modelsfeb3/model_big_bow300_3_10", 'rb'))
model_big_bow300_3_20 = pickle.load(open("Modelsfeb3/model_big_bow300_3_20", 'rb'))
model_big_bow300_3_30 = pickle.load(open("Modelsfeb3/model_big_bow300_3_30", 'rb'))
model_big_bow300_4_10 = pickle.load(open("Modelsfeb3/model_big_bow300_4_10", 'rb'))
model_big_bow300_4_20 = pickle.load(open("Modelsfeb3/model_big_bow300_4_20", 'rb'))
model_big_bow300_5_20 = pickle.load(open("Modelsfeb3/model_big_bow300_5_20", 'rb'))
model_big_bow300_5_30 = pickle.load(open("Modelsfeb3/model_big_bow300_5_30", 'rb'))

In [298]:
model_big_skipgram_200_2_20 = pickle.load(open("Modelsfeb3/model_big_skipgram200_2_20", 'rb'))
model_big_skipgram_200_2_30 = pickle.load(open("Modelsfeb3/model_big_skipgram200_2_30", 'rb'))
model_big_skipgram_200_2_30 = pickle.load(open("Modelsfeb3/model_big_skipgram200_2_30", 'rb'))
model_big_skipgram_200_4_10 = pickle.load(open("Modelsfeb3/model_big_skipgram200_4_10", 'rb'))
model_big_skipgram_200_4_20 = pickle.load(open("Modelsfeb3/model_big_skipgram200_4_20", 'rb'))

In [301]:
model_big_skipgram_300_2_20 = pickle.load(open("Modelsfeb3/model_big_skipgram300_2_20", 'rb'))
model_big_skipgram_300_2_30 = pickle.load(open("Modelsfeb3/model_big_skipgram300_2_30", 'rb'))
model_big_skipgram_300_3_20 = pickle.load(open("Modelsfeb3/model_big_skipgram300_3_20", 'rb'))
model_big_skipgram_300_3_30 = pickle.load(open("Modelsfeb3/model_big_skipgram300_3_30", 'rb'))
model_big_skipgram_300_4_10 = pickle.load(open("Modelsfeb3/model_big_skipgram300_4_10", 'rb'))
model_big_skipgram_300_4_20 = pickle.load(open("Modelsfeb3/model_big_skipgram300_4_20", 'rb'))

# Our (not definitive, still testing) conclusion -- CBOW with vector dimensions = 300 and window=5, works best #