In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd

In [3]:
df = pd.read_csv(r'arxiv_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51774 entries, 0 to 51773
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   titles     51774 non-null  object
 1   summaries  51774 non-null  object
 2   terms      51774 non-null  object
dtypes: object(3)
memory usage: 1.2+ MB


In [4]:
df.head()

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


In [5]:
df.drop(['terms'],inplace =True, axis=1)
df.sample(3)

Unnamed: 0,titles,summaries
20115,Strong Black-box Adversarial Attacks on Unsupe...,Machine Learning (ML) and Deep Learning (DL) m...
10658,Do Vision Transformers See Like Convolutional ...,Convolutional neural networks (CNNs) have so f...
32920,Causal variables from reinforcement learning u...,Many open problems in machine learning are int...


In [6]:
df.isnull().sum()

titles       0
summaries    0
dtype: int64

In [7]:
df.duplicated().sum()

12789

In [8]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38985 entries, 0 to 51772
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   titles     38985 non-null  object
 1   summaries  38985 non-null  object
dtypes: object(2)
memory usage: 913.7+ KB


In [12]:
# preprocessing
import re
import nltk
nltk.download('omw-1.4')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    
    text=str(text)
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # Remove stop words
    #tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize the tokens
    #tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return " ".join(tokens)

def preprocess_dataframe(df, column_name):
    df[column_name] = df[column_name].apply(preprocess_text)
    return df

proc_df = preprocess_dataframe(df, 'titles')
proc_df = preprocess_dataframe(df, 'summaries')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
proc_df

Unnamed: 0,titles,summaries
0,survey on semantic stereo matching semantic de...,stereo matching is one of the widely used tech...
1,future ai guiding principles and consensus rec...,the recent advancements in artificial intellig...
2,enforcing mutual consistency of hard regions f...,in this paper we proposed a novel mutual consi...
3,parameter decoupling strategy for semi supervi...,consistency training has proven to be an advan...
4,background foreground segmentation for interio...,to ensure safety in automated driving the corr...
...,...,...
51767,a ray based approach for boundary estimation o...,diffusion tensor imaging dti is a non invasive...
51768,statistical denoising for single molecule fluo...,single molecule fluorescence microscopy is a p...
51770,blinking molecule tracking,we discuss a method for tracking individual mo...
51771,towards a mathematical foundation of immunolog...,we attempt to set a mathematical foundation of...


In [53]:
#!pip install sentence-transformers

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: bert-base-nli-mean-tokens
2023-02-02 08:39:31.239 Load pretrained SentenceTransformer: bert-base-nli-mean-tokens
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu
2023-02-02 08:39:32.825 Use pytorch device: cpu


In [18]:
sample_proc_df = proc_df.iloc[:10000, :]
texts = sample_proc_df[['titles', 'summaries']].apply(lambda x: " ".join(x), axis=1).tolist()

embeddings = model.encode(texts)

df_embeddings = pd.DataFrame(embeddings, index=sample_proc_df.index)

df_final = pd.concat([sample_proc_df, df_embeddings], axis=1)
df_final

Unnamed: 0,titles,summaries,0,1,2,3,4,5,6,7,...,758,759,760,761,762,763,764,765,766,767
0,survey on semantic stereo matching semantic de...,stereo matching is one of the widely used tech...,-0.401362,0.274166,0.737166,0.061062,-0.130100,-1.155461,-0.219747,-0.160350,...,-0.615661,-0.332307,-1.073472,-1.877089,-0.430962,-0.637424,-0.012629,-0.386892,-0.890290,0.287929
1,future ai guiding principles and consensus rec...,the recent advancements in artificial intellig...,0.065373,0.927454,1.059528,-0.150318,0.155360,-0.375887,0.679360,-0.161551,...,-0.529815,0.007603,-0.565516,-2.106932,-0.253091,-0.667359,-0.487242,-0.192181,-0.572494,0.304154
2,enforcing mutual consistency of hard regions f...,in this paper we proposed a novel mutual consi...,-0.158494,0.666247,0.715463,0.264252,0.363120,-0.505795,0.369803,-0.755779,...,-0.329765,-0.215263,-0.649595,-1.988840,-0.194235,-0.150264,-0.545643,-0.202913,-0.090725,0.584428
3,parameter decoupling strategy for semi supervi...,consistency training has proven to be an advan...,-0.281313,0.271175,1.395214,0.238585,0.282723,-0.552527,0.457093,-0.906195,...,-0.418928,-0.583181,-0.871498,-1.915339,-0.103052,-0.562866,-0.811119,-0.229740,-0.414263,0.491176
4,background foreground segmentation for interio...,to ensure safety in automated driving the corr...,-0.615799,0.812330,0.941279,0.412257,0.172324,-0.480968,0.325665,-0.521881,...,-0.483864,-0.719375,-0.939510,-1.444788,-0.402659,-1.114555,0.137149,-1.258552,-1.627449,0.159872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51767,a ray based approach for boundary estimation o...,diffusion tensor imaging dti is a non invasive...,,,,,,,,,...,,,,,,,,,,
51768,statistical denoising for single molecule fluo...,single molecule fluorescence microscopy is a p...,,,,,,,,,...,,,,,,,,,,
51770,blinking molecule tracking,we discuss a method for tracking individual mo...,,,,,,,,,...,,,,,,,,,,
51771,towards a mathematical foundation of immunolog...,we attempt to set a mathematical foundation of...,,,,,,,,,...,,,,,,,,,,


In [19]:
df_final = pd.concat([sample_proc_df, df_embeddings], axis=1)
df_final

Unnamed: 0,titles,summaries,0,1,2,3,4,5,6,7,...,758,759,760,761,762,763,764,765,766,767
0,survey on semantic stereo matching semantic de...,stereo matching is one of the widely used tech...,-0.401362,0.274166,0.737166,0.061062,-0.130100,-1.155461,-0.219747,-0.160350,...,-0.615661,-0.332307,-1.073472,-1.877089,-0.430962,-0.637424,-0.012629,-0.386892,-0.890290,0.287929
1,future ai guiding principles and consensus rec...,the recent advancements in artificial intellig...,0.065373,0.927454,1.059528,-0.150318,0.155360,-0.375887,0.679360,-0.161551,...,-0.529815,0.007603,-0.565516,-2.106932,-0.253091,-0.667359,-0.487242,-0.192181,-0.572494,0.304154
2,enforcing mutual consistency of hard regions f...,in this paper we proposed a novel mutual consi...,-0.158494,0.666247,0.715463,0.264252,0.363120,-0.505795,0.369803,-0.755779,...,-0.329765,-0.215263,-0.649595,-1.988840,-0.194235,-0.150264,-0.545643,-0.202913,-0.090725,0.584428
3,parameter decoupling strategy for semi supervi...,consistency training has proven to be an advan...,-0.281313,0.271175,1.395214,0.238585,0.282723,-0.552527,0.457093,-0.906195,...,-0.418928,-0.583181,-0.871498,-1.915339,-0.103052,-0.562866,-0.811119,-0.229740,-0.414263,0.491176
4,background foreground segmentation for interio...,to ensure safety in automated driving the corr...,-0.615799,0.812330,0.941279,0.412257,0.172324,-0.480968,0.325665,-0.521881,...,-0.483864,-0.719375,-0.939510,-1.444788,-0.402659,-1.114555,0.137149,-1.258552,-1.627449,0.159872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10420,on pursuit of designing multi modal transforme...,video grounding aims to localize the temporal ...,-0.018061,0.373618,0.914051,0.656141,0.207891,-0.539779,0.295630,-0.092069,...,-0.165338,-0.407510,-1.032758,-0.713749,-0.457005,-0.201730,-0.150337,-0.132504,-0.541550,0.851033
10421,understanding and resolving performance degrad...,a graph convolutional network gcn stacks sever...,-0.755894,0.498649,0.494526,0.393653,-0.293687,-0.187803,1.121835,-0.239535,...,-0.320823,0.100917,-1.588022,-1.633420,-0.101100,-0.411805,-0.395718,-0.375819,-0.578289,0.336016
10422,lipschitz normalization for self attention lay...,attention based neural networks are state of t...,-0.737709,0.708255,0.862576,0.329984,-0.261250,-0.845091,0.849795,-0.035448,...,-0.416458,-0.065611,-1.430246,-2.139826,-0.367281,-0.475605,-0.512294,-0.367532,-0.737827,0.386805
10424,systematic generalisation through task tempora...,this work introduces a neuro symbolic agent th...,-0.371549,0.714129,0.746740,0.330015,0.323127,-0.693621,1.271481,-0.167864,...,-0.023782,0.312450,-0.929245,-1.586567,-0.399263,0.049273,0.009188,-0.259865,-1.371033,0.528589


In [94]:
df_last_columns = df_final.iloc[:, -767:]
df_last_columns['paper_embeddings'] = df_last_columns.apply(lambda x: ','.join(x.astype(str)), axis=1)
df_result = df_final.iloc[:, :3].join(df_last_columns[['paper_embeddings']])
df_result



Unnamed: 0,titles,summaries,0,paper_embeddings
0,survey on semantic stereo matching semantic de...,stereo matching is one of the widely used tech...,-0.401362,"0.27416563,0.7371659,0.061061595,-0.13009974,-..."
1,future ai guiding principles and consensus rec...,the recent advancements in artificial intellig...,0.065373,"0.92745405,1.0595278,-0.15031834,0.15536031,-0..."
2,enforcing mutual consistency of hard regions f...,in this paper we proposed a novel mutual consi...,-0.158494,"0.6662466,0.71546316,0.2642523,0.36312008,-0.5..."
3,parameter decoupling strategy for semi supervi...,consistency training has proven to be an advan...,-0.281313,"0.2711747,1.3952142,0.23858501,0.28272292,-0.5..."
4,background foreground segmentation for interio...,to ensure safety in automated driving the corr...,-0.615799,"0.8123297,0.94127905,0.41225722,0.17232448,-0...."
...,...,...,...,...
10420,on pursuit of designing multi modal transforme...,video grounding aims to localize the temporal ...,-0.018061,"0.3736176,0.9140511,0.65614057,0.2078915,-0.53..."
10421,understanding and resolving performance degrad...,a graph convolutional network gcn stacks sever...,-0.755894,"0.49864912,0.494526,0.39365342,-0.2936868,-0.1..."
10422,lipschitz normalization for self attention lay...,attention based neural networks are state of t...,-0.737709,"0.70825505,0.8625759,0.32998365,-0.26125005,-0..."
10424,systematic generalisation through task tempora...,this work introduces a neuro symbolic agent th...,-0.371549,"0.7141286,0.74674,0.33001506,0.32312724,-0.693..."


In [91]:
df_result

Unnamed: 0,titles,summaries,0,paper_embeddings
0,survey on semantic stereo matching semantic de...,stereo matching is one of the widely used tech...,-0.401362,survey on semantic stereo matching semantic de...
1,future ai guiding principles and consensus rec...,the recent advancements in artificial intellig...,0.065373,future ai guiding principles and consensus rec...
2,enforcing mutual consistency of hard regions f...,in this paper we proposed a novel mutual consi...,-0.158494,enforcing mutual consistency of hard regions f...
3,parameter decoupling strategy for semi supervi...,consistency training has proven to be an advan...,-0.281313,parameter decoupling strategy for semi supervi...
4,background foreground segmentation for interio...,to ensure safety in automated driving the corr...,-0.615799,background foreground segmentation for interio...
...,...,...,...,...
10420,on pursuit of designing multi modal transforme...,video grounding aims to localize the temporal ...,-0.018061,on pursuit of designing multi modal transforme...
10421,understanding and resolving performance degrad...,a graph convolutional network gcn stacks sever...,-0.755894,understanding and resolving performance degrad...
10422,lipschitz normalization for self attention lay...,attention based neural networks are state of t...,-0.737709,lipschitz normalization for self attention lay...
10424,systematic generalisation through task tempora...,this work introduces a neuro symbolic agent th...,-0.371549,systematic generalisation through task tempora...


In [95]:

df_final=df_result


In [96]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 10425
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   titles            10000 non-null  object 
 1   summaries         10000 non-null  object 
 2   0                 10000 non-null  float32
 3   paper_embeddings  10000 non-null  object 
dtypes: float32(1), object(3)
memory usage: 609.6+ KB


In [97]:
from google.colab import files
df_final.to_csv("df_final.csv")
files.download("df_final.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files
df_final.to_csv("df_final.csv")
files.download("df_final.csv")

In [110]:
df = pd.read_csv("df_final_small.csv")
df.drop(['0'],inplace=True, axis=1)
df

Unnamed: 0.1,Unnamed: 0,titles,summaries,paper_embeddings
0,0,survey on semantic stereo matching semantic de...,stereo matching is one of the widely used tech...,"0.27416563,0.7371659,0.061061595,-0.13009974,-..."
1,1,future ai guiding principles and consensus rec...,the recent advancements in artificial intellig...,"0.92745405,1.0595278,-0.15031834,0.15536031,-0..."
2,2,enforcing mutual consistency of hard regions f...,in this paper we proposed a novel mutual consi...,"0.6662466,0.71546316,0.2642523,0.36312008,-0.5..."
3,3,parameter decoupling strategy for semi supervi...,consistency training has proven to be an advan...,"0.2711747,1.3952142,0.23858501,0.28272292,-0.5..."
4,4,background foreground segmentation for interio...,to ensure safety in automated driving the corr...,"0.8123297,0.94127905,0.41225722,0.17232448,-0...."
...,...,...,...,...
9995,10420,on pursuit of designing multi modal transforme...,video grounding aims to localize the temporal ...,"0.3736176,0.9140511,0.65614057,0.2078915,-0.53..."
9996,10421,understanding and resolving performance degrad...,a graph convolutional network gcn stacks sever...,"0.49864912,0.494526,0.39365342,-0.2936868,-0.1..."
9997,10422,lipschitz normalization for self attention lay...,attention based neural networks are state of t...,"0.70825505,0.8625759,0.32998365,-0.26125005,-0..."
9998,10424,systematic generalisation through task tempora...,this work introduces a neuro symbolic agent th...,"0.7141286,0.74674,0.33001506,0.32312724,-0.693..."


In [112]:
%%writefile app.py
import streamlit as st
import ast
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Load the dataset of ML papers
df = pd.read_csv("df_final_small.csv")
df.drop(['0'],inplace=True, axis=1)

st.write("# Machine Learning Paper Recommendation System")
query = st.text_input("Enter the paper title and summary:")
model = SentenceTransformer('bert-base-nli-mean-tokens')
if query:
    # Encode the queries
    query_embedding = model.encode([query])[0] # Using pre-trained NLP model

    # Calculate the cosine similarity
    paper_embeddings = [ast.literal_eval(e) for e in df["paper_embeddings"]]
    paper_embeddings = np.array(paper_embeddings)
    scores = cosine_similarity(paper_embeddings, query_embedding.reshape(1, -1))

    # Select the top results
    top_k = 5
    top_results = df.iloc[np.argsort(scores.flatten())[-top_k:][::-1]]

    # Present the recommendations
    st.write("Top {} recommended papers:".format(top_k))
    st.write(top_results[["title", "summary"]])
else:
    st.write("Enter a paper title and summary to get recommendations.")


Overwriting app.py


In [31]:
!pip install streamlit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [41]:
 ! pip install pyngrok

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [113]:
from pyngrok import ngrok

ngrok.set_auth_token("2LArVCZnMmuPMV0uYjEx4zAOSpx_6AiMAuvnRpVMQxFzSkvaE") 


!nohup streamlit run app.py --server.port 80 &
url = ngrok.connect(port = '80')
print(url)

INFO:pyngrok.process:Updating authtoken for default "config_path" of "ngrok_path": /usr/local/lib/python3.8/dist-packages/pyngrok/bin/ngrok
2023-02-02 09:57:29.021 Updating authtoken for default "config_path" of "ngrok_path": /usr/local/lib/python3.8/dist-packages/pyngrok/bin/ngrok


nohup: appending output to 'nohup.out'


INFO:pyngrok.ngrok:Opening tunnel named: http-80-785684b0-a4cb-4f5c-8921-cfa980416e40
2023-02-02 09:57:29.289 Opening tunnel named: http-80-785684b0-a4cb-4f5c-8921-cfa980416e40
INFO:pyngrok.process.ngrok:t=2023-02-02T09:57:29+0000 lvl=info msg="no configuration paths supplied"
2023-02-02 09:57:29.431 t=2023-02-02T09:57:29+0000 lvl=info msg="no configuration paths supplied"
INFO:pyngrok.process.ngrok:t=2023-02-02T09:57:29+0000 lvl=info msg="using configuration at default config path" path=/root/.ngrok2/ngrok.yml
2023-02-02 09:57:29.444 t=2023-02-02T09:57:29+0000 lvl=info msg="using configuration at default config path" path=/root/.ngrok2/ngrok.yml
INFO:pyngrok.process.ngrok:t=2023-02-02T09:57:29+0000 lvl=info msg="open config file" path=/root/.ngrok2/ngrok.yml err=nil
2023-02-02 09:57:29.460 t=2023-02-02T09:57:29+0000 lvl=info msg="open config file" path=/root/.ngrok2/ngrok.yml err=nil
INFO:pyngrok.process.ngrok:t=2023-02-02T09:57:29+0000 lvl=info msg="starting web service" obj=web addr

NgrokTunnel: "http://7541-34-134-231-216.ngrok.io" -> "http://localhost:80"


2023-02-02 09:57:29.767 t=2023-02-02T09:57:29+0000 lvl=info msg="started tunnel" obj=tunnels name="http-80-785684b0-a4cb-4f5c-8921-cfa980416e40 (http)" addr=http://localhost:80 url=http://7541-34-134-231-216.ngrok.io
INFO:pyngrok.process.ngrok:t=2023-02-02T09:57:29+0000 lvl=info msg="started tunnel" obj=tunnels name=http-80-785684b0-a4cb-4f5c-8921-cfa980416e40 addr=http://localhost:80 url=https://7541-34-134-231-216.ngrok.io
2023-02-02 09:57:29.780 t=2023-02-02T09:57:29+0000 lvl=info msg="started tunnel" obj=tunnels name=http-80-785684b0-a4cb-4f5c-8921-cfa980416e40 addr=http://localhost:80 url=https://7541-34-134-231-216.ngrok.io
INFO:pyngrok.process.ngrok:t=2023-02-02T09:57:29+0000 lvl=info msg=end pg=/api/tunnels id=b098012bcc5fd721 status=201 dur=120.824339ms
2023-02-02 09:57:29.791 t=2023-02-02T09:57:29+0000 lvl=info msg=end pg=/api/tunnels id=b098012bcc5fd721 status=201 dur=120.824339ms
INFO:pyngrok.process.ngrok:t=2023-02-02T09:57:29+0000 lvl=info msg=start pg="/api/tunnels/http-

In [37]:
import streamlit as st

st.title("Machine Learning Paper Recommendation System")

query = st.text_input("Enter the paper title and summary:")

if query:
    query_embedding = model.encode([query])[0]
    scores = df_final.iloc[:, 2:].apply(lambda x: cosine_similarity(x, query_embedding), axis=1)
    top_papers = df_final.loc[scores.sort_values(ascending=False).index[:5]]
    st.write("Top 5 recommended papers:")
    st.write(top_papers[["title", "summary"]])
else:
    st.write("Enter a paper title and summary to get recommendations.")



In [111]:
ngrok.kill()

INFO:pyngrok.process:Killing ngrok process: 78769
2023-02-02 09:57:17.028 Killing ngrok process: 78769
