In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd

In [2]:
df = pd.read_csv(r'arxiv_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51774 entries, 0 to 51773
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   titles     51774 non-null  object
 1   summaries  51774 non-null  object
 2   terms      51774 non-null  object
dtypes: object(3)
memory usage: 1.2+ MB


In [3]:
df.head()

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


In [4]:
df.drop(['terms'],inplace =True, axis=1)
df.sample(3)

Unnamed: 0,titles,summaries
36151,Dual Control for Approximate Bayesian Reinforc...,"Control of non-episodic, finite-horizon dynami..."
3743,Unsupervised Learning of Long-Term Motion Dyna...,We present an unsupervised representation lear...
16085,Self-Paced Video Data Augmentation with Dynami...,There is an urgent need for an effective video...


In [None]:
df.isnull().sum()

titles       0
summaries    0
dtype: int64

In [5]:
df.duplicated().sum()

12789

In [6]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38985 entries, 0 to 51772
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   titles     38985 non-null  object
 1   summaries  38985 non-null  object
dtypes: object(2)
memory usage: 913.7+ KB


In [8]:
df= df.iloc[:1000, :]
# preprocessing
import re
import nltk
nltk.download('omw-1.4')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    
    text=str(text)
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # Remove stop words
    #tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize the tokens
    #tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return " ".join(tokens)

def preprocess_dataframe(df, column_name):
    df[column_name] = df[column_name].apply(preprocess_text)
    return df

proc_df = preprocess_dataframe(df, 'titles')
proc_df = preprocess_dataframe(df, 'summaries')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:

import spacy

# Load the English model
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

# Apply the lemmatization function to each row of the dataframe
df["titles"] = df["titles"].apply(lemmatize_text)
df["summaries"] = df["summaries"].apply(lemmatize_text)




In [9]:
proc_df

Unnamed: 0,titles,summaries
0,survey on semantic stereo matching semantic de...,stereo matching is one of the widely used tech...
1,future ai guiding principles and consensus rec...,the recent advancements in artificial intellig...
2,enforcing mutual consistency of hard regions f...,in this paper we proposed a novel mutual consi...
3,parameter decoupling strategy for semi supervi...,consistency training has proven to be an advan...
4,background foreground segmentation for interio...,to ensure safety in automated driving the corr...
...,...,...
995,deepigeos a deep interactive geodesic framewor...,accurate medical image segmentation is essenti...
996,d densely convolutional networks for volumetri...,in the isointense stage the accurate volumetri...
997,ui net interactive artificial neural networks ...,for complex segmentation tasks fully automatic...
998,one shot learning for semantic segmentation,low shot learning methods for image classifica...


In [10]:
!pip install transformers --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m74.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [11]:
import torch
import transformers

# Load the BERT model from the transformers library
model = transformers.BertModel.from_pretrained('bert-base-uncased')
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

# Convert the 'titles' and 'summaries' into numerical representations (embeddings)
def text_to_embeddings(text):
    input_ids = tokenizer.encode(text, add_special_tokens=True)
    input_ids = torch.tensor(input_ids).unsqueeze(0)  # Batch size 1
    with torch.no_grad():
        embeddings = model(input_ids)
    return embeddings[0][0].numpy()  # Take the first element of the first batch

proc_df['title_embeddings'] = proc_df['titles'].apply(text_to_embeddings)
proc_df['summary_embeddings'] = proc_df['summaries'].apply(text_to_embeddings)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [13]:
proc_df

Unnamed: 0,titles,summaries,title_embeddings,summary_embeddings
0,survey on semantic stereo matching semantic de...,stereo matching is one of the widely used tech...,"[[-0.46689484, -0.2965702, -0.6477647, -0.1736...","[[-0.43845478, -0.23303717, -0.22184424, -0.04..."
1,future ai guiding principles and consensus rec...,the recent advancements in artificial intellig...,"[[-0.40950543, -0.1522009, -0.25473914, 0.0119...","[[-0.5314807, 0.12459163, -0.08610397, -0.2717..."
2,enforcing mutual consistency of hard regions f...,in this paper we proposed a novel mutual consi...,"[[-0.61176145, -0.33108956, -0.29513532, -0.17...","[[-0.64391226, -0.27200112, -0.028475389, -0.1..."
3,parameter decoupling strategy for semi supervi...,consistency training has proven to be an advan...,"[[-0.58726233, -0.42740965, -0.11960147, -0.30...","[[-0.41404715, -0.15180549, 0.09761048, -0.142..."
4,background foreground segmentation for interio...,to ensure safety in automated driving the corr...,"[[-0.34370944, -0.2880199, -0.18248755, 0.0075...","[[-0.53730506, 0.031368207, 0.08903427, 0.0483..."
...,...,...,...,...
995,deepigeos a deep interactive geodesic framewor...,accurate medical image segmentation is essenti...,"[[-0.61203927, -0.51077944, 0.048020102, -0.18...","[[-0.7253999, -0.24362628, 0.029179268, -0.213..."
996,d densely convolutional networks for volumetri...,in the isointense stage the accurate volumetri...,"[[-0.7877132, -0.37293816, -0.18111935, -0.012...","[[-0.5892139, -0.21000516, 0.4636682, -0.15474..."
997,ui net interactive artificial neural networks ...,for complex segmentation tasks fully automatic...,"[[-0.3349297, -0.35803187, -0.071189895, 0.107...","[[-0.27074197, -0.041412942, 0.1364401, 0.0675..."
998,one shot learning for semantic segmentation,low shot learning methods for image classifica...,"[[-0.28677863, -0.45384553, -0.4078341, 0.0182...","[[-0.6340567, -0.27668187, 0.21570231, -0.1050..."


In [14]:
from google.colab import files
proc_df.to_csv("MLpaperembed.csv")
files.download("MLpaperembed.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [31]:
proc_df=pd.read_csv('MLpaperembed.csv',index_col=[0])
proc_df.sample(5)

Unnamed: 0,titles,summaries,title_embeddings,summary_embeddings
634,when unseen domain generalization is unnecessa...,recent advances in deep learning for medical i...,[[-0.20077214 0.086312 -0.14481415 ... -0.4...,[[-0.69375634 -0.06468189 0.16749965 ... -0.2...
830,skin lesion segmentation using atrous convolut...,as melanoma diagnoses increase across the us a...,[[-0.42483255 -0.11169448 -0.42448086 ... -0.1...,[[-3.45247895e-01 -7.52259269e-02 1.81502670e...
410,cardiac segmentation with strong anatomical gu...,convolutional neural networks cnn have had unp...,[[-0.94357044 -0.02739831 -0.21742782 ... -0.4...,[[-0.89360714 -0.45374972 -0.03771863 ... -0.4...
789,effective cloud detection and segmentation usi...,being able to effectively identify clouds and ...,[[-0.7245987 -0.27738103 0.27569953 ... -0.8...,[[-1.0434102 -0.2716397 0.538931 ... -0.3...
602,et net a generic edge attention guidance netwo...,segmentation is a fundamental task in medical ...,[[-0.41052943 -0.3205713 -0.21511295 ... -0.3...,[[-0.18318044 -0.10382743 0.16185316 ... -0.1...


In [33]:
proc_df.drop(['title_embeddings','summaries','titles'], inplace=True, axis=1)
proc_df

Unnamed: 0,summary_embeddings
0,[[-0.43845478 -0.23303717 -0.22184424 ... -0.1...
1,[[-5.31480730e-01 1.24591634e-01 -8.61039683e...
2,[[-0.64391226 -0.27200112 -0.02847539 ... -0.2...
3,[[-0.41404715 -0.15180549 0.09761048 ... -0.4...
4,[[-0.53730506 0.03136821 0.08903427 ... -0.7...
...,...
995,[[-0.7253999 -0.24362628 0.02917927 ... -0.7...
996,[[-0.5892139 -0.21000516 0.4636682 ... -0.2...
997,[[-0.27074197 -0.04141294 0.1364401 ... -0.6...
998,[[-0.6340567 -0.27668187 0.21570231 ... -0.5...


In [55]:
import numpy as np
from scipy.spatial.distance import cosine

model = transformers.BertModel.from_pretrained('bert-base-uncased')
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

def text_to_embeddings(text,model,tokenizer):
    input_ids = tokenizer.encode(text, add_special_tokens=True)
    input_ids = torch.tensor(input_ids).unsqueeze(0)  # Batch size 1
    with torch.no_grad(): 
     embeddings = model(input_ids)
    return embeddings[0].numpy().flatten()

def search_engine(query, embeddings_df, model,tokenizer,top_k=1):
    query_embedding = text_to_embeddings(query,model,tokenizer)
    max_len = max([len(embedding) for embedding in embeddings_df['summary_embeddings']])
    distances = []
    for embedding in embeddings_df['summary_embeddings']:
        if len(embedding) < max_len:
            embedding = np.pad(embedding, (0, max_len - len(embedding)), 'constant', constant_values=0)
        distances.append(cosine(query_embedding, embedding))
    nearest_indices = np.argsort(distances)[:top_k]
    return embeddings_df.iloc[nearest_indices] 

proc_df = pd.read_csv("MLpaperembed.csv")
proc_df.drop(['title_embeddings','summary_embeddings','titles'], inplace=True, axis=1)

proc_df['summary_embeddings'] = proc_df['summaries'].apply(lambda x: text_to_embeddings(x, model, tokenizer))

query = "clustering in ML"
result = search_engine(query, proc_df, model, tokenizer)
print(result)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ValueError: ignored

In [None]:
!pip install streamlit
! pip install pyngrok

In [None]:
%%writefile app.py
import streamlit as st
import ast
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Load the dataset of ML papers
df = pd.read_csv("df_final_small.csv")
df.drop(['0'],inplace=True, axis=1)

st.write("# Machine Learning Paper Recommendation System")
query = st.text_input("Enter the paper title and summary:")
model = SentenceTransformer('bert-base-nli-mean-tokens')
if query:
    # Encode the queries
    query_embedding = model.encode([query])[0] # Using pre-trained NLP model

    # Calculate the cosine similarity
    paper_embeddings = [ast.literal_eval(e) for e in df["paper_embeddings"]]
    paper_embeddings = np.array(paper_embeddings)
    scores = cosine_similarity(paper_embeddings, query_embedding.reshape(1, -1))

    # Select the top results
    top_k = 5
    top_results = df.iloc[np.argsort(scores.flatten())[-top_k:][::-1]]

    # Present the recommendations
    st.write("Top {} recommended papers:".format(top_k))
    st.write(top_results[["title", "summary"]])
else:
    st.write("Enter a paper title and summary to get recommendations.")


Overwriting app.py


In [None]:
from pyngrok import ngrok

ngrok.set_auth_token("2LArVCZnMmuPMV0uYjEx4zAOSpx_6AiMAuvnRpVMQxFzSkvaE") 


!nohup streamlit run app.py --server.port 80 &
url = ngrok.connect(port = '80')
print(url)

INFO:pyngrok.process:Updating authtoken for default "config_path" of "ngrok_path": /usr/local/lib/python3.8/dist-packages/pyngrok/bin/ngrok
2023-02-02 09:57:29.021 Updating authtoken for default "config_path" of "ngrok_path": /usr/local/lib/python3.8/dist-packages/pyngrok/bin/ngrok


nohup: appending output to 'nohup.out'


INFO:pyngrok.ngrok:Opening tunnel named: http-80-785684b0-a4cb-4f5c-8921-cfa980416e40
2023-02-02 09:57:29.289 Opening tunnel named: http-80-785684b0-a4cb-4f5c-8921-cfa980416e40
INFO:pyngrok.process.ngrok:t=2023-02-02T09:57:29+0000 lvl=info msg="no configuration paths supplied"
2023-02-02 09:57:29.431 t=2023-02-02T09:57:29+0000 lvl=info msg="no configuration paths supplied"
INFO:pyngrok.process.ngrok:t=2023-02-02T09:57:29+0000 lvl=info msg="using configuration at default config path" path=/root/.ngrok2/ngrok.yml
2023-02-02 09:57:29.444 t=2023-02-02T09:57:29+0000 lvl=info msg="using configuration at default config path" path=/root/.ngrok2/ngrok.yml
INFO:pyngrok.process.ngrok:t=2023-02-02T09:57:29+0000 lvl=info msg="open config file" path=/root/.ngrok2/ngrok.yml err=nil
2023-02-02 09:57:29.460 t=2023-02-02T09:57:29+0000 lvl=info msg="open config file" path=/root/.ngrok2/ngrok.yml err=nil
INFO:pyngrok.process.ngrok:t=2023-02-02T09:57:29+0000 lvl=info msg="starting web service" obj=web addr

NgrokTunnel: "http://7541-34-134-231-216.ngrok.io" -> "http://localhost:80"


2023-02-02 09:57:29.767 t=2023-02-02T09:57:29+0000 lvl=info msg="started tunnel" obj=tunnels name="http-80-785684b0-a4cb-4f5c-8921-cfa980416e40 (http)" addr=http://localhost:80 url=http://7541-34-134-231-216.ngrok.io
INFO:pyngrok.process.ngrok:t=2023-02-02T09:57:29+0000 lvl=info msg="started tunnel" obj=tunnels name=http-80-785684b0-a4cb-4f5c-8921-cfa980416e40 addr=http://localhost:80 url=https://7541-34-134-231-216.ngrok.io
2023-02-02 09:57:29.780 t=2023-02-02T09:57:29+0000 lvl=info msg="started tunnel" obj=tunnels name=http-80-785684b0-a4cb-4f5c-8921-cfa980416e40 addr=http://localhost:80 url=https://7541-34-134-231-216.ngrok.io
INFO:pyngrok.process.ngrok:t=2023-02-02T09:57:29+0000 lvl=info msg=end pg=/api/tunnels id=b098012bcc5fd721 status=201 dur=120.824339ms
2023-02-02 09:57:29.791 t=2023-02-02T09:57:29+0000 lvl=info msg=end pg=/api/tunnels id=b098012bcc5fd721 status=201 dur=120.824339ms
INFO:pyngrok.process.ngrok:t=2023-02-02T09:57:29+0000 lvl=info msg=start pg="/api/tunnels/http-

In [None]:
ngrok.kill()

INFO:pyngrok.process:Killing ngrok process: 78769
2023-02-02 09:57:17.028 Killing ngrok process: 78769
