In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from scipy.spatial.distance import cosine

In [None]:
df = pd.read_csv('arxiv_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52612 entries, 0 to 52611
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   titles     52612 non-null  object
 1   summaries  52608 non-null  object
 2   terms      52605 non-null  object
dtypes: object(3)
memory usage: 1.2+ MB


In [None]:
df.head()

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


In [None]:
df.drop(['terms'],inplace =True, axis=1)
df.sample(3)

Unnamed: 0,titles,summaries
36151,Dual Control for Approximate Bayesian Reinforc...,"Control of non-episodic, finite-horizon dynami..."
3743,Unsupervised Learning of Long-Term Motion Dyna...,We present an unsupervised representation lear...
16085,Self-Paced Video Data Augmentation with Dynami...,There is an urgent need for an effective video...


In [None]:
df.duplicated().sum()

12789

In [None]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38985 entries, 0 to 51772
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   titles     38985 non-null  object
 1   summaries  38985 non-null  object
dtypes: object(2)
memory usage: 913.7+ KB


In [None]:
df= df.iloc[:1000, :]
# preprocessing
import re
import nltk
nltk.download('omw-1.4')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    
    text=str(text)
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # Remove stop words
    #tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize the tokens
    #tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return " ".join(tokens)

def preprocess_dataframe(df, column_name):
    df[column_name] = df[column_name].apply(preprocess_text)
    return df

proc_df = preprocess_dataframe(df, 'titles')
proc_df = preprocess_dataframe(df, 'summaries')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:

import spacy

# Load the English model
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

# Apply the lemmatization function to each row of the dataframe
df["titles"] = df["titles"].apply(lemmatize_text)
df["summaries"] = df["summaries"].apply(lemmatize_text)




In [None]:
proc_df

Unnamed: 0,titles,summaries
0,survey on semantic stereo matching semantic de...,stereo matching is one of the widely used tech...
1,future ai guiding principles and consensus rec...,the recent advancements in artificial intellig...
2,enforcing mutual consistency of hard regions f...,in this paper we proposed a novel mutual consi...
3,parameter decoupling strategy for semi supervi...,consistency training has proven to be an advan...
4,background foreground segmentation for interio...,to ensure safety in automated driving the corr...
...,...,...
995,deepigeos a deep interactive geodesic framewor...,accurate medical image segmentation is essenti...
996,d densely convolutional networks for volumetri...,in the isointense stage the accurate volumetri...
997,ui net interactive artificial neural networks ...,for complex segmentation tasks fully automatic...
998,one shot learning for semantic segmentation,low shot learning methods for image classifica...


In [None]:
from google.colab import files
proc_df.to_csv("MLpaperembed.csv")
files.download("MLpaperembed.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [19]:
!pip install tensorflow --quiet

In [6]:
df = pd.read_csv("MLpaperembed.csv",index_col=[0])
df['title_summaries'] = df['titles'] + " " + df['summaries']
df.head()

Unnamed: 0,titles,summaries,title_embeddings,summary_embeddings,title_summaries
0,survey on semantic stereo matching semantic de...,stereo matching is one of the widely used tech...,[[-0.46689484 -0.2965702 -0.6477647 ... -0.2...,[[-0.43845478 -0.23303717 -0.22184424 ... -0.1...,survey on semantic stereo matching semantic de...
1,future ai guiding principles and consensus rec...,the recent advancements in artificial intellig...,[[-0.40950543 -0.1522009 -0.25473914 ... -0.5...,[[-5.31480730e-01 1.24591634e-01 -8.61039683e...,future ai guiding principles and consensus rec...
2,enforcing mutual consistency of hard regions f...,in this paper we proposed a novel mutual consi...,[[-0.61176145 -0.33108956 -0.29513532 ... -0.2...,[[-0.64391226 -0.27200112 -0.02847539 ... -0.2...,enforcing mutual consistency of hard regions f...
3,parameter decoupling strategy for semi supervi...,consistency training has proven to be an advan...,[[-5.8726233e-01 -4.2740965e-01 -1.1960147e-01...,[[-0.41404715 -0.15180549 0.09761048 ... -0.4...,parameter decoupling strategy for semi supervi...
4,background foreground segmentation for interio...,to ensure safety in automated driving the corr...,[[-0.34370944 -0.2880199 -0.18248755 ... -0.4...,[[-0.53730506 0.03136821 0.08903427 ... -0.7...,background foreground segmentation for interio...


In [21]:
new_df = df[['titles','title_summaries']]
new_df.head()

Unnamed: 0,titles,title_summaries
0,survey on semantic stereo matching semantic de...,survey on semantic stereo matching semantic de...
1,future ai guiding principles and consensus rec...,future ai guiding principles and consensus rec...
2,enforcing mutual consistency of hard regions f...,enforcing mutual consistency of hard regions f...
3,parameter decoupling strategy for semi supervi...,parameter decoupling strategy for semi supervi...
4,background foreground segmentation for interio...,background foreground segmentation for interio...


In [27]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from keras.callbacks import ModelCheckpoint

data = new_df

# Convert the title_summaries column to integers using LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(data['title_summaries'])
encoded_labels = label_encoder.transform(data['title_summaries'])

# Tokenize the text and query columns
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['title_summaries'].tolist() + data['titles'].tolist())

# Get the maximum sequence length
max_sequence_length = max([len(x.split()) for x in data['title_summaries'].tolist() + data['titles'].tolist()])

# Convert the text and query columns to sequences
text_sequences = tokenizer.texts_to_sequences(data['title_summaries'].tolist())
text_sequences_padded = pad_sequences(text_sequences, maxlen=max_sequence_length)
query_sequences = tokenizer.texts_to_sequences(data['titles'].tolist())
query_sequences_padded = pad_sequences(query_sequences, maxlen=max_sequence_length)

# Define the LSTM model architecture
input_layer = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128, input_length=max_sequence_length)(input_layer)
lstm_layer = LSTM(128)(embedding_layer)
output_layer = Dense(len(data['title_summaries'].unique()), activation='softmax')(lstm_layer)
model = Model(inputs=input_layer, outputs=output_layer)


# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

checkpoint_name = 'best_model.hdf5'
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
callbacks_list = [checkpoint]

# Train the model with one-hot encoded labels
model.fit(text_sequences_padded, tf.keras.utils.to_categorical(encoded_labels), epochs=20,callbacks=callbacks_list)

# Retrieve the most similar text to the query
#query_sequence_padded = pad_sequences(tokenizer.texts_to_sequences(['decoupling strategy']), maxlen=max_sequence_length)
#query_embedding = model.layers[1](query_sequence_padded)
#query_lstm = model.layers[2](query_embedding)
#cosine_similarities = cosine_similarity(query_lstm.numpy(), text_sequences_padded)
#most_similar_text_index = np.argmax(cosine_similarities)
#most_similar_text = label_encoder.inverse_transform([most_similar_text_index])[0]
#most_similar_text 


Epoch 1/20



Epoch 2/20



Epoch 3/20



Epoch 4/20



Epoch 5/20



Epoch 6/20



Epoch 7/20



Epoch 8/20



Epoch 9/20



Epoch 10/20



Epoch 11/20



Epoch 12/20



Epoch 13/20



Epoch 14/20



Epoch 15/20



Epoch 16/20



Epoch 17/20



Epoch 18/20



Epoch 19/20



Epoch 20/20





<keras.callbacks.History at 0x7fbd5f1b47c0>

In [None]:

tf.keras.backend.clear_session()
# Example query sentence
query = "Mutual consistency"

# Load the dataframe with the summaries column
df = pd.read_csv("MLpaperembed.csv",index_col=[0])
df['title_summaries'] = df['titles'] + " " + df['summaries']
df.drop(['title_embeddings','summary_embeddings','titles','summaries'], inplace=True, axis=1)

# Tokenize the summaries and the query
tokenizer = Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True, split=' ')
tokenizer.fit_on_texts(df['summaries'].append(pd.Series(query)))
encoded_summaries = tokenizer.texts_to_sequences(df['summaries'])
encoded_query = tokenizer.texts_to_sequences([query])

# Pad the encoded summaries and the encoded query to the maximum length
max_length = max([len(seq) for seq in encoded_summaries + encoded_query])
padded_summaries = pad_sequences(encoded_summaries, maxlen=max_length, padding='post')
padded_query = pad_sequences(encoded_query, maxlen=max_length, padding='post')

# Build the model

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(len(tokenizer.word_index) + 1, 128, input_length=max_length))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
model.add(tf.keras.layers.GlobalMaxPool1D())
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Get the embeddings of the summaries and the query
summaries_embeddings = model.predict(padded_summaries)
query_embedding = model.predict(padded_query)

# Calculate the cosine similarity between the query embedding and each of the summary embeddings
similarities = [1 - cosine(query_embedding[0], summary_embedding[0]) for summary_embedding in summaries_embeddings]

# Get the index of the closest summary to the query
closest_summary_index = np.argpartition(similarities,5)[-5:]

# Get the closest title and summary
closest_title = df['titles'][closest_summary_index]
#closest_summary = df['summaries'][closest_summary_index]
print(closest_title)
#print(closest_summary)

995    deepigeos a deep interactive geodesic framewor...
996    d densely convolutional networks for volumetri...
2      enforcing mutual consistency of hard regions f...
998          one shot learning for semantic segmentation
999    exploring and exploiting diversity for image s...
Name: titles, dtype: object


In [None]:
!pip install streamlit
! pip install pyngrok

In [None]:
%%writefile app.py
import streamlit as st
import ast
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Load the dataset of ML papers
df = pd.read_csv("df_final_small.csv")
df.drop(['0'],inplace=True, axis=1)

st.write("# Machine Learning Paper Recommendation System")
query = st.text_input("Enter the paper title and summary:")
model = SentenceTransformer('bert-base-nli-mean-tokens')
if query:
    # Encode the queries
    query_embedding = model.encode([query])[0] # Using pre-trained NLP model

    # Calculate the cosine similarity
    paper_embeddings = [ast.literal_eval(e) for e in df["paper_embeddings"]]
    paper_embeddings = np.array(paper_embeddings)
    scores = cosine_similarity(paper_embeddings, query_embedding.reshape(1, -1))

    # Select the top results
    top_k = 5
    top_results = df.iloc[np.argsort(scores.flatten())[-top_k:][::-1]]

    # Present the recommendations
    st.write("Top {} recommended papers:".format(top_k))
    st.write(top_results[["title", "summary"]])
else:
    st.write("Enter a paper title and summary to get recommendations.")


Overwriting app.py


In [None]:
from pyngrok import ngrok

ngrok.set_auth_token("2LArVCZnMmuPMV0uYjEx4zAOSpx_6AiMAuvnRpVMQxFzSkvaE") 


!nohup streamlit run app.py --server.port 80 &
url = ngrok.connect(port = '80')
print(url)

INFO:pyngrok.process:Updating authtoken for default "config_path" of "ngrok_path": /usr/local/lib/python3.8/dist-packages/pyngrok/bin/ngrok
2023-02-02 09:57:29.021 Updating authtoken for default "config_path" of "ngrok_path": /usr/local/lib/python3.8/dist-packages/pyngrok/bin/ngrok


nohup: appending output to 'nohup.out'


INFO:pyngrok.ngrok:Opening tunnel named: http-80-785684b0-a4cb-4f5c-8921-cfa980416e40
2023-02-02 09:57:29.289 Opening tunnel named: http-80-785684b0-a4cb-4f5c-8921-cfa980416e40
INFO:pyngrok.process.ngrok:t=2023-02-02T09:57:29+0000 lvl=info msg="no configuration paths supplied"
2023-02-02 09:57:29.431 t=2023-02-02T09:57:29+0000 lvl=info msg="no configuration paths supplied"
INFO:pyngrok.process.ngrok:t=2023-02-02T09:57:29+0000 lvl=info msg="using configuration at default config path" path=/root/.ngrok2/ngrok.yml
2023-02-02 09:57:29.444 t=2023-02-02T09:57:29+0000 lvl=info msg="using configuration at default config path" path=/root/.ngrok2/ngrok.yml
INFO:pyngrok.process.ngrok:t=2023-02-02T09:57:29+0000 lvl=info msg="open config file" path=/root/.ngrok2/ngrok.yml err=nil
2023-02-02 09:57:29.460 t=2023-02-02T09:57:29+0000 lvl=info msg="open config file" path=/root/.ngrok2/ngrok.yml err=nil
INFO:pyngrok.process.ngrok:t=2023-02-02T09:57:29+0000 lvl=info msg="starting web service" obj=web addr

NgrokTunnel: "http://7541-34-134-231-216.ngrok.io" -> "http://localhost:80"


2023-02-02 09:57:29.767 t=2023-02-02T09:57:29+0000 lvl=info msg="started tunnel" obj=tunnels name="http-80-785684b0-a4cb-4f5c-8921-cfa980416e40 (http)" addr=http://localhost:80 url=http://7541-34-134-231-216.ngrok.io
INFO:pyngrok.process.ngrok:t=2023-02-02T09:57:29+0000 lvl=info msg="started tunnel" obj=tunnels name=http-80-785684b0-a4cb-4f5c-8921-cfa980416e40 addr=http://localhost:80 url=https://7541-34-134-231-216.ngrok.io
2023-02-02 09:57:29.780 t=2023-02-02T09:57:29+0000 lvl=info msg="started tunnel" obj=tunnels name=http-80-785684b0-a4cb-4f5c-8921-cfa980416e40 addr=http://localhost:80 url=https://7541-34-134-231-216.ngrok.io
INFO:pyngrok.process.ngrok:t=2023-02-02T09:57:29+0000 lvl=info msg=end pg=/api/tunnels id=b098012bcc5fd721 status=201 dur=120.824339ms
2023-02-02 09:57:29.791 t=2023-02-02T09:57:29+0000 lvl=info msg=end pg=/api/tunnels id=b098012bcc5fd721 status=201 dur=120.824339ms
INFO:pyngrok.process.ngrok:t=2023-02-02T09:57:29+0000 lvl=info msg=start pg="/api/tunnels/http-

In [None]:
ngrok.kill()

NameError: ignored