In [None]:
# Imports
# data manipulation, gathering
import numpy as np # for array manipulation
import pandas as pd # for dataframe manipulation/reading in data
import json # for reading in Data
from itertools import islice # for slicing and dicing JSON records
import os # for getting the filepath information
import re # to identify characters that are to be removed
import nltk # for preprocessing of textual data
from nltk.corpus import stopwords # for removing stopwords
from nltk.tokenize import word_tokenize # for tokenizing text
from nltk.stem import WordNetLemmatizer # for lemmatizing text
from sklearn.feature_extraction.text import TfidfVectorizer # for featurizing text
from sklearn.metrics.pairwise import cosine_similarity # for getting similarity score
from sklearn.decomposition import PCA #for dimensionality reduction
from sklearn.cluster import KMeans #for clustering
from sklearn.manifold import TSNE #For reducing to 2 dimensions for plotting

In [None]:
# Fetch the NLTK resources used during preprocessing: the English stop-word list
# and WordNet (needed by WordNetLemmatizer).
# NOTE(review): word_tokenize is also used later, but no tokenizer model is
# downloaded here - presumably it already exists in this environment; verify.
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Shell escape: extract the WordNet archive next to the zip file.
# NOTE(review): presumably needed because the lemmatizer cannot read the zipped
# corpus in this environment; the absolute path is environment-specific - confirm.
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

### Importing the dataset

In [None]:
#Function to yield data from the stored file
def extract_data(path):
    """Lazily yield the lines of the file at ``path``, one line per iteration.

    The file is only opened when the generator is first advanced and is closed
    when the generator is exhausted or garbage-collected.

    Args:
        path: Location of a text file holding one JSON document per line.

    Yields:
        str: Each raw line, including its trailing newline.
    """
    # Decode explicitly as UTF-8 (the JSON interchange encoding) instead of
    # relying on the platform default, which differs between systems.
    with open(path, 'r', encoding='utf-8') as f:
        yield from f

#Defining PATH
PATH = '/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json'


#Creating a data generator to extract data from the JSON file
data_gen = extract_data(PATH)

In [None]:
#Function to pull up to N parsed records off the data generator
def fetch_n_records(data_gen, n):
    """Consume at most ``n`` lines from ``data_gen`` and parse each one as JSON.

    Args:
        data_gen: Iterator of JSON strings (one document per item).
        n: Maximum number of items to take.

    Returns:
        list: The parsed objects, in the order they were read. Shorter than
        ``n`` when the iterator runs out first.
    """
    parsed_records = []
    for raw_line in islice(data_gen, n):
        parsed_records.append(json.loads(raw_line))
    return parsed_records

#Fetching 250000 records from the given data for the use of recommender systems
# CHUNK_SIZE caps how many lines are pulled from the generator; because data_gen
# is a generator, re-running this cell continues from where the last read stopped.
CHUNK_SIZE = 250000
data = fetch_n_records(data_gen, CHUNK_SIZE)

In [None]:
#`data` is a list of dictionaries (one per parsed JSON record); show one of them
#to see which fields a record carries
data[77]

In [None]:
#Function to turn a list of record dictionaries into a dataframe
def get_dataframe(list_of_dicts):
    """Build a DataFrame with one row per dictionary and one column per key.

    Args:
        list_of_dicts: Sequence of dictionaries, one per record.

    Returns:
        pandas.DataFrame: The tabular form of the records.
    """
    return pd.DataFrame(list_of_dicts)

#Generating dataframe from the list of records
# `data` (the list of parsed records) stays available alongside the new frame.
data_df = get_dataframe(data)

In [None]:
# Overview of the columns, their dtypes and non-null counts
data_df.info()

In [None]:
# Keep only the columns the recommender uses. The explicit .copy() makes data_df
# an independent frame, so the column assignments made to it in later cells act
# on this frame directly and do not raise SettingWithCopyWarning.
data_df = data_df[['id','title','authors','categories', 'abstract']].copy()

In [None]:
# Persist the selected, still unprocessed columns to the working directory
# (index=False keeps the row index out of the file).
data_df.to_csv("data_df.csv",index = False)

In [None]:
# Look at one abstract (row label 5) before any cleaning is applied
data_df['abstract'][5]

### Preprocessing

In [None]:
#Function to expand English contractions
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The rules run in the listed order: the two irregular forms first, then
    the generic suffixes. Matching is case-sensitive and only the straight
    apostrophe (') is recognised.

    Args:
        phrase: Text to expand.

    Returns:
        str: The text with the contractions replaced by their long forms.
    """
    rules = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

In [None]:
#Function to remove all characters between $ characters (inline and display math)

# Matches `$$ ... $$` first, then `$ ... $`. Non-greedy so that each equation is
# matched on its own; DOTALL lets an equation span a line break.
_EQUATION_RE = re.compile(r'\$\$.*?\$\$|\$.*?\$', re.DOTALL)

def remove_eqns(txt):
    """Delete every ``$``-delimited equation, delimiters included.

    The earlier pattern ``\\$*?\\$`` only matched the dollar signs themselves,
    so the equation body was left in the text.

    Args:
        txt: Text that may contain LaTeX math between dollar signs.

    Returns:
        str: ``txt`` with each ``$...$`` / ``$$...$$`` span removed. A dollar
        sign without a closing partner is left untouched.
    """
    return _EQUATION_RE.sub('', txt)

In [None]:
#Function to turn every newline character into a single space
def remove_newlines(txt):
    """Return ``txt`` with each ``\\n`` replaced by one space."""
    newline_pattern = re.compile(r'\n')
    return newline_pattern.sub(" ", txt)

In [None]:
#Function to collapse every run of non-alphanumeric characters into one space
def remove_spl(txt):
    """Replace each run of characters outside A-Z, a-z and 0-9 with a single space.

    Whitespace and underscores count as special characters too, so any run of
    them becomes one space.
    """
    special_run = re.compile('[^A-Za-z0-9]+')
    return special_run.sub(' ', txt)

In [None]:
#Function to remove stopwords from the text (no lemmatization in this variant)
def remove_stopwords(txt):
    """Drop English stop words from ``txt``.

    The text is tokenized, tokens found in NLTK's English stop-word list are
    discarded, and the remaining tokens are joined with single spaces. The
    comparison is case-sensitive.

    Args:
        txt: Text to filter.

    Returns:
        str: The surviving tokens separated by spaces.
    """
    tokens = word_tokenize(txt)
    english_stops = set(stopwords.words('english'))
    return ' '.join(token for token in tokens if token not in english_stops)

In [None]:
#Function to remove stopwords from the text and lemmatize the words in the text

# Lemmatizer and stop-word set shared by all calls. They are created on the first
# call rather than at definition time, so a missing corpus still surfaces when
# the function is used, exactly as before.
_STOPWORD_LEMMA_RESOURCES = {}

def remove_stopwords_lemmatize(txt):
    """Drop English stop words from ``txt`` and lemmatize what remains.

    The function is applied once per row, so the stop-word list and the
    lemmatizer are built once and reused instead of being recreated on every
    call.

    Args:
        txt: Text to clean. The stop-word comparison is case-sensitive.

    Returns:
        str: Lemmatized, non-stop-word tokens joined with single spaces.
    """
    words = word_tokenize(txt) #Tokenizing the text
    if 'stop_words' not in _STOPWORD_LEMMA_RESOURCES:
        # 'stop_words' is stored last so a failed corpus load leaves the cache
        # unmarked and the next call retries.
        _STOPWORD_LEMMA_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _STOPWORD_LEMMA_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    lemmatizer = _STOPWORD_LEMMA_RESOURCES['lemmatizer']
    stop_words = _STOPWORD_LEMMA_RESOURCES['stop_words']
    filtered_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words] #Removing stopwords and lemmatizing the words
    return ' '.join(filtered_words) #Joining the filtered words back into a string

In [None]:
def preprocess(df_column):
    """Run the full text-cleaning pipeline over a column of strings.

    Steps, in order: expand contractions, strip equations, replace newlines,
    remove special characters, lowercase, then remove stop words and lemmatize.

    Args:
        df_column: pandas Series of strings.

    Returns:
        pandas.Series: A new Series holding the cleaned text.
    """
    pipeline = (
        decontracted,
        remove_eqns,
        remove_newlines,
        remove_spl,
        lambda txt : txt.lower(), #Converting text to lowercase
        remove_stopwords_lemmatize,
    )
    for step in pipeline:
        df_column = df_column.apply(step)
    return df_column

In [None]:
# Clean the free-text columns in place; each column is replaced by its
# preprocessed version, so this cell should be run once per freshly built data_df.
for text_column in ('abstract', 'title', 'authors'):
    data_df[text_column] = preprocess(data_df[text_column])

In [None]:
#Function to replace all occurrences of . and - with _ in the given text
def replace_chars(text):
    """Return ``text`` with every '.' and '-' turned into '_'.

    The call site applies this to the category codes.
    """
    to_underscore = str.maketrans({'.': '_', '-': '_'})
    return text.translate(to_underscore)

In [None]:
#Preprocessing categories: swap '.' and '-' for '_' in every category string
data_df['categories'] = data_df['categories'].map(replace_chars)

In [None]:
# Inspect the frame after preprocessing
data_df

In [None]:
# One space-separated document per paper: categories, authors, title, abstract.
# This combined text is what the TF-IDF vectorizers are fitted on later.
data_df['final_text'] = data_df['categories'] + " " + data_df['authors'] + " " + data_df['title'] + " " + data_df['abstract']

In [None]:
# Independent copy holding just the paper id and its combined text
final_df = data_df[['id','final_text']].copy()

In [None]:
# Persist the id/text pairs to the working directory (row index not written)
final_df.to_csv("final_df.csv",index = False)

### Feature extraction

In [None]:
# Load the two frames from an input dataset rather than from the variables above.
# NOTE(review): these paths are not the data_df.csv / final_df.csv files written
# earlier in this notebook - confirm they hold the same rows in the same order,
# since later cells index initial_df with masks built from final_df.
# NOTE(review): a later cell looks a paper up with the number 704.0001, so 'id'
# is presumably parsed as a float here, which would drop the leading zeros of
# arXiv ids; reading with dtype={'id': str} would preserve them - verify.
initial_df = pd.read_csv("/kaggle/input/research-paper-data/initial_df")
final_df = pd.read_csv("/kaggle/input/research-paper-data/final_df")

In [None]:
# TF-IDF vectorizer with default settings (no cap on vocabulary size)
tfidf_vectorizer = TfidfVectorizer()

# Generate the tf-idf vectors for the data
tfidf_matrix = tfidf_vectorizer.fit_transform(final_df['final_text'])

In [None]:
# (number of documents, vocabulary size)
tfidf_matrix.shape

### Getting recommendations

In [None]:
# Similarity of every paper to the paper in row 0, flattened from a column
# into a 1-D vector of scores
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix[0])
s = cosine_sim.ravel()

In [None]:
# Row positions of the five highest scores (negating s makes argsort rank in
# descending order); the query row itself is expected to be among them.
np.argsort(-s)[:5]

In [None]:
# Scores at ranks 1-5, skipping rank 0 (presumably the query paper itself)
[s[i] for i in np.argsort(-s)[1:6]]

In [None]:
# Id stored at row label 0 - shows the form in which ids must be passed to a lookup
final_df['id'][0]

In [None]:
def get_recommendations(paper_id:str,tfidf_matrix,num_rec):
    """Return the ids of the ``num_rec`` papers most similar to ``paper_id``.

    Similarity is the cosine similarity between rows of ``tfidf_matrix``. Rows
    are addressed by position, so the result does not depend on ``final_df``
    having a default 0..n-1 index. The queried paper is excluded explicitly
    instead of assuming it sorts first, which is not guaranteed when another
    paper has identical text.

    Args:
        paper_id: Id to look up in the module-level ``final_df['id']``. It must
            compare equal to the stored value, so it has to have the same type
            as that column.
        tfidf_matrix: Feature matrix whose i-th row belongs to the i-th row of
            ``final_df``.
        num_rec: Maximum number of ids to return.

    Returns:
        list: Ids ordered from most to least similar.

    Raises:
        IndexError: If ``paper_id`` does not occur in ``final_df['id']``.
    """
    matches = np.flatnonzero((final_df['id'] == paper_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"paper id {paper_id!r} not found in final_df['id']")
    query_pos = matches[0]

    sim = cosine_similarity(tfidf_matrix, tfidf_matrix[query_pos]).ravel()
    # Stable sort keeps the ordering of tied scores reproducible between runs.
    ranked = np.argsort(-sim, kind='stable')
    top_n_pos = ranked[ranked != query_pos][:num_rec]
    return [final_df['id'].iloc[pos] for pos in top_n_pos]

### Dimensionality Reduction

In [None]:
# Second vectorizer limited to 10000 features; a subset of its rows is
# converted to a dense array for PCA further down, so the width is capped here.
tfidf_vectorizer2 = TfidfVectorizer(max_features=10000)

# Generate the tf-idf vectors for the data
tfidf_matrix2 = tfidf_vectorizer2.fit_transform(final_df['final_text'])

In [None]:
# 1000 nearest neighbours of one paper, ranked on the full TF-IDF matrix.
# The id is given as a number here, so it only matches if final_df['id'] is numeric.
rec = get_recommendations(704.0001, tfidf_matrix, num_rec=1000)

# Rows of the recommended papers in the capped matrix; they come out in
# final_df order, not in similarity order.
rec_mask = final_df['id'].isin(rec)
idxs = final_df.index[rec_mask].tolist()
rec_matrix = tfidf_matrix2[idxs]

In [None]:
# A fractional n_components lets PCA pick the number of components itself
pca = PCA(n_components=0.95, random_state=42) #Keep 95% of the variance
# The sparse rows are converted to a dense array before being handed to PCA
reduced_matrix = pca.fit_transform(rec_matrix.toarray())

In [None]:
# (number of recommended papers, number of principal components kept)
reduced_matrix.shape

In [None]:
# Number of clusters; k is reused below to size the per-cluster vectorizers,
# LDA models and keyword lookup.
k = 10 # selectable
kmeans = KMeans(n_clusters=k, random_state=42)
# y_pred holds one cluster label per row of reduced_matrix
y_pred = kmeans.fit_predict(reduced_matrix)

In [None]:
# Project the PCA-reduced vectors down for plotting; only the first two output
# columns are used by the plots below.
tsne = TSNE(perplexity=100, random_state=42)
two_dim_matrix = tsne.fit_transform(reduced_matrix)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# 5x5 figure for the static scatter plot
sns.set(rc={'figure.figsize':(5,5)})

# Static view of the t-SNE embedding, one colour per K-Means cluster
tsne_x, tsne_y = two_dim_matrix[:,0], two_dim_matrix[:,1]
ax = sns.scatterplot(x=tsne_x, y=tsne_y, hue=y_pred, legend='full', palette="Set1")
ax.set_title('t-SNE with Kmeans Labels')
plt.savefig("cluster_tsne.png")
plt.show()

In [None]:
import plotly.express as px
# Interactive view of the same embedding; hovering shows the paper id and title.
# The mask is built from final_df and applied to initial_df, so both frames must
# share the same index.
rec_rows = initial_df[final_df['id'].isin(rec)]
fig = px.scatter(
    rec_rows,
    x=two_dim_matrix[:,0],
    y=two_dim_matrix[:,1],
    color=y_pred.astype(str),
    hover_data=['id','title'],
    height= 525,
    width=525,
    title = "Clustered Papers",
)
fig.show()

# Topic Modelling For Keyword Extraction

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# One independent CountVectorizer per cluster, so every cluster gets its own
# vocabulary. Tokens are runs of at least three letters/hyphens. The pattern is
# a raw string: in a normal string literal `\-` is an invalid escape sequence
# and triggers a warning on recent Python versions.
vectorizers = [
    CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True,
                    token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
    for _ in range(k)
]

In [None]:
# Rows of the recommended papers (mask built on final_df, applied to initial_df)
rec_rows = initial_df[final_df['id'].isin(rec)]

# Frame used for topic modelling: id, title, title+abstract text and cluster label
topic_df = pd.DataFrame()
topic_df['id'] = rec_rows['id']
topic_df['title'] = rec_rows['title']
topic_df['text'] = rec_rows['title']+" "+rec_rows['abstract']
topic_df['cluster'] = y_pred

In [None]:
# Term-count matrix per cluster; None marks a cluster that could not be vectorized
vectorized_data = []

for current_cluster, cvec in enumerate(vectorizers):
    # Select the cluster's documents outside the try block so that a lookup
    # problem is not misreported as a vectorization failure.
    cluster_text = topic_df.loc[topic_df['cluster'] == current_cluster, 'text']
    try:
        vectorized_data.append(cvec.fit_transform(cluster_text))
    except ValueError as err:
        # CountVectorizer raises ValueError when no vocabulary survives the
        # min_df/max_df limits; include the reason instead of discarding it.
        print("Not enough instances in cluster: " + str(current_cluster) + " (" + str(err) + ")")
        vectorized_data.append(None)

In [None]:
NUM_TOPICS_PER_CLUSTER = 5 #choose

# One unfitted Latent Dirichlet Allocation model per cluster, all configured alike
lda_models = [
    LatentDirichletAllocation(
        n_components=NUM_TOPICS_PER_CLUSTER,
        max_iter=10,
        learning_method='online',
        verbose=False,
        random_state=42,
    )
    for _ in range(k)
]

In [None]:
# Fit each cluster's LDA model on that cluster's term counts and collect the
# document-topic matrices. Clusters without counts are skipped, so positions in
# this list do not line up with cluster numbers when a cluster was skipped.
clusters_lda_data = []

for current_cluster, lda in enumerate(lda_models):
    cluster_counts = vectorized_data[current_cluster]
    # Identity check: `!= None` would ask the sparse matrix to compare itself
    # with None instead of testing whether a matrix is present.
    if cluster_counts is not None:
        clusters_lda_data.append(lda.fit_transform(cluster_counts))

In [None]:
def selected_topics(model, vectorizer, top_n=3):
    """Build a keyword string from the strongest words of every topic.

    For each row of ``model.components_`` the ``top_n`` highest-weighted
    vocabulary entries are taken. A word is kept only the first time it is
    seen, with the weight it had in that topic. The kept words are returned
    ordered by descending weight.

    Args:
        model: Fitted topic model exposing ``components_`` (topics x vocabulary).
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        top_n: Number of words taken from each topic.

    Returns:
        str: The selected words joined by single spaces.
    """
    # Fetch the vocabulary once; the accessor builds the whole array per call.
    feature_names = vectorizer.get_feature_names_out()
    seen_words = set()
    keywords = []

    for topic in model.components_:
        # argsort is ascending, so walk backwards from the end for the top_n
        for i in topic.argsort()[:-top_n - 1:-1]:
            word = feature_names[i]
            if word not in seen_words:
                seen_words.add(word)
                keywords.append((word, topic[i]))

    # Ascending stable sort followed by reverse keeps the established ordering
    # of equally weighted words.
    keywords.sort(key = lambda pair: pair[1])
    keywords.reverse()
    return " ".join(word for word, _weight in keywords)

In [None]:
# Keyword string per cluster. The list is later indexed by cluster number, so
# every cluster must contribute exactly one entry: a cluster that could not be
# vectorized gets a placeholder label instead of being skipped, which would
# shift the following entries and make the list shorter than k.
all_keywords = []
for current_vectorizer, lda in enumerate(lda_models):
    # Identity check rather than `!= None`, which is not a reliable presence
    # test for a sparse matrix.
    if vectorized_data[current_vectorizer] is not None:
        all_keywords.append(selected_topics(lda, vectorizers[current_vectorizer]))
    else:
        all_keywords.append("cluster " + str(current_vectorizer) + " (no keywords)")

In [None]:
# Cluster number -> keyword string, then the keyword string of every paper's cluster
cluster_keyword = {cluster: all_keywords[cluster] for cluster in range(k)}
word_pred = [cluster_keyword.get(label) for label in y_pred]

In [None]:
# Attach each paper's cluster keywords; word_pred is in the same row order as y_pred
topic_df['keywords'] = word_pred

In [None]:
# Link every paper to its arXiv abstract page. The statement previously had no
# right-hand side, which is a SyntaxError.
# NOTE(review): the intended value was not recorded; an abstract-page URL built
# from the id is the assumed intent. It is only correct if 'id' holds the arXiv
# identifier as text - if the id column was parsed as a number, its leading and
# trailing zeros are lost and the URL will not resolve. Confirm.
topic_df['link'] = "https://arxiv.org/abs/" + topic_df['id'].astype(str)

In [None]:
# Same embedding as before, coloured by each cluster's keyword string so the
# legend describes what a cluster is about; hover shows the paper id and title.
fig = px.scatter(topic_df, x=two_dim_matrix[:,0], y=two_dim_matrix[:,1], color='keywords',
                 hover_data=['id','title'],
                 height= 500, width=1200,
                title = "Clustered Papers")
fig.show()

In [None]:
# Export the interactive figure to an HTML file in the working directory
fig.write_html("plot.html")