In [1]:
# Imports
# data manipulation, gathering
import numpy as np # for array manipulation
import pandas as pd # for dataframe manipulation/reading in data
import json # for reading in Data
from itertools import islice # for slicing and dicing JSON records
import os # for getting the filepath information
import re # to identify characters that are to be removed
import nltk # for preprocessing of textual data
from nltk.corpus import stopwords # for removing stopwords
from nltk.tokenize import word_tokenize # for tokenizing text
from nltk.stem import WordNetLemmatizer # for lemmatizing text
from sklearn.feature_extraction.text import TfidfVectorizer # for featurizing text
from sklearn.metrics.pairwise import cosine_similarity # for getting similarity score

In [2]:
# Download the NLTK resources used by the preprocessing cells: the stopword lists and WordNet
# NOTE(review): word_tokenize (used further down) presumably also needs NLTK tokenizer data ('punkt') - confirm it is available in this environment
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Shell step: extract the WordNet archive into the NLTK corpora directory (presumably so WordNetLemmatizer can read it - confirm)
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr/share/nltk_data

### Importing the dataset

In [4]:
#Function to lazily yield the lines of the stored file, one line at a time
def extract_data(path):
    """Yield every line of the text file at ``path`` without loading the whole file.

    Parameters
    ----------
    path : str or path-like
        Location of the file to read (one JSON record per line is expected by the caller).

    Yields
    ------
    str
        Each line of the file, including its trailing newline when present.

    Raises
    ------
    OSError
        If the file cannot be opened (raised when the first line is requested,
        because this is a generator).
    """
    # The encoding is pinned to UTF-8 so reading does not depend on the platform's
    # default locale encoding (JSON text is UTF-8 by specification).
    with open(path, 'r', encoding='utf-8') as f:
        yield from f
            
#Location of the arXiv metadata snapshot (absolute Kaggle input path; adjust when running elsewhere)
PATH = '/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json'


#Creating a data generator over the JSON file; nothing is read until the generator is consumed
data_gen = extract_data(PATH)

In [5]:
#Function to parse up to N JSON records taken from the data generator
def fetch_n_records(data_gen, n):
    """Consume at most ``n`` items from ``data_gen`` and return them parsed with ``json.loads``.

    The iterator is advanced, so a later call continues where this one stopped.
    Fewer than ``n`` records are returned if the iterator runs out first.
    """
    next_lines = islice(data_gen, n)
    return list(map(json.loads, next_lines))

#Fetching the first 250000 records from the data generator for use by the recommender system
#CHUNK_SIZE is the maximum number of records read; fewer are returned if the file is shorter
CHUNK_SIZE = 250000
data = fetch_n_records(data_gen, CHUNK_SIZE)

In [6]:
#The variable data is a list of dictionaries (one per record); inspecting a single record
data[77]

{'id': '0704.0078',
 'submitter': 'Raul Vera',
 'authors': 'Marc Mars, Filipe C. Mena, Raul Vera',
 'title': 'Linear perturbations of matched spacetimes: the gauge problem and\n  background symmetries',
 'comments': '18 pages, plain LaTeX file',
 'journal-ref': 'Class.Quant.Grav.24:3673-3690,2007',
 'doi': '10.1088/0264-9381/24/14/008',
 'report-no': None,
 'categories': 'gr-qc',
 'license': None,
 'abstract': '  We present a critical review about the study of linear perturbations of\nmatched spacetimes including gauge problems. We analyse the freedom introduced\nin the perturbed matching by the presence of background symmetries and revisit\nthe particular case of spherically symmetry in n-dimensions. This analysis\nincludes settings with boundary layers such as brane world models and shell\ncosmologies.\n',
 'versions': [{'version': 'v1', 'created': 'Sun, 1 Apr 2007 10:08:25 GMT'}],
 'update_date': '2008-11-26',
 'authors_parsed': [['Mars', 'Marc', ''],
  ['Mena', 'Filipe C.', ''],
  

In [7]:
# #Resampling method to fetch records for generating user_profile and recommendation algorithm
# def split_records(data_records, profile_capacity=100, random_state=42):
#     np.random.seed(random_state)
#     np.random.shuffle(data_records)
#     profile_records, recommend_records = data_records[:profile_capacity], data_records[profile_capacity:]
#     return profile_records, recommend_records

# #Splitting the fetched records into profile and recommendation records
# profile_records, recommend_records = split_records(data_records, profile_capacity=500)

In [8]:
#Function to generate a dataframe from a list of dictionaries
def get_dataframe(list_of_dicts):
    """Return a new DataFrame built from ``list_of_dicts`` (one row per dictionary)."""
    return pd.DataFrame(list_of_dicts)

#Generating a dataframe from the list of parsed records (one row per record)
data_df = get_dataframe(data)

In [9]:
#Summary of the columns, non-null counts and dtypes of the loaded records
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   id              250000 non-null  object
 1   submitter       250000 non-null  object
 2   authors         250000 non-null  object
 3   title           250000 non-null  object
 4   comments        213517 non-null  object
 5   journal-ref     123099 non-null  object
 6   doi             151332 non-null  object
 7   report-no       21668 non-null   object
 8   categories      250000 non-null  object
 9   license         206768 non-null  object
 10  abstract        250000 non-null  object
 11  versions        250000 non-null  object
 12  update_date     250000 non-null  object
 13  authors_parsed  250000 non-null  object
dtypes: object(14)
memory usage: 26.7+ MB


In [10]:
#Keep only the columns used by the recommender.
#.copy() makes data_df an independent frame: later cells assign new/cleaned columns into it,
#which would otherwise raise SettingWithCopyWarning on a slice of the original frame.
data_df = data_df[['id','title','authors','categories', 'abstract']].copy()

In [11]:
#Persist the selected columns to data_df.csv in the working directory, without the index column
data_df.to_csv("data_df.csv",index = False)

In [12]:
#Look at one raw abstract (row label 5) before any preprocessing
data_df.loc[5, 'abstract']

'  We study the two-particle wave function of paired atoms in a Fermi gas with\ntunable interaction strengths controlled by Feshbach resonance. The Cooper pair\nwave function is examined for its bosonic characters, which is quantified by\nthe correction of Bose enhancement factor associated with the creation and\nannihilation composite particle operators. An example is given for a\nthree-dimensional uniform gas. Two definitions of Cooper pair wave function are\nexamined. One of which is chosen to reflect the off-diagonal long range order\n(ODLRO). Another one corresponds to a pair projection of a BCS state. On the\nside with negative scattering length, we found that paired atoms described by\nODLRO are more bosonic than the pair projected definition. It is also found\nthat at $(k_F a)^{-1} \\ge 1$, both definitions give similar results, where more\nthan 90% of the atoms occupy the corresponding molecular condensates.\n'

### Preprocessing

In [13]:
#Function to expand English contractions (e.g. "don't" -> "do not")
def decontracted(phrase):
    """Return ``phrase`` with common English contractions expanded.

    The rules are applied one after another in the order listed; the two
    irregular forms come first so the generic suffix rules do not split them.
    """
    rules = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

In [14]:
#Function to remove LaTeX math: everything between $...$ or $$...$$ delimiters, delimiters included
def remove_eqns(txt):
    """Return ``txt`` with inline (``$...$``) and display (``$$...$$``) math removed.

    The previous pattern (``\\$*?\\$``) matched only the dollar signs themselves,
    so the equation body was left in the text. A ``$`` with no closing partner
    is left untouched.
    """
    # $$...$$ is tried first so display math is removed as one span;
    # DOTALL lets an equation continue across a line break.
    reg = re.compile(r'\$\$.*?\$\$|\$.*?\$', re.DOTALL)
    return reg.sub(r'', txt)

In [15]:
#Function to turn every newline character into a single space
def remove_newlines(txt):
    """Return ``txt`` with each ``\\n`` replaced by one space (one space per newline)."""
    return txt.replace("\n", " ")

In [16]:
#Function to replace every run of non-alphanumeric characters with a single space
def remove_spl(txt):
    """Return ``txt`` where each run of characters outside A-Z, a-z, 0-9 becomes one space."""
    non_alnum_run = re.compile('[^A-Za-z0-9]+')
    return non_alnum_run.sub(' ', txt)

In [17]:
#Function to remove English stopwords from the text (no lemmatization is done here)
def remove_stopwords(txt):
    """Tokenize ``txt``, drop tokens found in NLTK's English stopword list, and re-join with spaces."""
    tokens = word_tokenize(txt) #Tokenizing the text
    english_stopwords = set(stopwords.words('english'))
    #Keeping only the tokens that are not stopwords, then joining them back into a string
    return ' '.join(token for token in tokens if token not in english_stopwords)

In [18]:
#Lazily-filled cache so the lemmatizer and stopword set are built once, not once per call
_LEMMATIZE_RESOURCES = {}

def _get_lemmatize_resources():
    """Return ``(lemmatizer, stop_words)``, creating them on first use only."""
    if not _LEMMATIZE_RESOURCES:
        _LEMMATIZE_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _LEMMATIZE_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _LEMMATIZE_RESOURCES['lemmatizer'], _LEMMATIZE_RESOURCES['stop_words']

#Function to remove stopwords from the text and lemmatize the words in the text
def remove_stopwords_lemmatize(txt):
    """Tokenize ``txt``, drop English stopwords, lemmatize the rest and re-join with spaces.

    This function is applied to every row of a column, so the lemmatizer and the
    stopword set are fetched from a shared cache instead of being rebuilt on
    each call. The returned string is the same as before.
    """
    words = word_tokenize(txt) #Tokenizing the text
    lemmatizer, stop_words = _get_lemmatize_resources()
    #Removing stopwords and lemmatizing the remaining words
    filtered_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    #Joining the filtered words back into a string
    return ' '.join(filtered_words)

In [19]:
def preprocess(df_column):
    """Apply the text-cleaning steps to ``df_column`` element-wise and return the cleaned result.

    Steps, in order: expand contractions, strip equations, replace newlines,
    replace special characters, lowercase, then remove stopwords and lemmatize.
    """
    cleaning_steps = (
        decontracted,
        remove_eqns,
        remove_newlines,
        remove_spl,
        lambda txt : txt.lower(), #Converting text to lowercase
        remove_stopwords_lemmatize,
    )
    for step in cleaning_steps:
        df_column = df_column.apply(step)
    return df_column

In [None]:
#Cleaning the free-text columns in place, one column at a time
for text_column in ('abstract', 'title', 'authors'):
    data_df[text_column] = preprocess(data_df[text_column])

In [None]:
#Function to replace all occurrences of . and - with _ in the given text
def replace_chars(text):
    """Return ``text`` with every '.' and '-' replaced by '_'."""
    to_underscore = str.maketrans({'.': '_', '-': '_'})
    return text.translate(to_underscore)

In [None]:
#Preprocessing categories: '.' and '-' become '_' in every category string
data_df['categories'] = data_df['categories'].map(replace_chars)

In [None]:
#Display the dataframe after preprocessing
data_df

In [None]:
#One combined text field per paper: categories, authors, title and abstract separated by single spaces
data_df['final_text'] = data_df['categories'].str.cat(
    [data_df['authors'], data_df['title'], data_df['abstract']],
    sep=" ",
)

In [None]:
#Independent copy holding only the paper id and its combined text
final_df = data_df.loc[:, ['id', 'final_text']].copy()

In [None]:
#Display the id / final_text dataframe
final_df

In [None]:
#Persist the id / final_text pairs to final_df.csv in the working directory, without the index column
final_df.to_csv("final_df.csv",index = False)

### Feature extraction

In [None]:
# TF-IDF vectorizer with the library's default settings
tfidf_vectorizer = TfidfVectorizer()

# Generate the tf-idf vectors for the data: learn the vocabulary from final_text and
# transform it in one step (one row per paper)
tfidf_matrix = tfidf_vectorizer.fit_transform(final_df['final_text'])

In [None]:
# Dimensions of the tf-idf matrix
tfidf_matrix.shape

In [None]:
#TODO Doc to Vec

### Getting recommendations

In [None]:
#Cosine similarity of every paper against the first paper (row 0), flattened to a 1-D array
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix[0])
s = cosine_sim.ravel()

In [None]:
#Row positions of the five highest similarity scores (negating s sorts in descending order)
np.argsort(-s)[:5]

In [None]:
#Similarity scores ranked 2nd to 6th (the top-ranked entry is skipped)
list(s[np.argsort(-s)[1:6]])

In [None]:
#Id of the paper stored under row label 0
final_df.loc[0, 'id']

In [None]:
def get_recommendations(paper_id:str,tfidf_matrix,num_rec):
    """Return the ids of the ``num_rec`` papers most similar to ``paper_id``.

    Parameters
    ----------
    paper_id : str
        Value from the ``id`` column of the global ``final_df``.
    tfidf_matrix : matrix
        Feature matrix whose rows are in the same order as the rows of ``final_df``.
    num_rec : int
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Paper ids ordered from most to least similar, never including the
        queried paper itself.

    Raises
    ------
    IndexError
        If ``paper_id`` is not present in ``final_df``.
    """
    id_column = final_df['id']
    # Row positions (not index labels) are used throughout, because the matrix
    # is addressed by position and final_df may not have a 0..n-1 index.
    matches = np.flatnonzero((id_column == paper_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"paper id {paper_id!r} not found in final_df")
    idx = int(matches[0])
    sim = cosine_similarity(tfidf_matrix, tfidf_matrix[idx])
    sim = sim.reshape(sim.shape[0])
    # Stable descending sort; the queried row is removed explicitly instead of
    # assuming it is ranked first (a tied duplicate document could outrank it).
    ranked = np.argsort(-sim, kind='stable')
    top_n_idx = ranked[ranked != idx][:num_rec]
    top_n_id = [id_column.iloc[x] for x in top_n_idx]
    return top_n_id

### Dimensionality Reduction