In [34]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pickle

In [2]:
# Load the arXiv metadata table from disk.
# low_memory=False is passed through to pandas' CSV parser.
df = pd.read_csv(
    '../dataset/arxiv-metadata-oai.csv',
    low_memory=False,
)

In [3]:
# Replace every missing value with the placeholder string 'None' so that
# later cells can detect missing entries with a plain equality check.
df = df.fillna(value='None')

In [4]:
# Share of rows whose `doi` / `journal_ref` equals the 'None' placeholder
# (the value used to fill missing entries earlier in the notebook).
row_count = len(df)
doi_missing_rate = len(df.loc[df['doi'] == 'None']) / row_count
journal_ref_missing_rate = len(df.loc[df['journal_ref'] == 'None']) / row_count

# Report both rates rounded to two decimals.
print('doi missing rate: ' + str(round(doi_missing_rate, 2)))
print('journal_ref missing rate: ' + str(round(journal_ref_missing_rate, 2)))

doi missing rate: 0.7
journal_ref missing rate: 0.83


In [5]:
# Remove both the `journal_ref` and `doi` columns from the frame.
df = df.drop(['journal_ref', 'doi'], axis=1)

In [6]:
# Keep only the first row for each distinct abstract.
# Reassign instead of using inplace=True so this cell follows the same
# `df = df....` pattern as the other cleaning cells.
df = df.drop_duplicates(subset=['abstract'])

In [8]:
# Draw a random subset of 50,000 rows; the fixed random_state makes the
# same rows come back on every run.
df = df.sample(n=50000, random_state=42)

In [9]:
# Collect every abstract as a plain string, replacing each newline
# character with a space.
# Iterating the column directly avoids DataFrame.iterrows(), which builds a
# full Series for every row just to read one field.
corpus = [abstract.replace('\n', ' ') for abstract in df['abstract']]

In [37]:
# Fit a TF-IDF model on the abstracts and build the document-term matrix.
# stop_words='english' removes scikit-learn's built-in English stop words;
# min_df=0.02 (a float) ignores terms found in fewer than 2% of documents.
vectorizer = TfidfVectorizer(stop_words='english', min_df=0.02)
doc_term_matrix = vectorizer.fit_transform(corpus)

# Densify into a DataFrame with one column per vocabulary term.
# Column names must come from the fitted `vectorizer`; the earlier code read
# them from an undefined name (`cv`), which raises NameError on a fresh kernel.
vec = pd.DataFrame(
    doc_term_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

In [36]:
# Persist the TF-IDF document-term matrix.
# The earlier code dumped `X`, a name that is never defined in this notebook;
# the object this file is meant to hold is `doc_term_matrix`.
with open('../pkl/doc_term_matrix.pkl', 'wb') as f:
    pickle.dump(doc_term_matrix, f)

In [38]:
# Serialize the vectorizer object to disk with pickle.
with open('../pkl/vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [21]:
# Preview the first five rows of the TF-IDF feature table.
vec.head(n=5)

Unnamed: 0,10,2d,3d,ability,able,access,according,account,accuracy,accurate,...,wave,way,weak,wide,widely,work,works,world,years,zero
0,0.0,0.0,0.0,0.0,0.0,0.637278,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156212,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.1277,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Write the TF-IDF feature table to CSV, without the index column.
vec.to_csv(
    '../dataset/tfidf.csv',
    index=False,
)
# Write the sampled metadata frame to CSV, also without the index column.
df.to_csv(
    '../dataset/arxiv_sampled.csv',
    index=False,
)