# Article recommendation system: introduction
The article recommendation system uses natural language processing and an unsupervised k-nearest-neighbour model to recommend the k most relevant articles based on a paper's title and abstract. It does so by vectorizing the provided paper's text and using kNN to find the papers in the database that are "closest" to it. In addition, the system provides a filter, accurate to the month, that removes papers published earlier than a specified year and month.

# Insight Deriving:
The COVID-19 Open Research Dataset (CORD-19) contains a very large number of papers.
The system helps users find the papers in the database that are relevant to a given paper, simply by providing that paper's title and abstract.

# Install and import necessary packages

In [1]:
!pip install tqdm_notebook > /dev/null 2>&1

In [2]:
!pip install scispacy > /dev/null 2>&1

In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz > /dev/null 2>&1

In [None]:
#Import Libraries

import csv
import random
import numpy as np
import pandas as pd
import html
import re

# Plotting
import matplotlib.pyplot as plt

import scispacy
import spacy
import en_core_sci_lg

%matplotlib inline

In [None]:
# Register tqdm's `progress_apply` on pandas objects in every environment:
# clean_df() relies on it, so the registration must not be limited to Colab.
from tqdm.notebook import tqdm_notebook as tqdm
tqdm.pandas()

# Google Drive is only mounted when the notebook runs on Colab.
if 'google.colab' in str(get_ipython()):
  from google.colab import drive
  drive.mount('/content/drive')

**Please modify the following path to run in your local machine**

In [None]:
base_path = "/content/drive/MyDrive/Colab Notebooks/data/"

# Data cleaning

In [None]:
df = pd.read_csv(base_path+"metadata.csv", low_memory=False)
df.info(verbose=True)
df.head()

In [None]:
# Keep only the metadata fields that are useful to our application
# (the CSV itself has already been loaded into `df`).

df = df[['title','abstract','publish_time', 'cord_uid', 'who_covidence_id']]
# Rows sharing the same abstract are treated as duplicates; the first one is kept.
df = df.drop_duplicates(subset='abstract', keep="first")
# Drop every row with a missing value in any of the selected fields
# (this also removes the papers that have no who_covidence_id).
df=df.dropna()
# Lower-case the text columns.
df["abstract"] = df["abstract"].str.lower()
df["title"] = df["title"].str.lower()

Only articles related to COVID-19 are kept for this project. I assume that papers with a who_covidence_id are related to COVID-19; rows without one are removed by the `dropna()` call above.

In [None]:
df.shape

In [None]:
df.head()

We use scispaCy for preprocessing, which is a Python package containing spaCy models for processing scientific text.

In [None]:
nlp = en_core_sci_lg.load(disable=["parser", "ner"])
nlp.max_length = 2000000

In [None]:
def clean_text(sentence):
  '''
  Clean one raw string for vectorization.

  HTML character references are unescaped and URL-like substrings (anything
  starting with "http" up to the next whitespace) are removed. The text is then
  run through `nlp`, and the lemma of every token is kept unless the token is a
  stop word, whitespace, number-like, punctuation or shorter than 2 characters.

  Returns the kept lemmas joined by single spaces.
  '''
  without_entities = html.unescape(sentence)
  without_urls = re.sub(r"http\S+", "", without_entities)

  def is_noise(token):
    # Tokens that carry no useful signal for the recommender.
    return (token.is_stop or token.is_space or token.like_num
            or token.is_punct or len(token) < 2)

  return " ".join(token.lemma_ for token in nlp(without_urls) if not is_noise(token))

Add custom stop words

In [None]:
customize_stop_words = [
    'doi', 'preprint', 'copyright', 'https', 'author', 'figure', 'table',
    'rights', 'reserved', 'permission', 'use', 'biorxiv', 'medrxiv', 
    'license', 'fig', 'fig.', 'al.', 'PMC', 'CZI', 
    'br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \
    "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', \
    'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", \
    'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', \
    'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', \
    'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', \
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', \
    'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', \
    'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', \
    'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', \
    'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", \
    'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", \
    'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', \
    "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', \
    "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", \
    'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"
]
for word in customize_stop_words:
    # The stop flag is set on the exact spelling looked up in the vocabulary.
    # Titles and abstracts are lower-cased before tokenisation, so also flag
    # the lower-case form; otherwise upper-case entries such as 'PMC' would
    # not match any token of the cleaned text.
    for form in {word, word.lower()}:
        nlp.vocab[form].is_stop = True

Apply the nlp model to title and abstract columns.

In [None]:
def clean_df(data):
  '''
  Add `cleaned_title` and `cleaned_abstract` columns to `data` in place by
  running clean_text over the `title` and `abstract` columns (with a progress
  bar via `progress_apply`). Nothing is returned.
  '''
  for column in ('title', 'abstract'):
    data[f'cleaned_{column}'] = data[column].progress_apply(clean_text)
clean_df(df)

Convert the publish time to datetime object, and categorize the time into year and month separately.

In [None]:
df['publish_time'] = pd.to_datetime(df['publish_time'])
def categorize_time(data):
  '''
  Split the datetime column `publish_time` of `data` into two new columns,
  `publish_year` and `publish_month`. `data` is modified in place and nothing
  is returned.
  '''
  timestamps = data['publish_time'].dt
  for part in ('year', 'month'):
    data[f'publish_{part}'] = getattr(timestamps, part)
categorize_time(df)

Save the preprocessed dataframe

In [None]:
# index=False: the row index carries no information here, and writing it would
# come back as a spurious "Unnamed: 0" column when the file is read again.
df.to_csv(base_path + "preproced_metadata.csv", index=False)

# Data visualization

In [None]:
proced_df = pd.read_csv(base_path + "preproced_metadata.csv", low_memory=False)

In [None]:
proced_df = proced_df.dropna()

In [None]:
proced_df.head()

## publish year distribution

As we can see, most of the articles were published in 2020, so filtering the articles by year alone would not be very helpful. Therefore, we add the month to the filtering condition to help users filter the articles by publish time more precisely.

In [None]:
proced_df['publish_year'].plot.hist()
plt.xlabel('Publish year')  
plt.ylabel('Counts')
plt.title('Publish year distribution')
plt.show()

## Plot word frequency for abstract

In [None]:
import nltk
nltk.download('punkt')

In [None]:
def get_word_freq(data, top_N=20):
  '''
  Count word occurrences over a Series of strings.

  All strings are joined with spaces and tokenized with nltk; tokens of 2
  characters or fewer are ignored.

  Returns a tuple (words_freq, words_dist):
    words_freq: DataFrame with columns 'Word' and 'Frequency' holding the top_N
                most common words
    words_dist: the nltk.FreqDist over all kept tokens
  '''
  corpus = data.str.cat(sep=' ')
  tokens = nltk.tokenize.word_tokenize(corpus)
  words_dist = nltk.FreqDist(token for token in tokens if len(token) > 2)
  top_words = words_dist.most_common(top_N)
  return pd.DataFrame(top_words, columns=['Word', 'Frequency']), words_dist

In [None]:
from wordcloud import WordCloud
def draw_word_cloud(word_dist, title):
  '''
  Draw a word cloud from a word -> frequency mapping.

  word_dist: mapping of words to their frequencies, passed to
             WordCloud.generate_from_frequencies
  title(str): text shown as the title of the figure
  '''
  wordcloud = WordCloud(width=800, height=500, max_font_size=110).generate_from_frequencies(word_dist)
  plt.figure(figsize=(10, 10))
  plt.imshow(wordcloud, interpolation="bilinear")
  # Use the caller's title; it used to be hard-coded, so the abstract cloud was
  # labelled as the title cloud.
  plt.title(title)
  plt.axis('off')
  plt.show()

In [None]:
abstract_words_freq, abstract_words_dist = get_word_freq(proced_df['cleaned_abstract'], top_N=20)

In [None]:
abstract_words_freq

In [None]:
abstract_words_freq.plot(x='Word', y='Frequency', kind='bar')

In [None]:
draw_word_cloud(abstract_words_dist, "Abstract word cloud")

## Plot word frequency for title

In [None]:
title_words_freq, title_words_dist = get_word_freq(proced_df['cleaned_title'], top_N=20)

In [None]:
title_words_freq

In [None]:
title_words_freq.plot(x='Word', y='Frequency', kind='bar')

In [None]:
draw_word_cloud(title_words_dist, "Title word cloud")

## Data visualization summary

By visualizing the data, I find that words directly related to COVID-19 appear very frequently in both titles and abstracts. Therefore, if we want to make distinctions and calculate meaningful distances between articles, we have to ignore the words that appear most frequently in the titles and abstracts.

# Model selection and fitting to the data

Since there is no easy way to evaluate an unsupervised model, I only hold out a small subset of the data as a test set to check the functionality of the model.

In [None]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(proced_df, test_size=100, random_state=42)

In [None]:
print(f"Training set shape: {train_df.shape}")
print(f"Testing set shape: {test_df.shape}")

In [None]:
from sklearn.base import TransformerMixin
from sklearn.neighbors import NearestNeighbors
class NNWrapper(TransformerMixin):
  '''
  Nearest Neighbour wrapper for pipeline.

  Holds a NearestNeighbors model so that it can be used as the final step of a
  pipeline: `fit` fits the model and `predict` returns the neighbours of the
  query rows.
  '''
  def __init__(self, knn_args):
    # knn_args: dict of keyword arguments forwarded to NearestNeighbors.
    self.knn = NearestNeighbors(**knn_args)

  def fit(self, X, y=None):
    '''
    Fit the wrapped model on X (y is ignored).
    Returns self, following the estimator convention, so calls can be chained.
    '''
    self.knn.fit(X)
    return self

  def predict(self, X, **predict_params):
    '''
    Return the result of NearestNeighbors.kneighbors for X.

    Optional keyword arguments:
      n_neighbors: number of neighbours to return; when omitted, None is passed
                   so that the model uses the value it was constructed with
      return_distance: whether distances are returned as well (default True)
    '''
    n_neighbors = predict_params.get('n_neighbors')
    return_distance = predict_params.get('return_distance', True)
    return self.knn.kneighbors(X, n_neighbors, return_distance)

I use a simple pipeline to train my model, following the steps below:


1.   I use a column transformer to process the title and the abstract independently, and I choose TF-IDF as my vectorizer since, in my experience, it usually performs better than raw word frequency.
2.   The vectorized data is then fed into the kNN model for training.



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


def train(train_df, title_args, abstract_args, knn_args):
  '''
  Build and fit the recommendation pipeline.

  train_df: DataFrame with `cleaned_title` and `cleaned_abstract` columns
  title_args / abstract_args: keyword arguments for the TfidfVectorizer applied
                              to the title / abstract column
  knn_args: keyword arguments for the nearest-neighbour model

  Returns the fitted Pipeline.
  '''
  # Each text column gets its own TF-IDF vectorizer; all other columns are dropped.
  text_columns = [
      ('title', TfidfVectorizer(**title_args), 'cleaned_title'),
      ('abstract', TfidfVectorizer(**abstract_args), 'cleaned_abstract'),
  ]
  vectorizer = ColumnTransformer(text_columns, remainder='drop', n_jobs=-1)

  steps = [('preprocess', vectorizer), ('model', NNWrapper(knn_args))]
  pipe = Pipeline(steps, verbose=True)
  pipe.fit(train_df)
  return pipe

In the cell below, I specify the arguments for the models in the pipeline.
In particular, note that by setting min_df and max_df, I only keep the terms that appear in at least 10% and at most 90% of the documents. Setting min_df excludes extremely rare terms; the reason for setting max_df is given in the data visualization summary.

In [None]:
MAX_FEATURES_TITLE = 1000
MAX_FEATURES_ABSTRACT = 10000
title_args = {'min_df': 0.1,  'max_df':0.9,'max_features': MAX_FEATURES_TITLE, 'ngram_range': (1, 3)}
abstract_args = {'min_df': 0.1,  'max_df':0.9, 'max_features': MAX_FEATURES_ABSTRACT, 'ngram_range': (1, 4)}
knn_args = {'n_neighbors': 3, 'n_jobs': -1}
model = train(train_df, title_args, abstract_args, knn_args)

In [None]:
def get_kneighbors(model, X, k=3, return_distance=True):
  '''
  Query `model` for the k nearest neighbours of X.
  k and return_distance are forwarded to model.predict as the keyword
  arguments `n_neighbors` and `return_distance`; its result is returned as is.
  '''
  query_params = {'n_neighbors': k, 'return_distance': return_distance}
  return model.predict(X, **query_params)

In [None]:
def recommand(model, paper, database, no_earlier=None, k=3):
  '''
  Recommend the k papers in the database that are most relevant to a single input paper.

  model: fitted pipeline, queried through get_kneighbors
  paper: the paper to find neighbours for
  database: DataFrame the neighbours are taken from; the neighbour indices
            returned by the model are used as row positions in it
  no_earlier(str): optional filter that discards the papers published strictly
                   earlier than the argument. It should be in the format of
                   year-month (e.g. '2020-08')
  k(int): number of neighbours requested from the model

  Returns a copy of the selected rows with an additional 'distance' column.
  '''
  dists, nn = get_kneighbors(model, paper, k)
  recommand_papers = database.iloc[nn[0], :].copy()
  recommand_papers['distance'] = dists[0]
  if not no_earlier:
    return recommand_papers
  # Compare as datetimes rather than raw strings, and inclusively, so that only
  # papers strictly earlier than the bound are dropped. This works whether
  # 'publish_time' holds strings (e.g. after a CSV round trip) or datetimes.
  no_earlier_dt = pd.to_datetime(no_earlier)
  publish_dt = pd.to_datetime(recommand_papers['publish_time'])
  return recommand_papers[publish_dt >= no_earlier_dt]

## Test the model

In [None]:
test_paper = test_df.iloc[[1]]

In [None]:
test_paper

In [None]:
recommand_papers = recommand(model, test_paper, train_df, k=5)

In [None]:
recommand_papers

In [None]:
recommand_papers_filtered = recommand(model, test_paper, train_df, k=5, no_earlier='2020-09')

In [None]:
recommand_papers_filtered

In [None]:
# The identifier column kept during data cleaning is 'cord_uid' ('cord_id' does not exist).
recommand_papers.plot(x='cord_uid', y='distance', kind='bar')

In [None]:
def retrieve_all_recom(model, papers, database, no_earlier=None, k=2):
  '''
  Collect the recommendations for every paper in `papers`.

  model: fitted pipeline passed on to recommand
  papers: DataFrame of query papers; each row is queried on its own
  database: DataFrame the recommendations are taken from
  no_earlier(str): optional filter that discards the recommended papers
                   published strictly earlier than the argument, in the format
                   of year-month (e.g. '2020-08')
  k(int): number of recommendations per query paper

  Returns one DataFrame with all recommendations; the column
  'original_paper_title' holds the title of the paper each row was recommended for.
  '''
  paper_storer = []

  for i in range(papers.shape[0]):
    # Double brackets keep the row as a one-row DataFrame, as recommand expects.
    orig_paper = papers.iloc[[i]]
    recom_papers = recommand(model, orig_paper, database, k=k)
    recom_papers['original_paper_title'] = orig_paper['title'].values[0]
    paper_storer.append(recom_papers)
  all_recommand_papers = pd.concat(paper_storer, axis=0)
  if not no_earlier:
    return all_recommand_papers
  # Compare as datetimes rather than raw strings, and inclusively, so that only
  # papers strictly earlier than the bound are dropped.
  no_earlier_dt = pd.to_datetime(no_earlier)
  publish_dt = pd.to_datetime(all_recommand_papers['publish_time'])
  return all_recommand_papers[publish_dt >= no_earlier_dt]

In [None]:
all_recommand_papers = retrieve_all_recom(model, test_df, train_df, k=2)

In [None]:
all_recommand_papers