In [1]:
import os
import re
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import json
import time
import warnings 
warnings.filterwarnings('ignore')
import datetime

In [12]:
df = pd.read_csv('/Users/xinyifang/Desktop/Kaggle/biorxiv_clean.csv')
print(df.shape) #803 papers

(803, 9)


In [11]:
df.head(5)['abstract'][0]

'Abstract\n\nViruses possessing class I fusion proteins require proteolytic activation by host cell proteases to mediate 18 fusion with the host cell membrane. The mammalian SPINT2 gene encodes a protease inhibitor that 19 targets trypsin-like serine proteases. Here we show the protease inhibitor, SPINT2, restricts cleavage-20 activation efficiently for a range of influenza viruses and for human metapneumovirus (HMPV). SPINT2 21 treatment resulted in the cleavage and fusion inhibition of full-length influenza A/CA/04/09 (H1N1) HA, 22\n\n'

In [14]:
df.info() #abstract: 707 non-null, 100 null

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 803 entries, 0 to 802
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   paper_id          803 non-null    object
 1   title             768 non-null    object
 2   authors           784 non-null    object
 3   affiliations      784 non-null    object
 4   abstract          707 non-null    object
 5   text              803 non-null    object
 6   bibliography      803 non-null    object
 7   raw_authors       803 non-null    object
 8   raw_bibliography  803 non-null    object
dtypes: object(9)
memory usage: 56.6+ KB


In [15]:
col='abstract'
keep = df.dropna(subset=[col])
print(keep.shape)
docs = keep[col].tolist() #the list of abstracts, no null value

(707, 9)


In [17]:
docs[0]

'Abstract\n\nViruses possessing class I fusion proteins require proteolytic activation by host cell proteases to mediate 18 fusion with the host cell membrane. The mammalian SPINT2 gene encodes a protease inhibitor that 19 targets trypsin-like serine proteases. Here we show the protease inhibitor, SPINT2, restricts cleavage-20 activation efficiently for a range of influenza viruses and for human metapneumovirus (HMPV). SPINT2 21 treatment resulted in the cleavage and fusion inhibition of full-length influenza A/CA/04/09 (H1N1) HA, 22\n\n'

In [18]:
### Tokenize the documents.

In [20]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.corpora import Dictionary

In [21]:
tokenizer = RegexpTokenizer(r'\w+') # what does this mean?
for idx in range(len(docs)):
    # Convert to lowercase.
    docs[idx] = docs[idx].lower()  
    # Split into words.
    docs[idx] = tokenizer.tokenize(docs[idx]) #list of list

In [23]:
print(docs[0]) #each abstract becomes a list of strings. all abstract is a larger list

['abstract', 'viruses', 'possessing', 'class', 'i', 'fusion', 'proteins', 'require', 'proteolytic', 'activation', 'by', 'host', 'cell', 'proteases', 'to', 'mediate', '18', 'fusion', 'with', 'the', 'host', 'cell', 'membrane', 'the', 'mammalian', 'spint2', 'gene', 'encodes', 'a', 'protease', 'inhibitor', 'that', '19', 'targets', 'trypsin', 'like', 'serine', 'proteases', 'here', 'we', 'show', 'the', 'protease', 'inhibitor', 'spint2', 'restricts', 'cleavage', '20', 'activation', 'efficiently', 'for', 'a', 'range', 'of', 'influenza', 'viruses', 'and', 'for', 'human', 'metapneumovirus', 'hmpv', 'spint2', '21', 'treatment', 'resulted', 'in', 'the', 'cleavage', 'and', 'fusion', 'inhibition', 'of', 'full', 'length', 'influenza', 'a', 'ca', '04', '09', 'h1n1', 'ha', '22']


In [24]:
# Remove numbers
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

In [25]:
# Remove one-character words
docs = [[token for token in doc if len(token) > 1] for doc in docs]

In [26]:
# Remove stopwords 
stop_words = stopwords.words("english")
docs = [[token for token in doc if token not in stop_words] for doc in docs]

In [27]:
#Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as
#a single item.
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs] #different forms of a word become one

In [28]:
# Create a dictionary representation of the documents
dictionary = Dictionary(docs) #12729 unique tokens: ['abstract', 'activation', 'ca', 'cell', 'class']...

In [43]:
# Filter out words that occur less than 20 documents, or more than 50% of the documents
dictionary.filter_extremes(no_below=10, no_above=0.5)

In [44]:
# Create Bag-of-words representation of the documents
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [49]:
print(corpus[1]) #the second abstract's word occurance with order

[(15, 1), (20, 1), (28, 3), (29, 1), (30, 1), (31, 1), (32, 2), (33, 1), (34, 1), (35, 2), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 2), (43, 1), (44, 1), (45, 1), (46, 9), (47, 1), (48, 1), (49, 1), (50, 2), (51, 1), (52, 1), (53, 1), (54, 2), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 5), (65, 3), (66, 2), (67, 1), (68, 1), (69, 1), (70, 2), (71, 1), (72, 2), (73, 1), (74, 1), (75, 2), (76, 1), (77, 2), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 4), (84, 2), (85, 1), (86, 1), (87, 1), (88, 1), (89, 3), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 2), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 2), (105, 13), (106, 3), (107, 1), (108, 1), (109, 1), (110, 1), (111, 1), (112, 2), (113, 1), (114, 2), (115, 1), (116, 1), (117, 3), (118, 1), (119, 1), (120, 1), (121, 1), (122, 1), (123, 1), (124, 4), (125, 1), (126, 4)]


In [50]:
print('Number of unique tokens: %d' % len(dictionary)) #we only have 1719 now from 12729 original tokens
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 1719
Number of documents: 707


In [51]:
###Train LDA Model

In [52]:
from gensim.models import LdaModel, LdaMulticore

In [62]:
# Set training parameters.
num_topics = 14

In [54]:
# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token #[???]what does this mean???

In [63]:
model = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    chunksize=2000,
    eta='auto',
    iterations=10,
    num_topics=num_topics,
    passes=10,
    eval_every=None,
    workers=4
)

In [64]:
top_topics = model.top_topics(corpus) 
for i, (topic, sc) in enumerate(top_topics): 
    print("\nTopic {}: ".format(i) + ", ".join([w for score,w in topic]))


Topic 0: preprint, doi, license, copyright, reviewed, holder, made, peer, cc, available, author, international, org, human, funder, http, cov, sars, nd, biorxiv

Topic 1: patient, covid, case, clinical, sars, severe, infection, cov, study, disease, pneumonia, symptom, coronavirus, wuhan, respiratory, medrxiv, group, result, preprint, china

Topic 2: case, number, outbreak, epidemic, china, transmission, time, model, estimate, data, wuhan, city, day, control, infected, infection, covid, estimated, spread, disease

Topic 3: model, transmission, disease, population, individual, dynamic, host, number, infection, pathogen, rate, infectious, data, human, selection, system, epidemic, approach, virus, outbreak

Topic 4: specie, virus, viral, host, pathogen, genome, sample, zika, study, sequencing, diversity, human, sequence, based, strain, tool, data, may, one, method

Topic 5: cov, sars, virus, ncov, coronavirus, protein, human, host, viral, ace2, receptor, drug, protease, coronaviruses, bin