# # COVID-19 Literature Clusters

Use subversion in Linux to download just the relevant directory with the XML files from GitHub by replacing tree/master with trunk.

In [None]:
#svn checkout https://github.com/midas-network/COVID-19/trunk/documents/mendeley_library_files/xml_files

In [None]:
from __future__ import unicode_literals
import os
import subprocess
#subprocess.check_call(['python','-m','pip','install','https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz'])
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import OrderedDict
from collections import Counter
from collections import defaultdict
from bs4 import BeautifulSoup as bs
from io import StringIO
import string
from tqdm import tqdm
from xml.etree.ElementTree import iterparse
from xml.parsers.expat import ParserCreate
import xml.etree.cElementTree as et
#Relevant Modeling Libraries
import pyLDAvis
import pyLDAvis.sklearn
import tensorflow as tf
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
import spacy
import en_core_web_sm # model for common English
import en_core_sci_lg # model for biomedical text
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from itertools import groupby 
from operator import itemgetter 

In [None]:
# Read the most recent MIDAS Mendeley paper library XML file.
xfile = "xml_files/mendeley_document_library_2020-03-25.xml"
# Parse the library export with ElementTree, then walk every element
# (the root included) and record its tag name in traversal order.
tree = ET.parse(xfile)
root = tree.getroot()
tags = [node.tag for node in root.iter()]

In [None]:
# Extract tags from xml and get frequency of each tag
class OrderedCounter(Counter, OrderedDict):
    """Counter that remembers the order elements are first seen.

    Counting comes from ``Counter``; key ordering comes from ``OrderedDict``.
    """

    def __repr__(self):
        # Render as ClassName(OrderedDict(...)) so the ordering is visible.
        class_name = self.__class__.__name__
        return '{}({!r})'.format(class_name, OrderedDict(self))

    def __reduce__(self):
        # Pickle/copy support: rebuild from an OrderedDict snapshot of the counts.
        snapshot = OrderedDict(self)
        return self.__class__, (snapshot,)
# Tag frequencies, keyed in the order each tag is first seen in the document.
oc = dict(OrderedCounter(tags))
ocl = list(oc)
# xml to BeautifulSoup(bs) table
with open(xfile, "r", encoding="utf-8") as file:
    content = file.read()
bs_content = bs(content, "lxml")
# One list of element texts per distinct tag, in the same order as `ocl`.
mList = [[element.text for element in bs_content.find_all(tag)] for tag in ocl]
# Number labels: (position, tag) pairs for inspecting which index holds which field.
listn = list(enumerate(ocl))
# Select the abstracts by tag name rather than by a fixed position
# (previously index 33), so a library export with a different tag order
# fails loudly with ValueError instead of silently picking another field.
abt = mList[ocl.index('abstract')]
abt

In [None]:
# Show the (index, tag) pairs so the position of a wanted field can be read off.
listn

In [None]:
# Use listn to find the index of the desired label and add it to the df procedures below
#authos=mList[29]

In [None]:
# Tokenize words and phrases
# import stopwords variable
# Load the project stop-word list; the script is expected to define
# `covid19_stopwords`. The file is opened in a context manager so the handle
# is closed.
# NOTE(review): exec runs whatever covid19_stopwords.py contains — only use
# it with a trusted local file.
with open('covid19_stopwords.py') as stopwords_file:
    exec(stopwords_file.read())
punctuations = string.punctuation
stopwords = covid19_stopwords
# Trim each abstract, lowercase it and split it on whitespace
data = [line.strip() for line in abt]
texts = [[word.lower() for word in text.split()] for text in data]
# remove xml tags
no = {'<p>', '<bold>background</bold>', ':', '</p>'}
# Re-join the remaining words with single spaces. Joining directly (instead
# of str(list) followed by replace("', '", " ")) keeps list brackets and
# quote characters out of the text and also works for words that contain an
# apostrophe.
tss = [" ".join(word for word in words if word not in no) for words in texts]

In [None]:
# Tokenize words and phrases
# Parser
# Load the en_core_sci_lg model (the biomedical-text model imported above)
# and use it as the parsing pipeline for the tokenizer below.
parser = en_core_sci_lg.load()
# Raise the pipeline's max_length setting to 7,000,000.
# NOTE(review): presumably the limit is counted in characters — confirm in the spaCy docs.
parser.max_length = 7000000
def spacy_tokenizer(sentence):
    """Lemmatize ``sentence`` and return the kept terms joined by single spaces.

    Each token from the module-level ``parser`` is replaced by its lowercased,
    stripped lemma, except that tokens whose lemma is "-PRON-" keep their
    lowercased surface form. Terms found in ``stopwords`` or in
    ``punctuations`` are dropped.
    """
    kept_terms = []
    for token in parser(sentence):
        if token.lemma_ != "-PRON-":
            term = token.lemma_.lower().strip()
        else:
            term = token.lower_
        if term in stopwords or term in punctuations:
            continue
        kept_terms.append(term)
    return " ".join(kept_terms)
# Convert to dataframe with word counts
# One row per abstract; the single column is named 'abstract'.
df = pd.DataFrame(tss).rename(columns={0: 'abstract'})
# Number of whitespace-separated words in each abstract.
df['abstract_word_count'] = df['abstract'].apply(lambda text: len(text.split()))
# Register progress_apply on pandas objects, then tokenize every abstract
# with a progress bar.
tqdm.pandas()
df["processed_text"] = df["abstract"].progress_apply(spacy_tokenizer)

In [None]:
# Use tf-idf to convert the text into numeric features that measure how important each word is to one abstract relative to the literature as a whole.
# Vectorize data: cluster based on content of abstracts. 
from sklearn.feature_extraction.text import TfidfVectorizer
def vectorize(text, maxx_features):
    """Fit a TF-IDF vectorizer on ``text`` and return the transformed matrix.

    Args:
        text: iterable of documents (strings).
        maxx_features: value passed to ``TfidfVectorizer(max_features=...)``.
    """
    return TfidfVectorizer(max_features=maxx_features).fit_transform(text)
# TF-IDF matrix of the processed abstracts, limited to 2**12 features.
text = df['processed_text'].values
X = vectorize(text, maxx_features=2 ** 12)
X.shape

In [None]:
# # PCA & Clustering
# Apply Principal Component Analysis (PCA) to vectorized data
from sklearn.decomposition import PCA
# Per scikit-learn, a fractional n_components asks PCA to keep as many
# components as are needed to explain that share (here 95%) of the variance.
pca = PCA(n_components=0.95, random_state=42)
# The TF-IDF matrix is converted to a dense array before being passed to PCA.
X_reduced= pca.fit_transform(X.toarray())
X_reduced.shape

In [None]:
# k-means categorizes each vector by taking mean distance to randomly initialized centroid. 
# Find best k value via distortion at different k values.
# Distortion, as computed below, is the average Euclidean distance from each point to its nearest cluster center.
# When distortion is plotted against k there will be a k value after which decreases in distortion are minimal. This is the desired number of clusters.
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
# run kmeans with many different k
# Distortion for every candidate k: the average Euclidean distance from each
# reduced vector to its nearest cluster center.
distortions = []
K = range(2, 50)
for k in K:
    # Fit once per k; the model used to be fitted twice in a row on the same
    # data with the same seed, doubling the run time for no change in result.
    k_means = KMeans(n_clusters=k, random_state=42).fit(X_reduced)
    nearest_center_distance = np.min(
        cdist(X_reduced, k_means.cluster_centers_, 'euclidean'), axis=1)
    distortions.append(sum(nearest_center_distance) / X.shape[0])
    print('Found distortion for {} clusters'.format(k))

In [None]:
# End points of a straight reference line from the first to the last
# (k, distortion) pair, drawn in red next to the distortion curve.
X_line = [K[0], K[-1]]
Y_line = [distortions[0], distortions[-1]]
# Plot the elbow
plt.plot(K, distortions, 'b-')
plt.plot(X_line, Y_line, 'r')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# Number of clusters used for the final model.
k = 10
kmeans = KMeans(n_clusters=k, random_state=42)
# Fit on the PCA-reduced vectors and keep one cluster label per abstract.
y_pred = kmeans.fit_predict(X_reduced)
# Store the label next to each abstract; later cells select rows by df['y'].
df['y'] = y_pred

In [None]:
# # Dimensionality Reduction with t-SNE
# t-SNE reduces high dimension feature vector to 2 dimensions
# t-Distributed Stochastic Neighbor Embedding (t-SNE) reduces dimensionality while trying to keep similar instances close and dissimilar instances apart.
from sklearn.manifold import TSNE
# t-SNE with perplexity 100 and a fixed seed; verbose=1 prints progress.
tsne = TSNE(verbose=1, perplexity=100, random_state=42)
# The embedding is computed from the dense TF-IDF matrix (not the PCA output).
X_embedded = tsne.fit_transform(X.toarray())

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
# sns settings
# sns settings
sns.set(rc={'figure.figsize':(15,15)})
# plot
# x and y are passed by keyword: current seaborn releases reject positional
# data vectors, and the keyword form also works on older releases.
# No palette is passed because there is no hue variable here, and seaborn
# ignores a palette when no hue is given.
sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1])
plt.title('t-SNE with no Labels')
plt.savefig("t-sne_covid19.png")
plt.show()

In [None]:
get_ipython().run_line_magic('matplotlib', 'inline')
from matplotlib import pyplot as plt
import seaborn as sns
# sns settings
sns.set(rc={'figure.figsize':(15,15)})
# colors: one hue per distinct cluster label, so the palette always matches
# the number of clusters actually present in y_pred.
palette = sns.hls_palette(len(set(y_pred)), l=.4, s=.9)
# plot
# x and y are passed by keyword: current seaborn releases reject positional
# data vectors, and the keyword form also works on older releases.
sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1], hue=y_pred, legend='full', palette=palette)
plt.title('t-SNE with Kmeans Labels')
plt.savefig("improved_cluster_tsne.png")
plt.show()

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
# One independent CountVectorizer per cluster (10 clusters), so each cluster
# gets its own vocabulary. Tokens are runs of three or more letters/hyphens.
# The pattern is a raw string: '\-' in a normal string literal is an invalid
# escape sequence that Python warns about.
vectorizers = [
    CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True,
                    token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
    for _ in range(10)
]
df['processed_text']

In [None]:
# Term-count matrix for every cluster, at the same position as its
# vectorizer; None marks a cluster that could not be vectorized.
vectorized_data = []
for current_cluster, cvec in enumerate(vectorizers):
    # Select the documents outside the try block so that a missing column
    # raises instead of being reported as "not enough instances".
    cluster_docs = df.loc[df['y'] == current_cluster, 'processed_text']
    try:
        cluster_counts = cvec.fit_transform(cluster_docs)
    except ValueError:
        # CountVectorizer signals an unusable document set (for example an
        # empty vocabulary after min_df/max_df pruning) with ValueError.
        print("Not enough instances in cluster: " + str(current_cluster))
        cluster_counts = None
    vectorized_data.append(cluster_counts)

In [None]:
# number of topics per cluster
#NUM_TOPICS_PER_CLUSTER = 20
NUM_TOPICS_PER_CLUSTER = 10
# One unfitted LDA model per cluster (10 clusters), all configured alike.
lda_models = [
    LatentDirichletAllocation(
        n_components=NUM_TOPICS_PER_CLUSTER,
        max_iter=10,
        learning_method='online',
        verbose=False,
        random_state=42,
    )
    for _ in range(0, 10)
]
lda_models[0]

In [None]:
# Fit each cluster's LDA model on that cluster's term counts and keep the
# resulting document-topic output. Clusters without data are skipped.
clusters_lda_data = []
for current_cluster, lda in enumerate(lda_models):
    # print("Current Cluster: " + str(current_cluster))
    cluster_counts = vectorized_data[current_cluster]
    # Identity test: the entries are matrix objects whose comparison
    # operators are overloaded, so `!= None` is not a reliable None check.
    if cluster_counts is not None:
        clusters_lda_data.append(lda.fit_transform(cluster_counts))
clusters_lda_data

In [None]:
def selected_topics(model, vectorizer, top_n=3):
    """Return the distinct top words over all topics of ``model``, strongest first.

    For every row of ``model.components_`` the ``top_n`` highest-weighted
    vocabulary entries are taken. A word is kept only the first time it is
    met (with the weight it has in that topic); the kept words are returned
    ordered by that weight, highest first.

    Args:
        model: fitted topic model exposing ``components_`` (one weight row
            per topic).
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()``
            or, on older scikit-learn releases, ``get_feature_names()``.
        top_n: number of words taken from each topic.

    Returns:
        List of words.
    """
    # Recent scikit-learn releases only provide get_feature_names_out();
    # older ones only get_feature_names(). Accept either.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    # Fetch the vocabulary once instead of once per selected word.
    feature_names = get_names()

    seen_words = set()
    keywords = []
    for topic in model.components_:
        # Indices of the top_n largest weights, largest first.
        for i in topic.argsort()[:-top_n - 1:-1]:
            word = feature_names[i]
            if word not in seen_words:
                seen_words.add(word)
                keywords.append((word, topic[i]))
    # Ascending stable sort followed by reverse() (rather than reverse=True)
    # keeps the ordering of equal weights exactly as before.
    keywords.sort(key=lambda pair: pair[1])
    keywords.reverse()
    return [word for word, _ in keywords]

In [None]:
# Keyword list for every cluster that has term counts, in cluster order.
all_keywords = []
for current_vectorizer, lda in enumerate(lda_models):
    # print("Current Cluster: " + str(current_vectorizer))
    # Identity test: the entries are matrix objects whose comparison
    # operators are overloaded, so `!= None` is not a reliable None check.
    if vectorized_data[current_vectorizer] is not None:
        all_keywords.append(selected_topics(lda, vectorizers[current_vectorizer]))

In [None]:
# Sort processed_text and keywords to match order of y_pred
# lab=list(df['processed_text'])
# yp=list(y_pred)
# ys=sorted(y_pred)
# dft=pd.DataFrame(list(zip(ys,lab,yp)),columns=['ys','abstract','yp'])
# mask = dft.applymap(lambda x: x is None)
# cols = dft.columns[(mask).any()]
# for col in dft[cols]:
#     dft.loc[mask[col], col] = ''
# dft['abstract']=dft['abstract'].str.replace('abstract','')
# dft['abstract']=dft['abstract'].str.replace('background','')
# dtf=dft

# # Sorter1
# dtf=dtf.sort_values(by='yp')

# dfk=dtf
# dfk=pd.DataFrame(list(zip(dtf['ys'],dtf['abstract'])),columns=['ys','abstract'])

# kc=all_keywords
# k2=[i for j in kc for i in j]
# gs=[]
# ali=list(dfk['abstract'])
# ix=list(dfk['ys'])
# dic1=dict(zip(ix,ali))
# for i,j in dic1.items():
#     for k in k2:
#         if k in ali:
#             gs.append([i,j,k])
            
# ali0=[i for i in k2 if i in ali[0]]
# ali1=[i for i in k2 if i in ali[1]]
# ali2=[i for i in k2 if i in ali[2]]
# ali3=[i for i in k2 if i in ali[3]]
# ali4=[i for i in k2 if i in ali[4]]
# ali5=[i for i in k2 if i in ali[5]]
# ali6=[i for i in k2 if i in ali[6]]
# ali7=[i for i in k2 if i in ali[7]]
# ali8=[i for i in k2 if i in ali[8]]
# ali9=[i for i in k2 if i in ali[9]]

# alis=list(zip([ali0,ali1,ali2,ali3,ali4,ali5,ali6,ali7,ali8,ali9]))
# alis=str(alis)

In [None]:
# Write exactly one line per cluster, in cluster order, so that line i of
# topics.txt always describes cluster i.
# all_keywords holds an entry only for clusters that have term counts, so it
# is consumed in step with the non-empty entries of vectorized_data rather
# than indexed by its own position (which drifts after a skipped cluster).
remaining_keywords = iter(all_keywords)
with open('topics.txt', 'w') as f:
    for cluster_counts in vectorized_data:
        if cluster_counts is not None:
            f.write(', '.join(next(remaining_keywords)) + "\n")
        else:
            f.write("Not enough instances to be determined. \n")

In [None]:
import pickle
# Each file is opened in a context manager so it is flushed and closed as
# soon as the dump finishes.
# save the COVID-19 DataFrame, too large for github
with open("df_covid.p", "wb") as out_file:
    pickle.dump(df, out_file)
# save the final t-SNE
with open("X_embedded.p", "wb") as out_file:
    pickle.dump(X_embedded, out_file)
# save the cluster labels generated with k-means
with open("y_pred.p", "wb") as out_file:
    pickle.dump(y_pred, out_file)

Classify
Use supervised learning to see how well the clustering generalizes

In [None]:
def classification_report(model_name, test, pred):
    """Print accuracy and macro-averaged precision, recall and F1 as percentages.

    Args:
        model_name: heading printed before the scores.
        test: true labels.
        pred: predicted labels.
    """
    from sklearn.metrics import accuracy_score, f1_score
    from sklearn.metrics import precision_score, recall_score

    def as_percent(score):
        # Scale to percent and format with three decimals.
        return '{:,.3f}'.format(float(score) * 100)

    print(model_name, ":\n")
    print("Accuracy Score: ", as_percent(accuracy_score(test, pred)), "%")
    print("     Precision: ", as_percent(precision_score(test, pred, average='macro')), "%")
    print("        Recall: ", as_percent(recall_score(test, pred, average='macro')), "%")
    print("      F1 score: ", as_percent(f1_score(test, pred, average='macro')), "%")

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set, with a fixed seed (42).
# Features are the dense TF-IDF matrix; targets are the k-means labels.
X_train, X_test, y_train, y_test = train_test_split(X.toarray(),y_pred, test_size=0.2, random_state=42)
print("X_train size:", len(X_train))
print("X_test size:", len(X_test), "\n")

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import SGDClassifier
# SGD instance
sgd_clf = SGDClassifier(max_iter=10000, tol=1e-3, random_state=42, n_jobs=4)
# train SGD on the training split
sgd_clf.fit(X_train, y_train)
# 3-fold cross-validated predictions over the training split
sgd_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3, n_jobs=4)
# print out the classification report for those predictions
classification_report("Stochastic Gradient Descent Report (Training Set)", y_train, sgd_pred)

**Precision** is the ratio of True Positives to True Positives + False Positives. **Recall** (also known as the true positive rate, TPR) is the ratio of True Positives to True Positives + False Negatives; it measures the share of positive instances that are correctly detected by the classifier.
**F1 score** is the harmonic mean of precision and recall. The F1 score is high only if both precision and recall are high.

In [None]:
# test for overfitting
# Score the classifier that was fitted on the training split against the
# held-out test split. Cross-validating on X_test would train fresh copies
# of the model on test data and say nothing about how the trained model
# generalizes.
sgd_pred = sgd_clf.predict(X_test)
# print out the classification report (labelled as the test set)
classification_report("Stochastic Gradient Descent Report (Test Set)", y_test, sgd_pred)

In [None]:
# See how model generalizes across whole dataset:
# 10-fold cross-validation scores over all rows and their k-means labels.
sgd_cv_score = cross_val_score(sgd_clf, X.toarray(), y_pred, cv=10)
# Report the mean fold score as a percentage.
print("Mean cv Score - SGD: {:,.3f}".format(float(sgd_cv_score.mean()) * 100), "%")

In [None]:
from bokeh.models import CustomJS
# handle the currently selected article
def selected_code():
    """Return the JavaScript run when a point on the plot is tapped.

    The script expects ``source`` (the plot's data source) and
    ``current_selection`` (the element whose ``text`` is replaced) among the
    callback arguments. It shows the title, authors and DOI link of the first
    selected point.

    Compared with the earlier script, it does nothing when the selection is
    empty, treats a column that is missing from the source as empty text
    instead of throwing, and declares every variable it uses.
    """
    code = """
            var indices = cb_data.source.selected.indices;
            if (indices.length > 0) {
                var index = indices[0];
                // Text of one column for the selected point; '' when the
                // column or the value is missing.
                var field = function (name) {
                    var column = source.data[name];
                    if (column === undefined || column[index] === undefined || column[index] === null) {
                        return '';
                    }
                    return column[index].toString();
                };
                var title = "<h4>" + field('titles').replace(/<br>/g, ' ') + "</h4>";
                var authors = "<p1><b>Authors:</b> " + field('authors').replace(/<br>/g, ' ') + "<br>";
                var doi = field('links');
                var link = "<b>Link:</b> <a href='" + "http://doi.org/" + doi + "'>" + "http://doi.org/" + doi + "</a></p1>";
                current_selection.text = title + authors + link;
                current_selection.change.emit();
            }
    """
    return code

# handle the keywords and search
def input_callback(plot, source, out_text, topics):
    """Build the CustomJS callback shared by the cluster slider and the search box.

    The returned callback expects two more entries to be added to its
    ``args`` by the caller: ``text`` (the search input) and ``slider`` (the
    cluster slider).

    Behaviour of the script:
      * When the slider is at its last position (``slider.end``), every
        cluster is shown; otherwise only points whose ``desc`` equals the
        slider value are shown and that cluster's keywords are displayed.
        The previous script compared against the fixed value '20', which
        never matches a slider that ends at another value.
      * A non-empty search key further restricts the visible points to those
        where one of the text columns contains the key. Only columns that
        actually exist in the source are searched, so a source without, say,
        ``titles`` no longer makes the script throw.
      * Hidden points get undefined coordinates; visible ones are restored
        from ``x_backup`` / ``y_backup``.

    Args:
        plot: the figure, passed to the script as ``p``.
        source: the data source holding x, y, x_backup, y_backup and desc.
        out_text: element whose ``text`` receives the keyword line.
        topics: list with one keyword string per cluster.

    Returns:
        The CustomJS callback.
    """
    # slider call back for cluster selection
    callback = CustomJS(args=dict(p=plot, source=source, out_text=out_text, topics=topics), code="""
                var key = text.value.toLowerCase();
                var cluster = Number(slider.value);
                var data = source.data;
                var x = data['x'];
                var y = data['y'];
                var x_backup = data['x_backup'];
                var y_backup = data['y_backup'];
                var labels = data['desc'];

                // Search whichever of the known text columns the source has.
                var columns = ['abstract', 'abstracts', 'titles', 'authors', 'journal']
                    .map(function (name) { return data[name]; })
                    .filter(function (column) { return column !== undefined; });

                // The slider's last position means "all clusters".
                var show_all = cluster >= slider.end;
                if (show_all) {
                    out_text.text = 'Keywords: Slide to specific cluster to see the keywords.';
                } else {
                    out_text.text = 'Keywords: ' + topics[cluster];
                }

                for (var i = 0; i < x.length; i++) {
                    var visible = show_all || labels[i] == cluster;
                    if (visible && key !== '') {
                        visible = columns.some(function (column) {
                            return String(column[i]).includes(key);
                        });
                    }
                    if (visible) {
                        x[i] = x_backup[i];
                        y[i] = y_backup[i];
                    } else {
                        x[i] = undefined;
                        y[i] = undefined;
                    }
                }
                source.change.emit();
            """)
    return callback

In [None]:
# Plot data
# required libraries for plot
#subprocess.check_call(['python','-m','pip','install','call_backs'])
#from call_backs import input_callback, selected_code  # file with customJS callbacks for bokeh
# github.com/MaksimEkin/COVID19-Literature-Clustering/blob/master/lib/call_backs.py
import bokeh
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper, CustomJS, Slider, TapTool, TextInput
from bokeh.palettes import Category20
from bokeh.transform import linear_cmap, transform
from bokeh.io import output_file, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import RadioButtonGroup, TextInput, Div, Paragraph
from bokeh.layouts import column, widgetbox, row, layout
from bokeh.layouts import column
from bokeh.plotting import figure
from bokeh.resources import CDN
from bokeh.embed import file_html

In [None]:
import os

# Read the keyword lines back, one list entry per line of topics.txt
# (line endings are kept).
topic_path = 'topics.txt'
with open(topic_path) as topic_file:
    topics = list(topic_file)

In [None]:
# Render Bokeh output inline in the notebook.
output_notebook()
# target labels
y_labels = y_pred

# data sources
# x/y are the t-SNE coordinates; x_backup/y_backup hold the same coordinates
# a second time. desc is the cluster label of each point, abstracts the
# abstract text, and labels a "C-<label>" string used for the legend.
source = ColumnDataSource(data=dict(
    x= X_embedded[:,0], 
    y= X_embedded[:,1],
    x_backup = X_embedded[:,0],
    y_backup = X_embedded[:,1],
    desc= y_labels, 
    abstracts= df['abstract'],
    labels = ["C-" + str(x) for x in y_labels],
    ))

# hover over information: show the abstract of the point under the mouse
hover = HoverTool(tooltips=[
    ("Abstract", "@abstracts{safe}"),
],
point_policy="follow_mouse")

# map colors: cluster label -> colour, scaled between the smallest and
# largest label over the 20-colour Category20 palette
mapper = linear_cmap(field_name='desc', 
                     palette=Category20[20],
                     low=min(y_labels) ,high=max(y_labels))

# prepare the figure
plot = figure(plot_width=1200, plot_height=850, 
           tools=[hover, 'pan', 'wheel_zoom', 'box_zoom', 'reset', 'save', 'tap'], 
           title="Clustering of the COVID-19 Literature with t-SNE and K-Means", 
           toolbar_location="above")

# plot settings: one marker per abstract, coloured by cluster, with the
# legend keyed on the 'labels' column
plot.scatter('x', 'y', size=5, 
          source=source,
          fill_color=mapper,
          line_alpha=0.3,
          line_color="black",
          legend = 'labels')
plot.legend.background_fill_alpha = 0.6

In [None]:
# Keywords
# Keywords banner and the shared slider/search callback
text_banner = Paragraph(text= 'Keywords: Slide to specific cluster to see the keywords.', height=45)
input_callback_1 = input_callback(plot, source, text_banner, topics)

# currently selected article
div_curr = Div(text="""Article points closer to center of cluster better represent the group (As configured by LDA) .""",height=150)
callback_selected = CustomJS(args=dict(source=source, current_selection=div_curr), code=selected_code())
taptool = plot.select(type=TapTool)
taptool.callback = callback_selected

# WIDGETS
# The callback is attached with js_on_change('value', ...) instead of the
# `callback=` constructor argument: js_on_change is available on old and new
# Bokeh releases alike, whereas `callback=` was removed from the widgets.
slider = Slider(start=0, end=10, value=10, step=1, title="Cluster #")
slider.js_on_change('value', input_callback_1)
keyword = TextInput(title="Search:")
keyword.js_on_change('value', input_callback_1)

# pass call back arguments: the script reads the widgets as `text` and `slider`
input_callback_1.args["text"] = keyword
input_callback_1.args["slider"] = slider

In [None]:
# STYLE
# Slider: stretch to the available width.
slider.sizing_mode = "stretch_width"
slider.margin=15

# Search box.
keyword.sizing_mode = "scale_both"
keyword.margin=15

# Selected-article panel: red text.
div_curr.style={'color': '#BF0A30', 'font-family': 'Helvetica Neue, Helvetica, Arial, sans-serif;', 'font-size': '1.1em'}
div_curr.sizing_mode = "scale_both"
div_curr.margin = 20

# Keyword banner: blue text, stretched to the available width.
text_banner.style={'color': '#0269A4', 'font-family': 'Helvetica Neue, Helvetica, Arial, sans-serif;', 'font-size': '1.1em'}
#text_banner.sizing_mode = "scale_both"
text_banner.sizing_mode = "stretch_width"
text_banner.margin = 20

# The figure itself.
plot.sizing_mode = "scale_both"
plot.margin = 5

# Row holding the article panel next to the keyword banner.
# NOTE(review): `r` does not appear in the layout built further down — confirm whether it is still needed.
r = row(div_curr,text_banner)
r.sizing_mode = "stretch_width"
#r.sizing_mode = "stretch_both"

In [65]:
# LAYOUT OF THE PAGE
# Page rows from top to bottom: controls, keyword banner, selected-article
# panel, then the plot.
l = layout([
    [slider, keyword],
    [text_banner],
    [div_curr],
    [plot],
])
l.sizing_mode = "scale_both"
#l.sizing_mode = "stretch_both"

# show: direct output to a standalone HTML file, then display the layout
output_file('t-sne_covid-19_interactive.html')
show(l)