# Setting Up

In [None]:
# update your credentials if needed
!git config --global user.email "abduallahw10@gmail.com"
!git config --global user.name "Abdullah Al-Hayali"

In [None]:
# check that we're in the right repo, branch and that we are caught up
!git status

# Data Loading

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import json

import matplotlib.pyplot as plt

In [None]:
# Loading metadata

# Raw string: the Windows path contains backslashes ("\M", "\C", "\D") that a normal
# string literal would try to interpret as escape sequences (and warn about).
root_path = r"C:\My files\Courses\CIS6050\Data"
metadata_path = f'{root_path}/metadata.csv'
# Force the identifier columns to be read as strings.
meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str, 
    'doi': str
})
meta_df.head()

In [None]:
# Entries for each column in the DF

meta_df.info()

## Fetch JSON Files

In [None]:
all_json = glob.glob(f'{root_path}/**/*.json', recursive=True)
len(all_json)

In [None]:
all_json[:5]

In [None]:
all_json[0]

In [None]:
# File reader class

class FileReader:
    """Parsed view of a single paper JSON file.

    Attributes set by the constructor:
        paper_id: value of the top-level 'paper_id' key.
        author_info: flat list [first, last, first, last, ...] built from
            content['metadata']['authors'].
        abstract: the 'text' of every entry in content['abstract'], joined with newlines.
        body_text: the 'text' of every entry in content['body_text'], joined with newlines.

    A missing key raises KeyError from the constructor.
    """

    def __init__(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as handle:
            paper = json.load(handle)
            self.paper_id = paper['paper_id']
            # Each author contributes two consecutive items: first name, then last name.
            self.author_info = [
                author[part]
                for author in paper['metadata']['authors']
                for part in ('first', 'last')
            ]
            # Paragraph texts are collapsed into one newline-separated string each.
            self.abstract = '\n'.join(section['text'] for section in paper['abstract'])
            self.body_text = '\n'.join(section['text'] for section in paper['body_text'])

    def __repr__(self):
        # Only the first 50 characters of the abstract and body are shown.
        abstract_preview = self.abstract[:50]
        body_preview = self.body_text[:50]
        return f'AUTHOR: {self.author_info}; PAPER ID: {self.paper_id}; ABSTRACT: {abstract_preview}; BODY TEXT: {body_preview}'

first_row = FileReader(all_json[2])
print(first_row)

In [None]:
# Filter non-complying JSONS

from tqdm import tqdm

all_json_clean = []

# Iterate the list itself so tqdm knows the total and can draw a real progress bar
# (wrapping enumerate() hides the length from it).
for json_path in tqdm(all_json):
    try:
        paper = FileReader(json_path)
    except Exception:
        continue  # invalid paper format, skip

    # Papers whose abstract is empty are dropped as well.
    if len(paper.abstract) == 0:
        continue

    all_json_clean.append(json_path)

# From here on all_json only contains the paths that passed both checks.
all_json = all_json_clean
len(all_json)

In [None]:
from tqdm import tqdm

dict_ = {'paper_id': [], 'doi': [], 'abstract': [], 'authors': [],
         'title': [], 'journal': []}

# Index the metadata by sha once instead of scanning the whole frame for every paper.
# Rows without a sha can never equal a paper_id; for a duplicated sha the first row is
# kept, which is the row a `.values[0]` lookup on the filtered frame would return.
meta_by_sha = (
    meta_df.dropna(subset=['sha'])
           .drop_duplicates(subset='sha')
           .set_index('sha')
)

for entry in tqdm(all_json):
    try:
        content = FileReader(entry)
    except Exception:
        continue  # invalid paper format, skip

    # no metadata, skip this paper
    if content.paper_id not in meta_by_sha.index:
        continue
    if len(content.body_text) == 0:
        continue

    # get metadata information
    meta_row = meta_by_sha.loc[content.paper_id]

    dict_['abstract'].append(content.abstract)
    dict_['paper_id'].append(content.paper_id)

    # A string holds ';'-separated author names and is re-joined with ". ".
    # Anything else (e.g. a missing value) is stored unchanged.
    authors = meta_row['authors']
    if isinstance(authors, str):
        authors = ". ".join(authors.split(';'))
    dict_['authors'].append(authors)

    # add title
    dict_['title'].append(meta_row['title'])

    # add the journal information
    dict_['journal'].append(meta_row['journal'])

    # add doi
    dict_['doi'].append(meta_row['doi'])

df_covid = pd.DataFrame(dict_, columns=['paper_id', 'doi', 'abstract', 'authors',
                                        'title', 'journal'])
df_covid.head()

In [None]:
# Raw string so the backslashes in the Windows path are taken literally.
df_covid = pd.read_csv(r"C:\My files\Courses\CIS6050\mod_df.csv")

In [None]:
df_covid.info()

In [None]:
df_covid.dropna(inplace=True)

In [None]:
df_covid.info()

### NOTE:

The data isn't only English! Here's the breakdown (https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge/discussion/139146):

('en', 28575),
('fr', 323),
('es', 281),
('de', 54),
('it', 19),

In [None]:
from langdetect import detect
from langdetect import DetectorFactory
from tqdm import tqdm

In [None]:
# set seed
DetectorFactory.seed = 0


def detect_language(text):
    """Best-effort language label for one abstract.

    Attempts, in order:
      1. the first 50 space-separated words,
      2. the distinct words of the whole text,
      3. the raw text itself.
    The first attempt for which detect() does not raise wins; if all of them raise,
    the label is "unknown".
    """
    words = text.split(" ")
    candidates = (
        " ".join(words[:50]),
        # dict.fromkeys() removes duplicates while keeping first-seen order, so this
        # fallback input is identical on every run (a plain set has no stable order).
        " ".join(dict.fromkeys(words)),
        text,
    )
    for candidate in candidates:
        try:
            return detect(candidate)
        except Exception:
            # the candidate was not in a usable format; try the next one
            continue
    return "unknown"


# hold label - language, one entry per row of df_covid, in row order
languages = [detect_language(abstract) for abstract in tqdm(df_covid['abstract'])]

In [None]:
from collections import Counter
from pprint import pprint

# One pass over the labels instead of one list.count() scan per distinct language.
languages_dict = dict(Counter(languages))

print("Total: {}\n".format(len(languages)))
pprint(languages_dict)

In [None]:
df_covid['language'] = languages
df_covid = df_covid[df_covid['language'] == 'en'] 
df_covid.info()

# In case of reloading, START HERE and load clean_df.csv

In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_sci_lg

In [None]:
# Raw string so the backslashes in the Windows path are taken literally.
df_covid = pd.read_csv(r"C:\My files\Courses\CIS6050\clean_df.csv")

In [None]:
df_covid.head()

When doing topic modelling, the word embeddings are critical

In [None]:
# Remove noisy data
stopwords = list(STOP_WORDS)
stopwords[:10]

You can increase accuracy by adding custom stopwords but we'll skip them for now.

In [None]:
# Text parsing: lemmatise, lowercase, and remove stopwords
# en_core_sci_lg will be used to process biomedical text
# https://allenai.github.io/scispacy/
#https://github.com/allenai/scispacy SciBERT could be a future suggestion


parser = en_core_sci_lg.load(disable=["tagger", "ner"]) 
parser.max_length = 7000000

def spacy_tokenizer(sentence):
    """Return `sentence` as a space-joined string of lowercased lemmas without stopwords.

    Each token from `parser` is replaced by its stripped, lowercased lemma. Tokens whose
    lemma is the "-PRON-" placeholder keep their lowercased surface form instead.
    Lemmas found in the module-level `stopwords` are dropped. Punctuation tokens are
    not filtered explicitly.
    """
    # Set lookup instead of scanning the stopword list once per token.
    stopword_lookup = set(stopwords)
    lemmas = (
        token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_
        for token in parser(sentence)
    )
    return " ".join(lemma for lemma in lemmas if lemma not in stopword_lookup)

In [None]:
tqdm.pandas()
df_covid["processed_text"] = df_covid["abstract"].progress_apply(spacy_tokenizer)

In [None]:
df_covid.head()

In [None]:
# tf-idf will be used to convert the string into a measure of importance

from sklearn.feature_extraction.text import TfidfVectorizer
def vectorize(text, maxx_features):
    """Fit a TF-IDF vectorizer on `text` and return the transformed matrix.

    text: the documents to fit and transform.
    maxx_features: forwarded to TfidfVectorizer as `max_features`.
    """
    return TfidfVectorizer(max_features=maxx_features).fit_transform(text)

In [None]:
# Vectorize the data

text = df_covid['processed_text'].values
#not sure how this works. Got it from the discussions
max_words = 2**10 #max number of features/words of interest

X = vectorize(text, max_words)

# Dimensionality reduction

In [None]:
#TODO: Try without PCA

Let's see how much we can reduce while we keep 95% variance

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95, random_state=42)
X_reduced= pca.fit_transform(X.toarray())
X_reduced.shape

In [None]:
# k means clustering
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans

In [None]:
# elbow method
# https://www.geeksforgeeks.org/elbow-method-for-optimal-value-of-k-in-kmeans/

from sklearn import metrics
from scipy.spatial.distance import cdist

# run kmeans with many different k
distortions = []
K = range(2, 21)
for k in K:
    # .fit() already returns the fitted estimator; fitting a second time only repeated
    # the same seeded computation.
    k_means = KMeans(n_clusters=k, random_state=42).fit(X_reduced)
    # Distortion = average Euclidean distance from each sample to its nearest centroid.
    nearest_centroid_dist = np.min(cdist(X_reduced, k_means.cluster_centers_, 'euclidean'), axis=1)
    distortions.append(sum(nearest_centroid_dist) / X_reduced.shape[0])

In [None]:
X_line = [K[0], K[-1]]
Y_line = [distortions[0], distortions[-1]]

# Plot the elbow
plt.plot(K, distortions, 'b-')
plt.plot(X_line, Y_line, 'r')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
k = 5
kmeans = KMeans(n_clusters=k, random_state=42)
y_pred = kmeans.fit_predict(X_reduced)
df_covid['y_pred_k5'] = y_pred

In [None]:
k = 19
kmeans = KMeans(n_clusters=k, random_state=42)
y_pred = kmeans.fit_predict(X_reduced)
df_covid['y_pred_k19'] = y_pred

In [None]:
df_covid.head()

In [None]:
# Drop the 'y_pred' column if it is present. `columns=` replaces the positional axis
# argument (no longer accepted by recent pandas), and errors='ignore' keeps the cell
# re-runnable when the column does not exist.
df_covid = df_covid.drop(columns=['y_pred'], errors='ignore')

In [None]:
df_covid.head()

In [None]:
df_covid.to_csv("df_clean.csv", index=False)

# LDA

## RUN df_clean.csv here, you can ignore everything beforehand

In [None]:
df_covid.head()

In [None]:
df_covid['processed_text']

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

create vectorizers for the k clusters

In [None]:
# One CountVectorizer per slot; later cells pair slot i with cluster label i.
# The token pattern is a raw string so "\-" reaches the regex engine unchanged instead
# of being parsed as an (invalid) Python string escape.
vectorizers = [
    CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True,
                    token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
    for _ in range(0, 20)
]

In [None]:
vectorizers[0]

vectorize the data from each cluster

In [None]:
# Fit each vectorizer on the processed text of the cluster that shares its index

vectorized_data = []

for cluster_id, cluster_vectorizer in enumerate(vectorizers):
    try:
        in_cluster = df_covid['y_pred_k19'] == cluster_id
        cluster_matrix = cluster_vectorizer.fit_transform(df_covid.loc[in_cluster, 'processed_text'])
    except Exception as e:
        # Any failure is reported as a too-small cluster; None marks the slot as unusable.
        print("Not enough instances in cluster: " + str(cluster_id))
        cluster_matrix = None
    vectorized_data.append(cluster_matrix)

In [None]:
len(vectorized_data)

In [None]:
# number of topics per cluster, this was RANDOMLY selected
NUM_TOPICS_PER_CLUSTER = 20


lda_models = []

for ii in range(0, 20):
    # Latent Dirichlet Allocation Model
    lda = LatentDirichletAllocation(n_components=NUM_TOPICS_PER_CLUSTER, max_iter=10, learning_method='online',verbose=False, random_state=42)
    lda_models.append(lda)
    
lda_models[0]

In [None]:
clusters_lda_data = []

for current_cluster, lda in enumerate(lda_models):
    print("Current Cluster: " + str(current_cluster))

    # Identity test for the None placeholder; `!= None` would be routed through the
    # matrix object's own comparison operator.
    # NOTE(review): slots holding None are skipped, so positions in clusters_lda_data
    # stop matching cluster ids as soon as one slot is skipped.
    if vectorized_data[current_cluster] is not None:
        clusters_lda_data.append(lda.fit_transform(vectorized_data[current_cluster]))

In [None]:
for current_vectorizer, lda in enumerate(lda_models):
    print("Current Cluster: " + str(lda))

In [None]:
# Functions for printing keywords for each topic 
#From discussion forums

def selected_topics(model, vectorizer, top_n=3):
    """Return the distinct top keywords over all topics of a fitted topic model.

    For every row of `model.components_` the `top_n` highest-weighted features are
    taken. A word already contributed by an earlier topic is not added again. The
    collected words are returned ordered by their weight, highest first.

    model: object exposing `components_` (one weight array per topic).
    vectorizer: object exposing `get_feature_names_out()` or `get_feature_names()`.
    top_n: number of features taken from each topic.
    """
    # Resolve the vocabulary once instead of once per selected word. Prefer
    # get_feature_names_out() and fall back to get_feature_names() for vectorizers
    # that only provide the older accessor.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = get_names()

    seen_words = set()
    keywords = []
    for topic in model.components_:
        # argsort is ascending, so the reversed tail holds the top_n largest weights.
        for i in topic.argsort()[:-top_n - 1:-1]:
            word = str(feature_names[i])
            if word not in seen_words:
                seen_words.add(word)
                keywords.append((word, topic[i]))

    # Stable ascending sort followed by a reversal (keeps the existing order for ties).
    keywords.sort(key=lambda pair: pair[1])
    keywords.reverse()
    return [word for word, _ in keywords]

In [None]:
all_keywords = []
for current_vectorizer, lda in enumerate(lda_models):
    print("Current Cluster: " + str(current_vectorizer))

    # Identity test for the None placeholder; `!= None` would be routed through the
    # matrix object's own comparison operator.
    # NOTE(review): slots holding None are skipped, so positions in all_keywords stop
    # matching cluster ids as soon as one slot is skipped.
    if vectorized_data[current_vectorizer] is not None:
        all_keywords.append(selected_topics(lda, vectorizers[current_vectorizer]))

In [None]:
print(all_keywords,"\n", len(all_keywords))

### LDA for every single paper

Adapted from https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

In [None]:
# Raw string so the backslashes in the Windows path are taken literally.
df_covid = pd.read_csv(r"C:\My files\Courses\CIS6050\df_clean.csv")

In [None]:
paper_ids = df_covid['paper_id']

In [None]:
paper_ids[0]

In [None]:
df_text_s = df_covid['processed_text']

In [None]:
df_text_s

In [None]:
import gensim

In [None]:
def sent_to_words(sentences):
    """Lazily yield one token list per item of `sentences`.

    Every item is converted with str() and tokenised by
    gensim.utils.simple_preprocess with deacc=True.
    """
    yield from (
        gensim.utils.simple_preprocess(str(document), deacc=True)
        for document in sentences
    )

In [None]:
data = df_text_s.values.tolist()
data_words = list(sent_to_words(data))

In [None]:
import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

Gensim assigns a unique id to each word in the corpus. Each document is then represented as a list of (word_id, word_frequency) pairs. Example: a pair (4, 1) in the output below means that the word with id 4 occurs once in that document.

In [None]:
# View
print(corpus[2])

In [None]:
from pprint import pprint

# Build LDA model
# random_state seeds the model so repeated runs start from the same state, using the
# same seed value as the PCA / KMeans cells.
# NOTE(review): with several worker processes results may still vary slightly — confirm.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=NUM_TOPICS_PER_CLUSTER,
                                       random_state=42)
# Print the keywords of the topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

In [None]:
def extractDigits(lst):
    """Wrap every element of `lst` in its own single-item list, keeping the order."""
    wrapped = []
    for element in lst:
        wrapped.append([element])
    return wrapped

#Change a list of words to list of lists
ll_words = extractDigits(data_words[0])

# Create Dictionary
id2word_t = corpora.Dictionary(ll_words)

# Create Corpus
texts_t = ll_words

# Term Document Frequency
# Built with id2word_t, the same dictionary that is handed to the model below. Using
# the full-corpus `id2word` here would produce token ids from a different vocabulary.
corpus_t = [id2word_t.doc2bow(text) for text in texts_t]

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus_t,
                                       id2word=id2word_t,
                                       num_topics=NUM_TOPICS_PER_CLUSTER)

In [None]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus_t]

In [None]:
data_words[2]

In [None]:
corpus_t

In [None]:
def extractDigits(lst):
    """Wrap every element of `lst` in its own single-item list, keeping the order.

    NOTE(review): a function with this name and body is already defined in an earlier
    cell of this notebook.
    """
    wrapped = []
    for element in lst:
        wrapped.append([element])
    return wrapped

In [None]:
from pprint import pprint

# NOTE: this rebinds the topic count that the earlier LDA cells also read.
NUM_TOPICS_PER_CLUSTER = 10
abs_lda = []

# Fit one LDA model per abstract, for the first five abstracts only. tqdm reports the
# progress, so the loop no longer prints the counter or the full token lists.
for i in tqdm(range(0,5)):

    #Change a list of words to list of lists (every word becomes a one-token document)
    ll_words = extractDigits(data_words[i])

    # Create Dictionary
    id2word_t = corpora.Dictionary(ll_words)

    # Create Corpus
    texts_t = ll_words

    # Term Document Frequency
    corpus_t = [id2word_t.doc2bow(text) for text in texts_t]

    # Build LDA model (seeded so repeated runs start from the same state)
    lda_model = gensim.models.LdaMulticore(corpus=corpus_t,
                                           id2word=id2word_t,
                                           num_topics=NUM_TOPICS_PER_CLUSTER,
                                           random_state=42)

    # Keep the model's (lazy) topic transformation of this abstract's corpus.
    abs_lda.append(lda_model[corpus_t])

In [None]:
pprint(lda_model.print_topics())
# abs_lda is a list, so it has to be indexed; calling it (`abs_lda(1)`) raises
# TypeError. Each element already holds lda_model[corpus_t] for its abstract, so the
# stored result is used directly.
# NOTE(review): lda_model is the model fitted for the last abstract of the loop,
# not necessarily the one that produced abs_lda[1] — confirm which one is wanted here.
doc_lda = abs_lda[1]

abs_lda

In [None]:
abs_lda

#### The next cells export the filtered all_json list into a DF

In [None]:
new_jj = all_json[:]
len(new_jj)

In [None]:
import os

In [None]:
# Return the longest prefix of all list elements.
def commonprefix(m):
    """Return the longest leading string shared by every element of `m`.

    The comparison is character by character, not per path component. An empty `m`
    gives ''.
    """
    if not m:
        return ''
    # Whatever the smallest and largest elements share is shared by everything
    # that sorts between them.
    lowest, highest = min(m), max(m)
    shared = 0
    for low_char, high_char in zip(lowest, highest):
        if low_char != high_char:
            break
        shared += 1
    return lowest[:shared]

In [None]:
commonprefix(new_jj)

In [None]:
len(new_jj)

In [None]:
df_test22 = pd.DataFrame(new_jj)
df_test22.to_csv('Paths_JSON_clean.csv', index=False)

In [None]:
# Scrap code

print("empty count", meta_df['abstract'].isnull().sum())

In [None]:
print(meta_df['abstract'].notnull().sum())

# Pushing The Code

In [None]:
!git branch "a_new"

In [None]:
!git checkout "a_new"

In [None]:
!git status

In [None]:
!git add .

In [None]:
!git push origin a_new

In [None]:
!git checkout main

In [None]:
!git commit -m"More clean df"

In [None]:
!git status