In [1]:
from bs4 import BeautifulSoup
import pandas as pd
from functools import reduce
import re
import collections
import numpy as np

import nltk
#nltk.download('stopwords')
#nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stopword = set(stopwords.words('english'))

In [22]:
df = pd.read_csv("MLproj.csv")

In [23]:
df

Unnamed: 0.1,Unnamed: 0,abstract,tags
0,0,real-world experience covid-19 including direc...,"['Aged' 'Aged, 80 and over'\n 'Antibodies, Mon..."
1,1,successful outcome pre-engraftment covid-19 hc...,['COVID-19*' 'Hematopoietic Stem Cell Transpla...
2,2,impact covid-19 oncology professionals-one yea...,"['Burnout, Professional* / epidemiology' 'COVI..."
3,3,icu admission mortality classifier covid-19 pa...,['Bayes Theorem' 'COVID-19*' 'Hospitalization'...
4,4,clinical evaluation nasopharyngeal midturbinat...,['COVID-19 Testing' 'COVID-19* / diagnosis' 'H...
...,...,...,...
5468,5468,hypersensitivity reaction vaccine current evid...,['Anaphylaxis* / chemically induced' 'COVID-19...
5469,5469,rooming-in breastfeeding neonatal follow-up in...,['Breast Feeding*' 'COVID-19*' 'Female' 'Follo...
5470,5470,acute abducens nerve palsy following second do...,['Abducens Nerve Diseases* / chemically induce...
5471,5471,planning implementing protocol psychosocial in...,['COVID-19*' 'Delivery of Health Care' 'Humans...


In [73]:
def remove_punctuation(text):
    
    # punctuations except -  
    punc ='''?!.,:;_—[](){}'"`~|\/@#$%^&+=*'''
    for i in text:
        if i in punc:
            text = text.replace(i, ' ')            
    return text.strip()

def preprocess(text):
    
    # lower casing
    text=text.lower()
    
    # stopword removal
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    
    # lemmatization
    text = [lemmatizer.lemmatize(word) for word in text.split(' ')]
    text = " ".join(text)
    
    # removing words containing only numbers
    text = re.sub(r'\s[0-9]+\s', '', text)
    
    # remove extra spaces
    text = re.sub("\s\s+", " ", text)   
    return text.strip()

In [74]:
df['title']=df['title'].apply(remove_punctuation)
df['title']=df['title'].apply(preprocess)

df['abstract']=df['abstract'].apply(remove_punctuation)
df['abstract']=df['abstract'].apply(preprocess)

df['tags']=df['tags'].apply(preprocess)

In [75]:
df

Unnamed: 0,title,abstract,tags
0,real-world experience covid-19 including direc...,article summarizes experience covid-19 patient...,"['aged', 'aged,over', 'antibodies, monoclonal,..."
1,successful outcome pre-engraftment covid-19 hc...,coronavirus disease covid-19 caused severe acu...,"['covid-19*', 'hematopoietic stem cell transpl..."
2,impact covid-19 oncology professionals-one yea...,background covid-19 significant impact well-be...,"['burnout, professional* / epidemiology', 'cov..."
3,icu admission mortality classifier covid-19 pa...,coronavirus disease covid-19 caused severe acu...,"['bayes theorem', 'covid-19*', 'hospitalizatio..."
4,clinical evaluation nasopharyngeal midturbinat...,setting supply chain shortage nasopharyngeal n...,"['covid-19 testing', 'covid-19* / diagnosis', ..."
...,...,...,...
5468,hypersensitivity reaction vaccine current evid...,first report hypersensitivity reaction followi...,"['anaphylaxis* / chemically induced', 'covid-1..."
5469,rooming-in breastfeeding neonatal follow-up in...,introduction due growing evidence suggesting c...,"['breast feeding*', 'covid-19*', 'female', 'fo..."
5470,acute abducens nerve palsy following second do...,author report case otherwise healthy 65-year-o...,['abducens nerve diseases* / chemically induce...
5471,planning implementing protocol psychosocial in...,present study aim plan protocol providing psyc...,"['covid-19*', 'delivery health care', 'humans'..."


In [76]:
df['tags'][0]

"['aged', 'aged,over', 'antibodies, monoclonal, humanized / therapeutic use*', 'antibodies, neutralizing / therapeutic use', 'antigens, viral / analysis', 'covid-19 testing', 'covid-19* / diagnosis', 'covid-19* / therapy', 'hospitals', 'humans', 'middle aged', 'retrospective studies', 'south dakota']"

In [24]:
def conv(val):
    val = val.replace("'","")
    val = val.strip('][').split(', ')
    return val

In [25]:
df['tags'] = df['tags'].apply(conv)

In [26]:
df['tags'][0]

['Aged Aged',
 '80 and over\n Antibodies',
 'Monoclonal',
 'Humanized / therapeutic use*\n Antibodies',
 'Neutralizing / therapeutic use Antigens',
 'Viral / analysis\n COVID-19 Testing COVID-19* / diagnosis COVID-19* / therapy\n Hospitals Humans Middle Aged Retrospective Studies South Dakota']

In [28]:
# Make sure there is no spaces in the empty elements. Example: ' ' -> ''
for i in range(len(df)):
    for j, word in enumerate(df['tags'][i]):
        df['tags'][i][j] = df['tags'][i][j].strip()
        
# Mark None for the the keyword, covid-19 and humans, since almost all the abstracts have as keyword.
# Mark None for the empty elements '' 
for i in range(len(df)):
    for j, word in enumerate(df['tags'][i]):
        if word == '':
            df['tags'][i][j] = None

In [29]:
for n,word in enumerate(df['tags'][0]):
    print(df['tags'][0][n])

Aged Aged
80 and over
 Antibodies
Monoclonal
Humanized / therapeutic use*
 Antibodies
Neutralizing / therapeutic use Antigens
Viral / analysis
 COVID-19 Testing COVID-19* / diagnosis COVID-19* / therapy
 Hospitals Humans Middle Aged Retrospective Studies South Dakota


In [30]:
df['tags'] = [list(filter(None, df['tags'][i])) for i in range(len(df))]

In [39]:
# Generate a list of all the tags 
tags_list = []
for i in range(len(df)):
    tags_list = tags_list + df['tags'][i]

# Counts for each unique keyword
counter=collections.Counter(tags_list)
print('Number of total unique tags: {}'.format(len(counter)))

# Pull the 20 most common words
most_common_words= [word for word, word_count in collections.Counter(tags_list).most_common(100)]
print('100 most common tags: {}'.format(most_common_words))

Number of total unique tags: 8817
100 most common tags: ['Antibodies', 'Adult Aged Aged', 'Attitudes', 'Aged Aged', 'Animals Antibodies', 'Monoclonal', 'Adaptation', 'Adolescent Adult Aged Aged', 'Coronavirus / genetics\n Spike Glycoprotein', 'Synthetic mRNA Vaccines', 'Adult Antibodies', 'X-Ray Computed', 'Neutralizing Antibodies', 'Coronavirus / chemistry\n Spike Glycoprotein', 'COVID-19* Education', 'Coronavirus', 'Medical*', 'Administration', 'Neutralizing / immunology Antibodies', 'Adult Aged Antibodies', 'Nursing*', 'Neutralizing / immunology\n Antibodies', 'Medical', 'Coronavirus / genetics', 'Burnout', 'Coronavirus / metabolism', 'Neutralizing / immunology*\n Antibodies', 'Coronavirus / immunology', 'Neutralizing / blood\n Antibodies', 'Coronavirus / genetics*\n Spike Glycoprotein', 'Nursing', 'Mechanical', '80 and over\n Antibodies', 'Viral / blood\n Antibodies', 'Coronavirus / immunology*', 'Coronavirus / immunology\n Spike Glycoprotein', 'Aged Antibodies', 'Coronavirus / che

In [40]:
# Keep only the most common tags and if abstract doesn't include any of the common tags, remove completely
for i in range(len(df)):
    for j, word in enumerate(df['tags'][i]):
        if word not in most_common_words:
            df['tags'][i][j] = None
                 
df['tags'] = [list(filter(None, df['tags'][i])) for i in range(len(df))]
df = df[df['tags'].map(lambda d: len(d)) > 0]
df.reset_index(inplace = True, drop = True)

print('Number of total abstracts after removal: {}'.format(len(df)))

Number of total abstracts after removal: 1060


In [41]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import hamming_loss

In [42]:
X = np.array(df['abstract'])
y = df['tags']

# MultiLabelBinarizer is used to transform the tags to a binary matrix for multilabel modeling
# Fit all labels to binarizer
mlb = MultiLabelBinarizer().fit(y)

# Split data into a features matrix and target vector
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 9000)

# Transform train and test tags into the binary matrix format
y_train_binary = mlb.transform(y_train)
y_test_binary = mlb.transform(y_test)
print('y_train_binary shape: {} \ny_test_binary shape: {}'.format(y_train_binary.shape, y_test_binary.shape))
print()
print('y_train_binary array: \n{}'.format(y_train_binary[:10]))

y_train_binary shape: (848, 100) 
y_test_binary shape: (212, 100)

y_train_binary array: 
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 

In [43]:
vectorizer = CountVectorizer(analyzer = 'word', strip_accents = 'unicode', stop_words = 'english')

# Fit and transform the training abstract text and transform the test abstract text
X_train_tok = vectorizer.fit_transform(X_train)
X_test_tok = vectorizer.transform(X_test)

In [44]:
# Transform a count matrix to a normalized tf or tf-idf representation
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_tok)
X_test_tfidf = tfidf_transformer.transform(X_test_tok)

In [45]:
# Look at the C parameter
clf = OneVsRestClassifier(LinearSVC(max_iter=700))
clf.fit(X_train_tfidf,y_train_binary)
y_preds_binary = clf.predict(X_test_tfidf)

actual_key = mlb.inverse_transform(y_test_binary)[1:30]
predicted_key = mlb.inverse_transform(y_preds_binary)[1:30]

print("Standard training set score: {:.3f}".format(clf.score(X_train_tfidf, y_train_binary)))
print("Standard test set score: {:.3f}".format(clf.score(X_test_tfidf, y_test_binary)))
print("Hamming loss measure: {:.10f}".format(hamming_loss(y_test_binary,y_preds_binary)))

print()
count = 0
print('Actual labels:')
for label in actual_key:
    count += 1
    print(count, label)
    
print()

count = 0
print('Predicted labels:')
for label in predicted_key:
    count +=1
    print(count, label)

Standard training set score: 1.000
Standard test set score: 0.137
Hamming loss measure: 0.0123113208

Actual labels:
1 ('Attitudes',)
2 ('Adult Aged Aged',)
3 ('Attitudes',)
4 ('Coronavirus / immunology',)
5 ('Arthritis',)
6 ('Adaptation',)
7 ('Multiple',)
8 ('Coronavirus',)
9 ('Coronavirus / genetics\n Spike Glycoprotein',)
10 ('Attitudes',)
11 ('Burnout',)
12 ('X-Ray Computed',)
13 ('Psychological / epidemiology',)
14 ('Adaptation',)
15 ('Aged Antibodies', 'Coronavirus')
16 ('Adult Aged Aged',)
17 ('COVID-19* Education',)
18 ('Antibodies',)
19 ('80 and over\n Antibodies', 'Aged Aged', 'Monoclonal')
20 ('Coronavirus / chemistry\n Spike Glycoprotein', 'Coronavirus / genetics\n Spike Glycoprotein', 'Coronavirus / immunology*\n Spike Glycoprotein', 'Coronavirus / metabolism')
21 ('Medical*',)
22 ('Burnout',)
23 ('COVID-19* Emergency Service',)
24 ('Animals Antibodies', 'Neutralizing / blood Antibodies')
25 ('Antibodies', 'Coronavirus / genetics')
26 ('COVID-19* Humans Pandemics SARS-CoV-

In [46]:
classifier2 = Pipeline([
    ('vectorizer', CountVectorizer(analyzer = 'word', strip_accents = 'unicode', stop_words = 'english')),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(max_iter = 2000)))], verbose = True)

classifier2 = classifier2.fit(X_train, y_train_binary)
y_preds_binary_2 = classifier2.predict(X_test)

actual_2 = mlb.inverse_transform(y_test_binary)[1:30]
predicted_2 = mlb.inverse_transform(y_preds_binary_2)[1:30]

print("Standard training set score: {:.3f}".format(classifier2.score(X_train, y_train_binary)))
print("Standard test set score: {:.3f}".format(classifier2.score(X_test, y_test_binary)))
print("Hamming loss measure: {:.10f}".format(hamming_loss(y_test_binary,y_preds_binary_2)))

print()
count = 0
print('Actual labels:')
for label in actual_2:
    count += 1
    print(count, label)
    
print()

count = 0
print('Predicted labels:')
for label in predicted_2:
    count +=1
    print(count, label)

[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   0.1s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.3s
Standard training set score: 1.000
Standard test set score: 0.137
Hamming loss measure: 0.0123113208

Actual labels:
1 ('Attitudes',)
2 ('Adult Aged Aged',)
3 ('Attitudes',)
4 ('Coronavirus / immunology',)
5 ('Arthritis',)
6 ('Adaptation',)
7 ('Multiple',)
8 ('Coronavirus',)
9 ('Coronavirus / genetics\n Spike Glycoprotein',)
10 ('Attitudes',)
11 ('Burnout',)
12 ('X-Ray Computed',)
13 ('Psychological / epidemiology',)
14 ('Adaptation',)
15 ('Aged Antibodies', 'Coronavirus')
16 ('Adult Aged Aged',)
17 ('COVID-19* Education',)
18 ('Antibodies',)
19 ('80 and over\n Antibodies', 'Aged Aged', 'Monoclonal')
20 ('Coronavirus / chemistry\n Spike Glycoprotein', 'Coronavirus / genetics\n Spike Glycoprotein', 'Coronavirus / immunology*\n Spike Glycoprotein', 'Coronavirus / metabolism')
21

In [24]:
import matplotlib.pyplot as plt

In [27]:
type(classifier2)

sklearn.pipeline.Pipeline