# Text Classification & Clustering

This project uses MEDLINE articles, each uniquely identified by its PubMed ID (PMID), together with their titles and abstracts. NLP techniques, such as unigrams and TF-IDF, were used to extract features for predicting whether an article is tagged with a given MeSH term. Additionally, KMeans clustering was used as a feature-engineering technique for the same prediction task. An SVM classifier (LinearSVC) was used to make a binary yes/no prediction for each MeSH term.

In [1]:
import gzip
import math
import re
import pandas as pd
import sklearn
from sklearn import svm
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

NOTE: This project originally used multiple files of MEDLINE articles; however, only one file is used here for demonstration purposes.

In [2]:
# Gzipped MEDLINE export(s) to load; only one file is used in this demo.
file_list = ['medline.0.txt.gz']

In [3]:
# MeSH terms for which a separate yes/no classifier is trained.
mesh_list = [
    'Humans',
    'Female',
    'Male',
    'Animals',
    'Treatment Outcome',
    'Neoplasms',
    'Prognosis',
    'Risk Factors',
    'Breast Neoplasms',
    'Lung Neoplasms',
]

## Load Data & Preprocessing

In [4]:
# A field value runs until the next line that starts with an upper-case tag,
# or until the end of the record (continuation lines start with spaces).
_RECORD_START_RE = re.compile(r'^PMID- ', re.MULTILINE)
_PMID_RE = re.compile(r'PMID- ([0-9]+)')
_TITLE_RE = re.compile(r'^TI  - (.+?)(?=\n[A-Z]|\Z)', re.DOTALL | re.MULTILINE)
_ABSTRACT_RE = re.compile(r'^AB  - (.+?)(?=\n[A-Z]|\Z)', re.DOTALL | re.MULTILINE)


def _clean_field(raw):
    """Join the wrapped continuation lines of a MEDLINE field into one line."""
    return raw.replace('\n      ', ' ').rstrip()


def _parse_record(record):
    """Return ``(pmid, entry)`` for one MEDLINE record, or ``None`` if unusable."""
    pmid = _PMID_RE.match(record)
    title = _TITLE_RE.search(record)
    abstract = _ABSTRACT_RE.search(record)
    if pmid is None or title is None or abstract is None:
        return None

    headings = [line.partition('- ')[2].rstrip()
                for line in record.splitlines()
                if line.startswith('MH ')]
    entry = {'AB': _clean_field(abstract.group(1)),
             'TI': _clean_field(title.group(1)),
             'MH': headings}
    return pmid.group(1), entry


def read_data(filenames):
    """Parse gzipped MEDLINE files into a dict keyed by PMID.

    Each file is split into records at the lines that start with ``PMID- ``
    and the title, abstract and MeSH headings are extracted from each record
    on its own. Pairing the fields per record (instead of zipping three
    file-wide lists) means a record that lacks a title or an abstract cannot
    shift the fields of the records that follow it, nor cause a ``KeyError``
    when the MeSH headings are attached.

    Args:
        filenames: iterable of paths to gzip-compressed, UTF-8 encoded
            MEDLINE-format text files.

    Returns:
        dict mapping PMID (str) to ``{'AB': str, 'TI': str, 'MH': list}``,
        where ``'MH'`` holds the raw MeSH heading strings. Records without
        both a title and an abstract are skipped. If a PMID occurs more than
        once, the last occurrence wins.
    """
    data = {}
    for file in filenames:
        with gzip.open(file, 'rb') as f:
            contents = f.read().decode('utf-8')

        starts = [m.start() for m in _RECORD_START_RE.finditer(contents)]
        ends = starts[1:] + [len(contents)]
        for begin, end in zip(starts, ends):
            parsed = _parse_record(contents[begin:end])
            if parsed is not None:
                pmid, entry = parsed
                data[pmid] = entry

    return data

In [5]:
# Parse every file in file_list into {PMID: {'AB', 'TI', 'MH'}}; the bare
# `data` expression echoes the result as the cell output.
data = read_data(file_list)
data

{'22997744': {'AB': 'To diagnose recurrent colorectal cancer is an urgent problem of oncoproctology. Eighty patients with suspected recurrent colon tumor were examined. All the patients underwent irrigoscopy, colonoscopy, magnetic resonance imaging of the abdomen and small pelvis. The major magnetic resonance symptoms of recurrent colon tumors were studied; a differential diagnosis of recurrent processes and postoperative changes at the site of intervention was made.',
  'TI': '[Value of magnetic resonance imaging in the diagnosis of recurrent colorectal cancer].',
  'MH': ['Adult',
   'Aged',
   'Colon/pathology/surgery',
   'Colorectal Neoplasms/*diagnosis/pathology/surgery',
   'Diagnosis, Differential',
   'Female',
   'Humans',
   'Magnetic Resonance Imaging/*methods',
   'Male',
   'Middle Aged',
   'Neoplasm Recurrence, Local/*diagnosis',
   'Postoperative Complications/*diagnosis',
   'Rectum/pathology/surgery',
   'Reproducibility of Results']},
 '22997834': {'AB': 'OBJECTIVE:

## Tokenize

In [6]:
# A token is either a run of word characters or a run of characters that are
# neither whitespace nor word characters (i.e. punctuation). A raw string is
# used so that `\w` and `\s` reach the regex engine instead of being treated
# as (invalid) Python string escapes.
tokenizer = re.compile(r'\w+|[^\s\w]+')

def tokenize(text):
    """Lower-case *text* and return its word and punctuation tokens in order."""
    return tokenizer.findall(text.lower())

## Create Training & Testing Data

In [7]:
def _label_pairs(data, ids, mesh_list):
    """Return a ``[pmid, term]`` pair for each term of *mesh_list* tagged on each PMID in *ids*."""
    pairs = []
    for id_ in ids:
        mh = mesh(data, id_)
        pairs.extend([id_, word] for word in mesh_list if word in mh)
    return pairs


def _feature_frame(data, ids, mesh_list, featurize):
    """Build one row per (PMID, matching term): feature columns plus 'PMID' and 'Label'."""
    features = {id_: featurize(data, id_) for id_ in ids}
    # Name the index before resetting it. A bare reset_index() calls the new
    # column 'index', or 'level_0' only when a feature column named 'index'
    # already exists, so renaming 'level_0' depended on the vocabulary.
    # Tokens are lower-cased, so 'PMID' cannot collide with a feature column.
    frame = pd.DataFrame(features).T.rename_axis('PMID').reset_index()
    labels = pd.DataFrame(_label_pairs(data, ids, mesh_list), columns=['PMID', 'Label'])
    return frame.merge(labels, on='PMID')


def train_test(data, train, test, mesh_list, prob):
    """Build the training and testing frames for the requested feature set.

    Args:
        data: dict of articles as returned by ``read_data``.
        train: PMIDs used for training.
        test: PMIDs used for testing.
        mesh_list: MeSH terms that may appear in the 'Label' column.
        prob: ``'cluster'`` returns only 'PMID' and 'Label' columns (the
            caller adds the cluster feature); ``'unigram'`` uses ``unigrams``
            features; any other value uses ``tfidf`` features.

    Returns:
        ``(train_df, test_df)``. Each frame has one row per (PMID, term) pair
        for the terms of *mesh_list* tagged on the article, so an article
        tagged with none of the terms does not appear at all (inner merge).
        Test features are restricted to columns that also occur in training,
        and missing feature values are filled with 0.
    """
    if prob == 'cluster':
        labels = pd.DataFrame(_label_pairs(data, data, mesh_list), columns=['PMID', 'Label'])
        return labels[labels.PMID.isin(train)].copy(), labels[labels.PMID.isin(test)].copy()

    featurize = unigrams if prob == 'unigram' else tfidf
    df_trn = _feature_frame(data, train, mesh_list, featurize)
    df_tst = _feature_frame(data, test, mesh_list, featurize)

    # Words seen only in the test articles are dropped; words seen only in
    # the training articles become 0 for the test rows.
    shared = df_trn.columns.intersection(df_tst.columns)
    df = pd.concat([df_trn, df_tst[shared]]).fillna(0)
    train_df = df[df.PMID.isin(train)].copy()
    test_df = df[df.PMID.isin(test)].copy()

    return train_df, test_df

## Classifier

In [8]:
def predict(trn, tst, word):
    """Train a LinearSVC for one MeSH term and return the test PMIDs predicted positive.

    Args:
        trn: training frame with 'PMID', 'Label' and feature columns; it may
            hold several rows per PMID (one per MeSH label).
        tst: testing frame with the same layout.
        word: the MeSH term treated as the positive class.

    Returns:
        list of PMIDs from *tst* predicted to carry *word*, ordered by PMID.
        If the training labels contain a single class no classifier can be
        fitted, so that constant is predicted: every test PMID when all
        training articles carry *word*, otherwise an empty list. An empty
        test frame also yields an empty list.
    """
    def binarize(frame):
        # Label becomes 1 for `word` and 0 otherwise. Sorting then keeping
        # the last row per PMID retains the 1 whenever any of the article's
        # rows carried `word`.
        out = frame.copy()
        out['Label'] = out['Label'].map({word: 1}).fillna(0)
        return (out.sort_values(['PMID', 'Label'])
                   .drop_duplicates('PMID', keep='last')
                   .reset_index(drop=True))

    new_trn = binarize(trn)
    new_tst = binarize(tst)
    if new_tst.empty:
        return []

    y_train = new_trn['Label']
    classes = y_train.unique()
    if len(classes) < 2:
        # LinearSVC.fit refuses single-class targets; fall back to a constant.
        if len(classes) == 1 and classes[0] == 1:
            return list(new_tst.PMID)
        return []

    X_train = new_trn.drop(['PMID', 'Label'], axis=1)
    X_test = new_tst.drop(['PMID', 'Label'], axis=1)

    clf = svm.LinearSVC()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    pmid_pred = pd.DataFrame({'PMID': new_tst.PMID, 'Predicted': y_pred})
    return list(pmid_pred[pmid_pred.Predicted == 1].PMID)

In [9]:
# Predict with Unigrams
def svm_predict_unigram(data, train, test, mesh):
    """Predict each MeSH term in *mesh* from unigram features.

    Returns a dict mapping every term to the list of test PMIDs that its
    classifier predicted as positive.
    """
    frames = train_test(data, train, test, mesh, 'unigram')
    trn_frame, tst_frame = frames
    return {term: predict(trn_frame, tst_frame, term) for term in mesh}

In [10]:
# Predict with TF-IDF
def svm_predict_tfidf(data, train, test, mesh):
    """Predict each MeSH term in *mesh* from TF-IDF features.

    Returns a dict mapping every term to the list of test PMIDs that its
    classifier predicted as positive.
    """
    frames = train_test(data, train, test, mesh, 'tfidf')
    trn_frame, tst_frame = frames
    return {term: predict(trn_frame, tst_frame, term) for term in mesh}

#### Return List of All PMIDs in the Data

In [11]:
def list_pmids(data):
    """Return the keys (PMIDs) of *data* as a list, in iteration order."""
    return list(data)

In [12]:
# Every PMID in `data`, in the dict's iteration order.
pmids = list_pmids(data)

#### Return List of MeSH Terms for a Document

In [13]:
def mesh(data, pmid):
    """Return the main MeSH headings of the article *pmid*.

    Each raw 'MH' entry is cut at its first '/' (dropping any subheadings)
    and every '*' marker is removed from what remains.
    """
    return [heading.partition('/')[0].replace('*', '')
            for heading in data[pmid]['MH']]

#### Return Unigrams

Returns a dict that maps each unique token in the title and abstract of the article with the given PMID to a value of 1.0.

In [14]:
def unigrams(data, pmid):
    """Return binary unigram features for the article *pmid*.

    Args:
        data: dict of articles with 'TI' and 'AB' text for each PMID.
        pmid: key of the article to featurise.

    Returns:
        dict mapping every distinct token of the title and abstract to 1.0.
    """
    article = data[pmid]
    vocabulary = set(tokenize(article['TI']) + tokenize(article['AB']))
    # Each token occurs once in the set, so every value is exactly 1.0; the
    # former "increment if already present" branch could never run.
    return dict.fromkeys(vocabulary, 1.0)

#### Return TF-IDF

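Returns a dict of all unique words in the article, each with a value equal to its TF-IDF score. TF = term frequency, the number of times a term (token) appears in the given document (the combination of title and abstract in this case). IDF = inverse document frequency, computed here as $\log(N / n_t)$, where $N$ is the total number of documents and $n_t$ is the number of documents that contain the term.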

In [15]:
def tfidf(data, pmid):
    """Return TF-IDF features for the article *pmid*.

    The score of a token is its count in the article's title plus abstract,
    multiplied by ``log(len(data) / df)`` where ``df`` is the number of
    articles in *data* whose title or abstract contains the token.

    NOTE: every call tokenizes every article in *data* to obtain the
    document frequencies.
    """
    # Term frequency within this article.
    term_counts = {}
    for token in tokenize(data[pmid]['TI']) + tokenize(data[pmid]['AB']):
        term_counts[token] = term_counts.get(token, 0.0) + 1.0

    # Document frequency of this article's tokens across all articles.
    doc_counts = dict.fromkeys(term_counts, 0)
    for article in data.values():
        seen = set(tokenize(article['TI']) + tokenize(article['AB']))
        for token in term_counts.keys() & seen:
            doc_counts[token] += 1.0

    total_docs = len(data)
    return {token: count * math.log(total_docs / doc_counts[token])
            for token, count in term_counts.items()}

## Predictions

In [16]:
# Positional split: the first 80% of `pmids` are used for training and the
# remaining PMIDs are held out for testing (no shuffling).
tts = int(len(pmids) * 0.8)
train = pmids[:tts]
test = pmids[tts:]

#### Using Unigrams as Features

In [17]:
# For each term in mesh_list: the test PMIDs predicted positive from unigram features.
svm_unigram = svm_predict_unigram(data, train, test, mesh_list)
svm_unigram

{'Humans': ['23016912',
  '23016985',
  '23016986',
  '23017037',
  '23017053',
  '23017100',
  '23017137',
  '23017138',
  '23017142',
  '23017148',
  '23017157',
  '23017162',
  '23017173',
  '23017217',
  '23017243',
  '23017372',
  '23017386',
  '23017388',
  '23017402',
  '23017513',
  '23017516',
  '23017517',
  '23017522',
  '23017528',
  '23017529',
  '23017550',
  '23017603',
  '23017604',
  '23017605',
  '23017612',
  '23017613',
  '23017617',
  '23017618',
  '23017622',
  '23017624',
  '23017625',
  '23017627',
  '23017633',
  '23017657',
  '23017667',
  '23017669',
  '23017671',
  '23017673',
  '23017679',
  '23017724',
  '23017769',
  '23017787',
  '23017819',
  '23017820',
  '23017821',
  '23017832',
  '23017866',
  '23017870',
  '23017941',
  '23017942',
  '23017944',
  '23017983',
  '23017985',
  '23018034',
  '23018093',
  '23018096',
  '23018112',
  '23018169',
  '23018179',
  '23018213',
  '23018214',
  '23018234',
  '23018243',
  '23018280',
  '23018353',
  '2301841

#### Using TF-IDF as Features

In [18]:
# For each term in mesh_list: the test PMIDs predicted positive from TF-IDF features.
svm_tfidf = svm_predict_tfidf(data, train, test, mesh_list)
svm_tfidf

{'Humans': ['23016912',
  '23016985',
  '23016986',
  '23017037',
  '23017053',
  '23017100',
  '23017137',
  '23017138',
  '23017142',
  '23017148',
  '23017157',
  '23017162',
  '23017173',
  '23017217',
  '23017243',
  '23017372',
  '23017386',
  '23017388',
  '23017402',
  '23017513',
  '23017516',
  '23017517',
  '23017522',
  '23017528',
  '23017529',
  '23017550',
  '23017603',
  '23017604',
  '23017605',
  '23017612',
  '23017613',
  '23017617',
  '23017618',
  '23017622',
  '23017624',
  '23017625',
  '23017627',
  '23017633',
  '23017657',
  '23017667',
  '23017669',
  '23017671',
  '23017673',
  '23017679',
  '23017724',
  '23017769',
  '23017787',
  '23017819',
  '23017820',
  '23017821',
  '23017832',
  '23017866',
  '23017870',
  '23017941',
  '23017942',
  '23017944',
  '23017983',
  '23017985',
  '23018034',
  '23018093',
  '23018096',
  '23018112',
  '23018169',
  '23018179',
  '23018213',
  '23018214',
  '23018234',
  '23018243',
  '23018280',
  '23018353',
  '2301841

### Evaluation

In [19]:
def evaluate(data, test, mesh_predict):
    """Score per-term predictions against the MeSH headings of the test articles.

    Args:
        data: dict of articles; the true headings are read with ``mesh``.
        test: PMIDs that were held out for testing.
        mesh_predict: dict mapping each MeSH term to the PMIDs predicted
            positive for it.

    Returns:
        dict mapping each term to its 'accuracy', 'precision', 'recall' and
        'f1', computed over all PMIDs in *test* (a PMID missing from the
        term's prediction list counts as a negative prediction).
    """
    test = list(test)
    # Look the true headings up once per article; sets make the membership
    # tests below constant time.
    actual_terms = {id_: set(mesh(data, id_)) for id_ in test}

    evaluation = {}
    for word, predicted_ids in mesh_predict.items():
        predicted = set(predicted_ids)
        # Building the label vectors directly replaces the row-by-row
        # DataFrame.append calls, which no longer exist in pandas 2.x.
        y_true = [int(word in actual_terms[id_]) for id_ in test]
        y_pred = [int(id_ in predicted) for id_ in test]
        evaluation[word] = {'accuracy': accuracy_score(y_true, y_pred),
                            'precision': precision_score(y_true, y_pred),
                            'recall': recall_score(y_true, y_pred),
                            'f1': f1_score(y_true, y_pred)}

    return evaluation

#### Unigrams

In [20]:
# Per-term accuracy/precision/recall/F1 of the unigram predictions on the test PMIDs.
evaluate(data, test, svm_unigram)

{'Humans': {'accuracy': 0.965,
  'precision': 0.965,
  'recall': 1.0,
  'f1': 0.9821882951653944},
 'Female': {'accuracy': 0.775,
  'precision': 0.7522123893805309,
  'recall': 0.8333333333333334,
  'f1': 0.7906976744186047},
 'Male': {'accuracy': 0.74,
  'precision': 0.6578947368421053,
  'recall': 0.6578947368421053,
  'f1': 0.6578947368421053},
 'Animals': {'accuracy': 0.845,
  'precision': 0.6206896551724138,
  'recall': 0.47368421052631576,
  'f1': 0.5373134328358208},
 'Treatment Outcome': {'accuracy': 0.87,
  'precision': 0.5,
  'recall': 0.11538461538461539,
  'f1': 0.1875},
 'Neoplasms': {'accuracy': 0.835,
  'precision': 0.3333333333333333,
  'recall': 0.2222222222222222,
  'f1': 0.26666666666666666},
 'Prognosis': {'accuracy': 0.96,
  'precision': 0.75,
  'recall': 0.3,
  'f1': 0.4285714285714285},
 'Risk Factors': {'accuracy': 0.92,
  'precision': 0.5,
  'recall': 0.1875,
  'f1': 0.2727272727272727},
 'Breast Neoplasms': {'accuracy': 0.89,
  'precision': 0.7037037037037037,

#### TF-IDF

In [21]:
# Per-term accuracy/precision/recall/F1 of the TF-IDF predictions on the test PMIDs.
evaluate(data, test, svm_tfidf)

{'Humans': {'accuracy': 0.965,
  'precision': 0.965,
  'recall': 1.0,
  'f1': 0.9821882951653944},
 'Female': {'accuracy': 0.835,
  'precision': 0.8108108108108109,
  'recall': 0.8823529411764706,
  'f1': 0.8450704225352113},
 'Male': {'accuracy': 0.79,
  'precision': 0.7575757575757576,
  'recall': 0.6578947368421053,
  'f1': 0.704225352112676},
 'Animals': {'accuracy': 0.88,
  'precision': 0.8181818181818182,
  'recall': 0.47368421052631576,
  'f1': 0.6},
 'Treatment Outcome': {'accuracy': 0.875,
  'precision': 0.5714285714285714,
  'recall': 0.15384615384615385,
  'f1': 0.24242424242424246},
 'Neoplasms': {'accuracy': 0.87,
  'precision': 0.5454545454545454,
  'recall': 0.2222222222222222,
  'f1': 0.3157894736842105},
 'Prognosis': {'accuracy': 0.945,
  'precision': 0.4444444444444444,
  'recall': 0.4,
  'f1': 0.4210526315789474},
 'Risk Factors': {'accuracy': 0.915,
  'precision': 0.4,
  'recall': 0.125,
  'f1': 0.19047619047619047},
 'Breast Neoplasms': {'accuracy': 0.935,
  'prec

## Adding KMeans Clustering

In [22]:
# Number of KMeans clusters, passed as `k` to the cluster-based predictors below.
K = 10

In [23]:
def kmeans(data, k):
    """Cluster all articles in *data* into *k* groups on their unigram features.

    Returns a dict mapping each PMID to its integer cluster label.
    """
    vectors = {id_: unigrams(data, id_) for id_ in data}
    # Rows are articles, columns are tokens; a token absent from an article is 0.
    matrix = pd.DataFrame(vectors).T.fillna(0)

    model = KMeans(n_clusters=k, random_state=0, init='random')
    model.fit(matrix)

    return {pmid: int(label) for pmid, label in zip(matrix.index, model.labels_)}

In [24]:
def svm_predict_cluster(data, train, test, mesh, k):
    """Predict each MeSH term in *mesh* using only the KMeans cluster label.

    The cluster id of every article is added as the single numeric feature
    column 'Cluster_Class'.

    Returns a dict mapping every term to the list of test PMIDs that its
    classifier predicted as positive.
    """
    cluster_of = kmeans(data, k)
    trn_frame, tst_frame = train_test(data, train, test, mesh, 'cluster')
    trn_frame = trn_frame.assign(Cluster_Class=trn_frame['PMID'].map(cluster_of))
    tst_frame = tst_frame.assign(Cluster_Class=tst_frame['PMID'].map(cluster_of))

    return {term: predict(trn_frame, tst_frame, term) for term in mesh}

### Predictions

#### Using Cluster Labels as Feature

In [25]:
# For each term in mesh_list: the test PMIDs predicted positive from the cluster label alone.
svm_cluster = svm_predict_cluster(data, train, test, mesh_list, K)
svm_cluster

{'Humans': ['23016912',
  '23016985',
  '23016986',
  '23017037',
  '23017053',
  '23017100',
  '23017137',
  '23017138',
  '23017142',
  '23017148',
  '23017157',
  '23017162',
  '23017173',
  '23017217',
  '23017243',
  '23017372',
  '23017386',
  '23017388',
  '23017402',
  '23017513',
  '23017516',
  '23017517',
  '23017522',
  '23017528',
  '23017529',
  '23017550',
  '23017603',
  '23017604',
  '23017605',
  '23017612',
  '23017613',
  '23017617',
  '23017618',
  '23017622',
  '23017624',
  '23017625',
  '23017627',
  '23017633',
  '23017657',
  '23017667',
  '23017669',
  '23017671',
  '23017673',
  '23017679',
  '23017724',
  '23017769',
  '23017787',
  '23017819',
  '23017820',
  '23017821',
  '23017832',
  '23017866',
  '23017870',
  '23017941',
  '23017942',
  '23017944',
  '23017983',
  '23017985',
  '23018034',
  '23018093',
  '23018096',
  '23018112',
  '23018169',
  '23018179',
  '23018213',
  '23018214',
  '23018234',
  '23018243',
  '23018280',
  '23018353',
  '2301841

#### Evaluate

In [26]:
# Per-term accuracy/precision/recall/F1 of the cluster-only predictions on the test PMIDs.
evaluate(data, test, svm_cluster)

{'Humans': {'accuracy': 0.965,
  'precision': 0.965,
  'recall': 1.0,
  'f1': 0.9821882951653944},
 'Female': {'accuracy': 0.58,
  'precision': 0.5584415584415584,
  'recall': 0.8431372549019608,
  'f1': 0.671875},
 'Male': {'accuracy': 0.615,
  'precision': 0.4,
  'recall': 0.02631578947368421,
  'f1': 0.04938271604938271},
 'Animals': {'accuracy': 0.78, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0},
 'Treatment Outcome': {'accuracy': 0.87,
  'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0},
 'Neoplasms': {'accuracy': 0.865, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0},
 'Prognosis': {'accuracy': 0.95, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0},
 'Risk Factors': {'accuracy': 0.92,
  'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0},
 'Breast Neoplasms': {'accuracy': 0.835,
  'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0},
 'Lung Neoplasms': {'accuracy': 0.925,
  'precision': 0.0,
  'recall': 0.0,
  'f1': 0.0}}

#### Using Unigrams AND Cluster Labels as Features

In [27]:
def svm_predict_cluster_unigrams(data, train, test, mesh, k):
    """Predict each MeSH term in *mesh* from unigram features plus the cluster label.

    The KMeans cluster id of every article is appended to the unigram
    features as the numeric column 'Cluster_Class'.

    Returns a dict mapping every term to the list of test PMIDs that its
    classifier predicted as positive.
    """
    cluster_of = kmeans(data, k)
    trn_frame, tst_frame = train_test(data, train, test, mesh, 'unigram')
    trn_frame = trn_frame.assign(Cluster_Class=trn_frame['PMID'].map(cluster_of))
    tst_frame = tst_frame.assign(Cluster_Class=tst_frame['PMID'].map(cluster_of))

    return {term: predict(trn_frame, tst_frame, term) for term in mesh}

In [28]:
# For each term in mesh_list: the test PMIDs predicted positive from unigrams plus the cluster label.
svm_cluster_unigrams = svm_predict_cluster_unigrams(data, train, test, mesh_list, K)
svm_cluster_unigrams

{'Humans': ['23016912',
  '23016985',
  '23016986',
  '23017037',
  '23017053',
  '23017100',
  '23017137',
  '23017138',
  '23017142',
  '23017148',
  '23017157',
  '23017162',
  '23017173',
  '23017217',
  '23017243',
  '23017372',
  '23017386',
  '23017388',
  '23017402',
  '23017513',
  '23017516',
  '23017517',
  '23017522',
  '23017528',
  '23017529',
  '23017550',
  '23017603',
  '23017604',
  '23017605',
  '23017612',
  '23017613',
  '23017617',
  '23017618',
  '23017622',
  '23017624',
  '23017625',
  '23017627',
  '23017633',
  '23017657',
  '23017667',
  '23017669',
  '23017671',
  '23017673',
  '23017679',
  '23017724',
  '23017769',
  '23017787',
  '23017819',
  '23017820',
  '23017821',
  '23017832',
  '23017866',
  '23017870',
  '23017941',
  '23017942',
  '23017944',
  '23017983',
  '23017985',
  '23018034',
  '23018093',
  '23018096',
  '23018112',
  '23018169',
  '23018179',
  '23018213',
  '23018214',
  '23018234',
  '23018243',
  '23018280',
  '23018353',
  '2301841

#### Evaluate

In [29]:
# Per-term accuracy/precision/recall/F1 of the unigram + cluster predictions on the test PMIDs.
evaluate(data, test, svm_cluster_unigrams)

{'Humans': {'accuracy': 0.965,
  'precision': 0.965,
  'recall': 1.0,
  'f1': 0.9821882951653944},
 'Female': {'accuracy': 0.79,
  'precision': 0.7678571428571429,
  'recall': 0.8431372549019608,
  'f1': 0.8037383177570093},
 'Male': {'accuracy': 0.745,
  'precision': 0.6666666666666666,
  'recall': 0.6578947368421053,
  'f1': 0.6622516556291391},
 'Animals': {'accuracy': 0.855,
  'precision': 0.6666666666666666,
  'recall': 0.47368421052631576,
  'f1': 0.5538461538461538},
 'Treatment Outcome': {'accuracy': 0.87,
  'precision': 0.5,
  'recall': 0.11538461538461539,
  'f1': 0.1875},
 'Neoplasms': {'accuracy': 0.825,
  'precision': 0.2777777777777778,
  'recall': 0.18518518518518517,
  'f1': 0.22222222222222224},
 'Prognosis': {'accuracy': 0.96,
  'precision': 0.75,
  'recall': 0.3,
  'f1': 0.4285714285714285},
 'Risk Factors': {'accuracy': 0.92,
  'precision': 0.5,
  'recall': 0.1875,
  'f1': 0.2727272727272727},
 'Breast Neoplasms': {'accuracy': 0.89,
  'precision': 0.7037037037037037