## Replicating code/results from https://github.com/bvanaken/clinical-outcome-prediction  (Mortality)

1. Clone Github profile and generated train/test/val datasets using task
2. Uploaded datasets and concat to Google Colab
3. Apply basic cleaning such as stopword removal, tokenization
4. Create TFIDF vector (50k + columns)
5. Train/Test with undersampling and no undersampling
6. AUC ~ 0.78 (comparable to literature) 

### Installing and importing packages

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install contractions

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer
import re
import contractions
import seaborn as sns
from collections import Counter

import spacy
nlp = spacy.load('en_core_web_sm')
nltk.download('wordnet')
all_stopwords = nlp.Defaults.stop_words


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Cleaning data

In [4]:
mp_results_test = pd.read_csv('/content/drive/MyDrive/MP_RESULTS/MP_IN_adm_test.csv')
mp_results_train = pd.read_csv('/content/drive/MyDrive/MP_RESULTS/MP_IN_adm_train.csv')
mp_results_val = pd.read_csv('/content/drive/MyDrive/MP_RESULTS/MP_IN_adm_val.csv')

df = pd.concat([mp_results_test, mp_results_train, mp_results_val])
df.shape

def filter_admission_text(notes_df) -> pd.DataFrame:
    """
    Filter text information by section and only keep sections that are known on admission time.
    """
    admission_sections = {
        "CHIEF_COMPLAINT": "chief complaint:",
        "PRESENT_ILLNESS": "present illness:",
        "MEDICAL_HISTORY": "medical history:",
        "MEDICATION_ADM": "medications on admission:",
        "ALLERGIES": "allergies:",
        "PHYSICAL_EXAM": "physical exam:",
        "FAMILY_HISTORY": "family history:",
        "SOCIAL_HISTORY": "social history:"
    }

    # replace linebreak indicators
    notes_df['text'] = notes_df['text'].str.replace(r"\n", r"\\n")

    # extract each section by regex
    for key in admission_sections.keys():
        section = admission_sections[key]
        notes_df[key] = notes_df.text.str.extract(r'(?i){}(.+?)\\n\\n[^(\\|\d|\.)]+?:'
                                                  .format(section))

        notes_df[key] = notes_df[key].str.replace(r'\\n', r' ')
        notes_df[key] = notes_df[key].str.strip()
        notes_df[key] = notes_df[key].fillna("")
        notes_df[notes_df[key].str.startswith("[]")][key] = ""

    # filter notes with missing main information
#     notes_df = notes_df[(notes_df.CHIEF_COMPLAINT != "") | (notes_df.PRESENT_ILLNESS != "") |
#                         (notes_df.MEDICAL_HISTORY != "")]

    # add section headers and combine into TEXT_ADMISSION
    notes_df = notes_df.assign(TEXT="CHIEF COMPLAINT: " + notes_df.CHIEF_COMPLAINT.astype(str)
                                    + '\n\n' +
                                    "PRESENT ILLNESS: " + notes_df.PRESENT_ILLNESS.astype(str)
                                    + '\n\n' +
                                    "MEDICAL HISTORY: " + notes_df.MEDICAL_HISTORY.astype(str)
                                    + '\n\n' +
                                    "MEDICATION ON ADMISSION: " + notes_df.MEDICATION_ADM.astype(str)
                                    + '\n\n' +
                                    "ALLERGIES: " + notes_df.ALLERGIES.astype(str)
                                    + '\n\n' +
                                    "PHYSICAL EXAM: " + notes_df.PHYSICAL_EXAM.astype(str)
                                    + '\n\n' +
                                    "FAMILY HISTORY: " + notes_df.FAMILY_HISTORY.astype(str)
                                    + '\n\n' +
                                    "SOCIAL HISTORY: " + notes_df.SOCIAL_HISTORY.astype(str))

    return notes_df

df_filtered = filter_admission_text(df)

my_stop_words = ['discharge', 'diagnosis', 'medications', 'medication', 'disposition', 'condition', 'instructions', 
                 'status', 'secondary', 'changes', 'instruction', 'change', 'home', 'name', 'hospital', 'daily',
                'hour', 'follow', 'care', 'time', 'day', 'week', 'with', 'disp', 'discharged', 'admitted', 
                 'namepattern', 'none', 'chief', 'complaint', 'physical', 'exam', 'present', 'illness', 'family','year', 'history','admission', 'social', 'medical', 'allergies']

stemmer = WordNetLemmatizer()

def clean_string(s):
    # Remove all the special characters
    s_clean = re.sub(r'\W', ' ', s)
    s_clean = re.sub('_', ' ', s)
    # Remove new line characters
    s_clean = s_clean.replace("\n", ' ')
    # Punctuation
    s_clean = re.sub(r'[^\w\s]', ' ', s_clean)
    # remove all single characters
    s_clean = re.sub(r'\s+[a-zA-Z]\s+', ' ', s_clean)
    # Remove single characters from the start
    s_clean = re.sub(r'\^[a-zA-Z]\s+', ' ', s_clean) 
    # Substituting multiple spaces with single space
    s_clean = re.sub(r'\s+', ' ', s_clean, flags=re.I)
    # Removing prefixed 'b'
    s_clean = re.sub(r'^b\s+', '', s_clean)
    #Removing Numbers
    s_clean = ''.join(i for i in s_clean if not i.isdigit())
    #Removing Stopwords
    s_clean = ' '.join([i for i in s_clean.split() if not i in all_stopwords])
    #Removing certain sized words
    s_clean = ' '.join([i for i in s_clean.split() if len(i)>3])
    # Contractions
    s_clean = contractions.fix(s_clean)
    # Converting to Lowercase
    s_clean = s_clean.lower()
    # Lemmatization
    s_clean = s_clean.split()
    s_clean = [stemmer.lemmatize(word) for word in s_clean]
    s_clean = ' '.join(s_clean)
    #Removing my_stop_words
    s_clean = ' '.join([i for i in s_clean.split() if not i in my_stop_words])
    return s_clean

df_filtered['TEXT_cleaned'] = [clean_string(s) for s in df_filtered['TEXT']]
df_filtered['TEXT_cleaned']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0       patient female past sudden onset midback pain ...
1       shortness breath male newly discovered cardiom...
2       mechanical fall hyperlipidemia presenting fall...
3       nausea vomiting patient autoimmune hepatitis l...
4       patient male presented patient laparoscopic ch...
                              ...                        
4903    worst headache life male transferred location ...
4904    transfer macu hemodialysis line replacement po...
4905    sudden onset severe headache spanish speaking ...
4906    known lastname female multiple problem includi...
4907    rectosigmoid colon cancer known firstname fema...
Name: TEXT_cleaned, Length: 48684, dtype: object

### TFIDF Prediction

In [5]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_filtered['TEXT_cleaned'])
y = df_filtered['hospital_expire_flag']

In [6]:
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
tfidf_df.head()

Unnamed: 0,aaainfrarenal,aaax,aabdominal,aabsent,aaccident,aacyclovir,aadls,aado,aads,aafter,aagain,aair,aand,aando,aandox,aaoriented,aaox,aapearing,aaro,aaspirin,aassociated,abacavir,abacivir,abagovomab,abagovomag,abandoned,abandonment,abaout,abasia,abassi,abatacept,abate,abated,abatement,abates,abating,abbd,abbdominal,abberancy,abberency,...,zonisamide,zonisomide,zopenex,zophran,zophrin,zoster,zostrix,zosy,zosyb,zosyn,zosysn,zovirax,zoysn,zoysyn,zozyn,zpack,zpak,zucchini,zxam,zyban,zydis,zydone,zydus,zyflo,zygoesophageal,zygoma,zygomal,zygomatic,zygomatico,zygomaticomaxillary,zygomycetes,zygomycosis,zymar,zyprexa,zyrtec,zysyn,zytec,zytrec,zyvox,zyvoxx
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
#No undersampling:
cv = KFold(n_splits=10, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
feature_importances = pd.DataFrame({'features': tfidf_df.columns, 'importance': rf.feature_importances_})
y_pred_prob = rf.predict_proba(X_test)
y_pred = rf.predict(X_test)
lr_auc = metrics.roc_auc_score(y_test, y_pred_prob[:, 1])
scores = cross_val_score(rf, X_test, y_test, cv=cv, scoring = 'f1_micro')


In [89]:
print('Mortality Prediction: No random undersampling')
print('AUC: ', lr_auc)
print('F1 score:', scores.max())
print('Feature importance:')
print(feature_importances.sort_values(by='importance', ascending=False)[0:10])

Mortality Prediction: No random undersampling
AUC:  0.772340974244477
F1 score: 0.9102739726027397
Feature importance:
           features  importance
26945     intubated    0.007139
53691  unresponsive    0.003213
1621        allergy    0.002896
19470          file    0.002436
20270         found    0.002128
36951       patient    0.002036
52110   transferred    0.001907
36118          pain    0.001895
27862         known    0.001865
11233       corneal    0.001841


In [None]:
#Random undersampling
cv = KFold(n_splits=10, random_state=1)
steps = [('under', RandomUnderSampler()), ('model', RandomForestClassifier())]
pipeline = Pipeline(steps=steps)
pipeline.fit(X_train, y_train)
feature_importances_under = pd.DataFrame({'features': tfidf_df.columns, 'importance': pipeline[1].feature_importances_})
y_pred_prob_under = pipeline.predict_proba(X_test)
y_pred_under = pipeline.predict(X_test)
lr_auc_under = metrics.roc_auc_score(y_test, y_pred_prob_under[:, 1])
scores_under = cross_val_score(pipeline, X_test, y_test, cv=cv, scoring = 'f1_micro')

In [94]:
print('Mortality Prediction - TFIDF: With random undersampling')
print('tfidf df shape: ', tfidf_df.shape)
print('AUC: ', lr_auc_under)
print('F1 score:', scores_under.max())
print('Feature importance:')
print(feature_importances_under.sort_values(by='importance', ascending=False)[0:10])

Mortality Prediction - TFIDF: With random undersampling
tfidf df shape:  (48684, 56591)
AUC:  0.7826861642993256
F1 score: 0.7143835616438357
Feature importance:
           features  importance
26945     intubated    0.007066
53691  unresponsive    0.004185
1621        allergy    0.004088
47081          soft    0.003341
20270         found    0.003305
52110   transferred    0.003250
36118          pain    0.003111
18864       failure    0.002806
27862         known    0.002724
15349          drug    0.002688


In [91]:
scores_under_bal = cross_val_score(pipeline, X_test, y_test, cv=cv, scoring = 'balanced_accuracy')

