In [31]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

import gensim
import gensim.corpora as corpora
import gensim.models

import os
from gensim.models.wrappers import LdaMallet
os.environ.update({'MALLET_HOME':r'/mallet-2.0.8'})
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
from nltk.tokenize import word_tokenize

from pprint import pprint
import numpy as np
import pandas as pd



In [8]:
df = pd.read_csv('MP_RESULTS_text_cleaned.csv')
df = df.fillna('')

In [9]:
tokenized_words = [word_tokenize(i) for i in df['TEXT_cleaned']]

### Topic Modeling

In [10]:
# Create Dictionary
id2word = corpora.Dictionary(tokenized_words) #use entire corpus
id2word.filter_extremes(no_below=20, no_above=0.5)

# Create Corpus
texts = tokenized_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [11]:
print(id2word)

Dictionary(7715 unique tokens: ['aneurysm', 'centimeter', 'confirming', 'country', 'department']...)


In [12]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        print('Calculating {}-topic model'.format(num_topics))
        model = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [13]:
#limit=20; start=2; step=1;
#model_list, coherence_values = compute_coherence_values(dictionary=id2word,
#                                                        corpus=corpus,
 #                                                       texts=texts,
  #                                                      start=start,
   #                                                     limit=limit,
    #                                                    step=step)

In [14]:
# show graph
#x = range(start, limit, step)
#plt.figure(figsize=(15, 10))
#plt.plot(x, coherence_values)
#plt.xlabel("Num Topics")
#plt.ylabel("Coherence score")
#plt.show()

In [15]:
# Print the coherence scores
#for m, cv in zip(x, coherence_values):
#    print("Num Topics =", m, " has Coherence Value of", round(cv, 6))

In [16]:
# Building LDA Model = LDA Model removing my_stop_words & featuring 30 topics
optimal_topics = 9
lda_model_optimal = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=optimal_topics, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [17]:
pprint(lda_model_optimal.print_topics(num_topics=optimal_topics, num_words=15))
doc_lda = lda_model_optimal[corpus]

[(0,
  '0.027*"right" + 0.024*"left" + 0.013*"mass" + 0.010*"last" + 0.009*"lower" '
  '+ 0.008*"surgery" + 0.008*"lung" + 0.008*"extremity" + 0.008*"underwent" + '
  '0.007*"edema" + 0.007*"known" + 0.007*"site" + 0.007*"wound" + '
  '0.006*"cancer" + 0.006*"scan"'),
 (1,
  '0.032*"left" + 0.031*"artery" + 0.026*"disease" + 0.025*"coronary" + '
  '0.023*"cardiac" + 0.019*"aortic" + 0.019*"catheterization" + '
  '0.018*"stenosis" + 0.016*"chest" + 0.015*"right" + 0.014*"showed" + '
  '0.013*"revealed" + 0.012*"referred" + 0.012*"male" + 0.012*"severe"'),
 (2,
  '0.035*"blood" + 0.019*"unit" + 0.016*"emergency" + 0.015*"abdominal" + '
  '0.013*"pressure" + 0.013*"transferred" + 0.011*"prior" + 0.010*"nausea" + '
  '0.009*"vomiting" + 0.009*"medical" + 0.009*"department" + 0.009*"episode" + '
  '0.008*"room" + 0.008*"bleeding" + 0.008*"stool"'),
 (3,
  '0.019*"denies" + 0.015*"clear" + 0.015*"neck" + 0.014*"edema" + '
  '0.012*"lung" + 0.011*"bilaterally" + 0.011*"supple" + 0.010*"normal

In [18]:
# Compute Perplexity
print('\nPerplexity: ', lda_model_optimal.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model_optimal, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.228452411799298

Coherence Score:  0.4878073507244822


In [19]:
vector = lda_model_optimal[corpus]  # get topic probability distribution for a document

In [20]:
def Extract(vector):
    return [item[0] for item in vector]

In [21]:
vector_prob = Extract(vector)

In [22]:
vector_prob_df = pd.DataFrame(vector_prob, columns=list(range(0, optimal_topics)))
vector_prob_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,"(0, 0.016113982)","(1, 0.17208478)","(2, 0.14316958)","(3, 0.029177472)","(4, 0.010300385)","(5, 0.37597695)","(6, 0.011933415)","(8, 0.23534687)",
1,"(0, 0.032641087)","(1, 0.66343254)","(2, 0.010727578)","(3, 0.10086044)","(4, 0.085983776)","(5, 0.010597374)","(6, 0.0771762)","(8, 0.014075702)",
2,"(0, 0.11793959)","(2, 0.036749266)","(3, 0.070198)","(4, 0.01389171)","(5, 0.0513266)","(7, 0.6851212)","(8, 0.013545754)",,
3,"(0, 0.1689838)","(2, 0.07162671)","(3, 0.4450032)","(4, 0.01588168)","(5, 0.051688373)","(6, 0.23148713)","(8, 0.010200584)",,
4,"(0, 0.17378408)","(1, 0.05888809)","(2, 0.290153)","(3, 0.015139234)","(6, 0.03818894)","(8, 0.40387017)",,,


In [23]:
def clean_df(df):
    #Create empty dataframe copy
    df2 = pd.DataFrame(np.zeros(df.shape), columns = df.columns)
    nrows = df.shape[0]
    ncolumns = df.shape[1]
    #Populate empty dataframe
    for i in range(nrows):
        #Create key:value pair of each row
        rowdict = dict(zip([x[0] for x in df.iloc[i] if x != None], [x[1] for x in df.iloc[i] if x != None]))
        #Populate row based off of 
        for j in range(ncolumns):
            if j in rowdict.keys():
                df2.iloc[i][j] = rowdict[j]
            else:
                df2.iloc[i][j] = None
    return df2

In [24]:
vector_prob_df_align = clean_df(vector_prob_df)
vector_prob_df_align.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.016114,0.172085,0.14317,0.029177,0.0103,0.375977,0.011933,,0.235347
1,0.032641,0.663433,0.010728,0.10086,0.085984,0.010597,0.077176,,0.014076
2,0.11794,,0.036749,0.070198,0.013892,0.051327,,0.685121,0.013546
3,0.168984,,0.071627,0.445003,0.015882,0.051688,0.231487,,0.010201
4,0.173784,0.058888,0.290153,0.015139,,,0.038189,,0.40387


In [25]:
topics = vector_prob_df_align
labels = df['hospital_expire_flag']

In [26]:
topic_modeling_results = pd.concat([labels, topics], axis=1)
topic_modeling_results = topic_modeling_results.fillna(0)

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [28]:
X = topic_modeling_results.drop(columns=['hospital_expire_flag'])
y = topic_modeling_results['hospital_expire_flag']

In [34]:
topic_modeling_results.to_csv('Mortality_Topic_Modeling_9_topics.csv')

In [29]:
steps = [('under', RandomUnderSampler()), ('model', RandomForestClassifier())]
pipeline = Pipeline(steps=steps)
rf = RandomForestClassifier()
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, cv=cv, n_jobs=-1)
score = scores.mean()
print('F1 Score: %.3f' % score)

NameError: name 'RepeatedStratifiedKFold' is not defined

In [None]:
#No undersampling:
cv = KFold(n_splits=10)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
feature_importances = pd.DataFrame({'features': X.columns, 'importance': rf.feature_importances_})
y_pred_prob = rf.predict_proba(X_test)
y_pred = rf.predict(X_test)
lr_auc = metrics.roc_auc_score(y_test, y_pred_prob[:, 1])
scores = cross_val_score(rf, X_test, y_test, cv=cv, scoring = 'f1_micro')

In [None]:
print('Mortality Prediction, Topic modeling: No random undersampling')
print('AUC: ', lr_auc)
print('Bal. accuracy score:', scores.max())
print('Feature importance:')
print(feature_importances.sort_values(by='importance', ascending=False)[0:10])

In [None]:
#Random undersampling
cv = KFold(n_splits=10)
steps = [('under', RandomUnderSampler()), ('model', RandomForestClassifier())]
pipeline = Pipeline(steps=steps)
pipeline.fit(X_train, y_train)
feature_importances_under = pd.DataFrame({'features': X.columns, 'importance': pipeline[1].feature_importances_})
y_pred_prob_under = pipeline.predict_proba(X_test)
y_pred_under = pipeline.predict(X_test)
lr_auc_under = metrics.roc_auc_score(y_test, y_pred_prob_under[:, 1])
scores_under = cross_val_score(pipeline, X_test, y_test, cv=cv, scoring = 'f1_micro')

In [30]:
print('Mortality Prediction - Topic modeling: With random undersampling')
print('Topic model DF: ', topic_modeling_results.shape)
print('AUC: ', lr_auc_under)
print('F1 score:', scores_under.max())
print('Feature importance:')
print(feature_importances_under.sort_values(by='importance', ascending=False)[0:20])

Mortality Prediction - Topic modeling: With random undersampling
Topic model DF:  (48684, 10)


NameError: name 'lr_auc_under' is not defined