In [None]:
import os

import ipywidgets as widgets
import numpy as np
import pandas as pd
import scipy.sparse
from IPython.display import display
from ipywidgets import interact
from matplotlib import pyplot as plt
%load_ext autoreload
%autoreload 2

In [None]:
import os
# Default location of the RDSF base directory. setdefault keeps a PCC_BASE_DIR that is
# already exported in the environment instead of overwriting it with this
# machine-specific drive letter.
# alternatively, you may override the variables in oneinamillion.resources.py
os.environ.setdefault('PCC_BASE_DIR', "Z:/")

In [None]:
# NOTE(review): presumably oneinamillion.resources reads PCC_BASE_DIR from os.environ at
# import time, so this cell must run after the environment variable is set — confirm.
from oneinamillion.resources import PCC_BASE_DIR
# Echo the resolved base directory.
print(f"RDSF base directory located at {PCC_BASE_DIR}")

In [None]:
from oneinamillion.pc_consultation import PCConsultation

# Load the consultation records; later cells treat the result as a pandas DataFrame
# (column access, .apply, .iloc).
parser = PCConsultation()  # the only class needed to obtain all PC consultation data-pairs
orig_dataset = parser.get_pd()

# orig_dataset.head()  # uncomment to inspect the original dataset

# Data preparation

First split `orig_dataset` into train and test sets, then pre-process
the transcript data.

This includes cleaning the text, stemming and lemmatization.

In [None]:
from utils.preprocessing.data import extract_icpc_categories
from utils.transcripts import preprocess_transcripts, read_transcript

# Label column: extract_icpc_categories is applied to each record's 'icpc_codes'.
orig_dataset['codes'] = orig_dataset['icpc_codes'].apply(extract_icpc_categories)
# Clean every raw transcript once; the three views below are all derived from this column.
orig_dataset['transcript__conversation_clean'] = orig_dataset['transcript__conversation'].apply(preprocess_transcripts)
# Concatenated transcript using read_transcript's default speaker flags
# (presumably both GP and patient, going by the column name — confirm in utils.transcripts).
orig_dataset['transcript__conversation_both'] = orig_dataset['transcript__conversation_clean'].apply(
    lambda t: read_transcript(t, return_format='concat'))
# GP side only (show_gp=True, show_patient=False).
orig_dataset['transcript__conversation_gp'] = orig_dataset['transcript__conversation_clean'].apply(
    lambda t: read_transcript(t, show_gp=True, show_patient=False, return_format='concat'))
# Patient side only (show_gp=False, show_patient=True).
orig_dataset['transcript__conversation_patient'] = orig_dataset['transcript__conversation_clean'].apply(
    lambda t: read_transcript(t, show_gp=False, show_patient=True, return_format='concat'))

In [None]:
# Preview the first rows, including the derived 'codes' and transcript columns.
orig_dataset.head()

In [None]:
# Summary of the dataset's columns (dtypes and non-null counts).
orig_dataset.info()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Each record may carry several categories, so encode 'codes' as a multi-hot indicator matrix
# with one column per entry of mult_lbl_enc.classes_.
y = orig_dataset['codes']
mult_lbl_enc = MultiLabelBinarizer()
y_hot = mult_lbl_enc.fit_transform(y)
# Report how many categories were found and what they are.
print(f"{len(mult_lbl_enc.classes_)} classification categories: {mult_lbl_enc.classes_}")

## Train/test split

In [None]:
from skmultilearn.model_selection import iterative_train_test_split

# NOTE(review): iterative_train_test_split exposes no random_state; it is understood to draw
# from NumPy's global RNG, so seed that to make the split reproducible under
# Restart & Run All — confirm against the installed scikit-multilearn.
np.random.seed(0)

# Stratify over row positions (a single column 0..n-1) instead of the records themselves;
# the returned positions are then used to slice the DataFrame.
# len() avoids depending on an 'index' column being present.
X = np.arange(len(orig_dataset)).reshape((-1, 1))

# Return order is (X_train, y_train, X_test, y_test).
X_train, y_train, X_test, y_test = iterative_train_test_split(X, y_hot, test_size=0.2)

# X_train / X_test hold row positions here; a later cell rebinds them to feature matrices.
train_set = orig_dataset.iloc[X_train.flatten()]
test_set = orig_dataset.iloc[X_test.flatten()]
train_set.head()

## ICPC descriptions

In [None]:
from oneinamillion.clinical_codes.icpc import IcpcParser

# Load the ICPC code descriptions as a DataFrame; later cells read its
# 'Code', 'criteria', 'inclusion' and 'preferred' columns.
icpc_parser = IcpcParser()
icpc_df = icpc_parser.get_pd()

In [None]:
from utils.preprocessing.text import utils_preprocess_text

# Category letter = first character of the code, upper-cased.
icpc_df['cat'] = icpc_df['Code'].astype('string').apply(lambda code: code[0].upper())


def clean_col(x):
    """Pre-process one description cell with utils_preprocess_text; missing values pass through unchanged."""
    if pd.isna(x):
        return x
    return utils_preprocess_text(x)


# building keyword collection from three columns of the ICPC-2 descriptions
icpc_df['criteria_prepared'] = icpc_df['criteria'].apply(clean_col)
icpc_df['inclusion_prepared'] = icpc_df['inclusion'].apply(clean_col)
icpc_df['preferred_prepared'] = icpc_df['preferred'].apply(clean_col)
# Missing pieces become empty strings so the three columns can be joined with spaces.
icpc_df['keywords'] = (
    icpc_df[['preferred_prepared', 'criteria_prepared', 'inclusion_prepared']]
    .fillna('')
    .agg(' '.join, axis=1)
)

In [None]:
# One document per category letter: all keyword strings of a category joined with spaces.
# NOTE(review): .iloc[1:-1] drops the first and last groups — presumably categories that
# never occur in the dataset labels; confirm which rows these are.
icpc_description_corpus = icpc_df[['cat', 'keywords']].groupby('cat').agg(' '.join).iloc[1:-1]
# Clear the index name left over from the groupby key.
icpc_description_corpus.index.name = None
#icpc_description_corpus

In [None]:
# Print both category lists one under the other so a mismatch between the dataset labels
# and the description index is visible at a glance.
print(f"dataset categories: {np.array(mult_lbl_enc.classes_).astype('str')}")
print(f"icpc descriptions:  {np.array(icpc_description_corpus.index).astype('str')}")

In [None]:
# Integrate with CKS descriptions
from oneinamillion.clinical_codes.cks import CksParser

# CksParser options (per the original note): from_raw refreshes the cached CKS descriptions,
# headings_to_include selects a different set of sub-sections to include.
cks_parser = CksParser()
cks_description_corpus = cks_parser.get_pd()
# Displayed for inspection; a later cell indexes it by category letter and reads 'cks descriptions'.
cks_description_corpus

In [None]:
modes = ['ICPC only', 'CKS only', 'ICPC and CKS']
# Start from the widget's default choice so that a reader of selected_mode never sees None
# (None would silently fall into the 'ICPC and CKS' branch of the description builder).
selected_mode = modes[0]


def control_description(mode=modes[0]):
    """Widget callback: store the chosen description source in the global `selected_mode`.

    Args:
        mode: one of the entries of `modes`.
    """
    global selected_mode
    selected_mode = mode

# Render a selector over `modes` that feeds the chosen value to control_description.
interact(control_description, mode=modes)

In [None]:
# Build one description string per dataset category from the source(s) chosen in selected_mode.
print(f"Description: {selected_mode}")
icpc_description_dic = {}
for icpc_code in mult_lbl_enc.classes_:
    # Both description corpora are looked up by the upper-cased category letter.
    icpc_code = icpc_code.upper()
    if selected_mode == 'ICPC only':
        icpc_description_dic[icpc_code] = f"{icpc_description_corpus.loc[icpc_code]['keywords']}"
    elif selected_mode == 'CKS only':
        icpc_description_dic[icpc_code] = f"{cks_description_corpus.loc[icpc_code]['cks descriptions']}"
    else:
        # Any other value (including an unset mode) concatenates the ICPC and CKS texts.
        icpc_description_dic[icpc_code] = f"{icpc_description_corpus.loc[icpc_code]['keywords']} {cks_description_corpus.loc[icpc_code]['cks descriptions']}"

# One row per category, in the iteration order of mult_lbl_enc.classes_;
# icpc_corpus is the Series of description strings.
icpc_corpus_df = pd.DataFrame.from_dict(icpc_description_dic, orient='index', columns=['keyword'])
icpc_corpus = icpc_corpus_df['keyword']
icpc_corpus

## Bag of words classifiers

**Tf-idf from ICPC codes**

- extract keywords/keyphrases from ICPC code descriptions
  - use three columns (inclusion, preferred and criteria)

- OR, with TF-IDF, assign a score to every word (or bigram) in the utterance and
keep only those with high scores

- For each utterance in a transcript, count the number of keyword or phrase matches for each ICPC code.
- If number of matches > threshold, assign ICPC code

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from utils.stopwords import get_medical_stopwords
# Create Bag-of-Words vector from ICPC code descriptions
# count_vec = CountVectorizer(binary=True, ngram_range=(1, 2)) # tokenize word/ bi-gram

# The vectorizer is fitted on the category descriptions only (unigrams and bigrams),
# so its vocabulary consists of description tokens; transcripts are projected onto it later.
medical_stopwords = get_medical_stopwords()
text_vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words=medical_stopwords)
# One row per entry of icpc_corpus, one column per vocabulary token.
description_vec = text_vectorizer.fit_transform(icpc_corpus)
print(f"icpc description bag-of-word matrix shape: {description_vec.shape}")
vec_vocab = text_vectorizer.vocabulary_ # dictionary mapping each BOW token to its column index

# Show the first five tokens as a sample of the vocabulary.
print(f"bag-of-word tokens: {', '.join(list(vec_vocab.keys())[:5])}...")

In [None]:
import seaborn as sns
# Sparsity map of the description matrix: the plotted boolean matrix is True where the weight is zero.
sns.heatmap(description_vec.todense()==0, vmin=0, vmax=1, cbar=False).set_title('bag-of-words Vector')

In [None]:
# from wordcloud import WordCloud
# test_vec = description_vec[1].todense()
# col_names = np.array(text_vectorizer.get_feature_names())
# test = pd.DataFrame(test_vec, columns=col_names).T.to_dict()[0]
# print(test)
# word_cloud = WordCloud(background_color="white").generate_from_frequencies(test)
# plt.imshow(word_cloud)

In [None]:
from wordcloud import WordCloud

# Explain a category

# Vocabulary tokens ordered by column index, i.e. lookup[j] is the token of column j.
lookup = sorted(text_vectorizer.vocabulary_, key=text_vectorizer.vocabulary_.get)
# Same content as the vectorizer's feature-name list, but built from the index-ordered
# vocabulary so it does not depend on get_feature_names(), which scikit-learn >= 1.2 no longer has.
features_arr = np.array(lookup)

def explain_bow_vector(vec: scipy.sparse.csr_matrix, ax=plt):
    """Draw a word cloud of one bag-of-words row, sized by token weight.

    Args:
        vec: a single-row sparse matrix whose columns line up with `features_arr`.
        ax: object providing `imshow` and `axis` — a matplotlib Axes, or the
            pyplot module itself (the default) to draw on the current figure.
    """
    # {token: weight} for the single row; row label 0 is the only column after transposing.
    frequencies = pd.DataFrame(vec.todense(), columns=features_arr).T.to_dict()[0]
    word_cloud = WordCloud(background_color="white").generate_from_frequencies(frequencies)
    ax.imshow(word_cloud, interpolation='bilinear')
    ax.axis("off")

def explain_category(cat:str, ax=plt):
    """Draw the word cloud of the description vector that belongs to category `cat`.

    Raises ValueError (from list.index) if `cat` is not in mult_lbl_enc.classes_.
    """
    row = list(mult_lbl_enc.classes_).index(cat)
    explain_bow_vector(description_vec[row], ax=ax)

def plot_explain_category(cat:str):
    """Single-argument wrapper around explain_category for use as an interact callback."""
    explain_category(cat, ax=plt)

# Selector over all dataset categories.
interact(plot_explain_category, cat=mult_lbl_enc.classes_)

In [None]:
# Which transcript view to vectorize; swap in one of the commented alternatives to compare.
key = 'transcript__conversation_both'
# key = 'transcript__conversation_gp'
# key = 'transcript__conversation_patient'

# Project the transcripts onto the description vocabulary (transform only, no refit).
# Note: this rebinds X_train / X_test from row positions to feature matrices.
X_train = text_vectorizer.transform(train_set[key])
X_test = text_vectorizer.transform(test_set[key])

In [None]:
# import seaborn as sns
# Sparsity map of the training features: the plotted boolean matrix is True where the weight is zero.
sns.heatmap(X_train.todense()==0, vmin=0, vmax=1, cbar=False).set_title('Train set bag-of-words matrix')

In [None]:
# Column sums of the training feature matrix: total weight of each vocabulary token.
keyword_dist_over_train = np.array(X_train.sum(axis=0)).flatten()

def show_common_keywords(threshold:int = 10):
    """Bar-plot the vocabulary tokens whose total weight over the training set exceeds `threshold`.

    Also prints the plotted tokens in the order they appear on the x axis.

    Args:
        threshold: minimum (exclusive) column sum for a token to be shown.
    """
    frequent_words = [[k,n] for k,n in zip(lookup, keyword_dist_over_train) if n > threshold]
    frequent_words_df = pd.DataFrame(frequent_words,columns=['keyword', 'count']).sort_values('count')

    plt.figure(figsize=(8,4))
    ax = sns.barplot(x='keyword', y='count', data=frequent_words_df)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
    plt.tight_layout()
    plt.show()

    # Use the public Text.get_text() accessor rather than the private _text attribute.
    print([k.get_text() for k in ax.get_xticklabels()])

show_common_keywords()

### Nearest Centroid classifier

In [None]:
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier

# Both classifiers are fitted on the category descriptions, not on transcripts:
# description_vec has one row per category and the labels are the category names themselves.
kn_clf = KNeighborsClassifier(n_neighbors=3, weights='distance', metric='cosine')
kn_clf.fit(description_vec, mult_lbl_enc.classes_)

# NOTE(review): recent scikit-learn releases restrict NearestCentroid's metric to
# 'euclidean'/'manhattan' — confirm the pinned version before upgrading.
nc_clf = NearestCentroid(metric='cosine')
nc_clf.fit(description_vec, mult_lbl_enc.classes_)

In [None]:
# Decode the first training row back into its category labels.
mult_lbl_enc.inverse_transform(y_train[0:1])

In [None]:
# Class probabilities from the k-NN classifier for the first ten training transcripts.
kn_clf.predict_proba(X_train[0:10])

In [None]:
# Predictions on the training transcripts (the classifiers were fitted on descriptions only).
# Each prediction is a single category label per transcript.
kn_y_pred = kn_clf.predict(X_train)
# y_pred (nearest centroid) is what the analysis cells below evaluate.
y_pred = nc_clf.predict(X_train)

In [None]:
def get_idxes_with_cat(_y_train: np.ndarray, code:str):
    """Return the row positions of `_y_train` whose decoded label set contains `code`."""
    decoded = mult_lbl_enc.inverse_transform(_y_train)
    positions = []
    for pos, labels in enumerate(decoded):
        if code in labels:
            positions.append(pos)
    return positions

def get_truth_pred_pairs(_y_train, _y_pred, indices):
    """Tabulate true labels against predictions for the selected rows.

    Returns a DataFrame with columns 'id' (row position), 'truth' (decoded
    label set) and 'predicted'.
    """
    decoded_truth = mult_lbl_enc.inverse_transform(_y_train[indices])
    selected_pred = _y_pred[indices]
    rows = [row for row in zip(indices, decoded_truth, selected_pred)]
    table = pd.DataFrame(rows, columns=['id', 'truth', 'predicted'])
    return table

def show_truth_pred_tbl(code):
    """Truth-vs-prediction table for every training record labelled with `code`."""
    return get_truth_pred_pairs(y_train, y_pred, get_idxes_with_cat(y_train, code))

# Selector over all dataset categories.
interact(show_truth_pred_tbl, code=mult_lbl_enc.classes_)

Seems like the baseline is not performing well on predicting classes for:

A, F, N, S, T, W, X, Y

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Two linked selectors: a category, and the training rows that carry that category.
cat_dropdown = widgets.Dropdown(options=mult_lbl_enc.classes_)
id_dropdown = widgets.Dropdown(options=get_idxes_with_cat(y_train, cat_dropdown.value))

def refresh_id_dropdown(*args):
    """Observer callback: repopulate the id selector for the currently selected category."""
    id_dropdown.options = get_idxes_with_cat(y_train, cat_dropdown.value)

# Re-run the refresh whenever the selected category changes.
cat_dropdown.observe(refresh_id_dropdown, 'value')

def show_cosine_similarity(_id):
    """Heatmap of the cosine similarity between training row `_id` and every category description."""
    scores = cosine_similarity(X_train[_id], description_vec)[0]
    # Single-row table with one column per category.
    table = (
        pd.DataFrame(list(zip(mult_lbl_enc.classes_, scores)), columns=['category', 'cos sim'])
        .set_index('category')
        .T
    )
    # Top axes for the heatmap, thin bottom axes for the horizontal colour bar.
    f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw={"height_ratios": (.9, .05), "hspace": -0.5})
    sns.heatmap(
        table,
        square=True,
        ax=ax,
        cbar_ax=cbar_ax,
        cbar_kws={"orientation": "horizontal"},
        cmap='Greens',
    )

def show_keywords_true_pred(_id):
    """Side-by-side word clouds: training row `_id` (left) and its predicted category (right)."""
    fig, axes = plt.subplots(1, 2, figsize=(10,5))
    sample_ax, predicted_ax = axes

    sample_ax.title.set_text('BOW from sample transcipt')
    explain_bow_vector(X_train[_id], ax=sample_ax)

    predicted_ax.title.set_text('BOW of predicted class')
    explain_category(y_pred[_id], ax=predicted_ax)

def view_record(_cat, _id):
    """Show one training record: ids, true vs predicted labels, word clouds and similarity heatmap.

    Args:
        _cat: selected category; unused here, it only exists so the category
            selector is part of the interact call.
        _id: row position in the training set, or None when the id selector is empty.
    """
    if _id is None:
        # No training record carries the selected category; nothing to show.
        return
    record = train_set.iloc[_id]
    info = [f"Index: {_id}",
           f"Id: {record['record_id']}",
           f"Actual: {mult_lbl_enc.inverse_transform(y_train[_id:_id+1])}",
           f"Predicted: {y_pred[_id]}"]
    display(*info)
    show_keywords_true_pred(_id)
    show_cosine_similarity(_id)

interact(view_record, _cat=cat_dropdown, _id=id_dropdown)

In [None]:
# Plain ndarrays: np.matrix is discouraged by NumPy and rejected by newer scikit-learn metrics.
y_train_mat = np.asarray(y_train)
# Wrap each predicted label in a list so the binarizer sees one label per sample
# instead of iterating over the characters of the label string.
y_pred_mat = np.asarray(mult_lbl_enc.transform([[label] for label in y_pred]))


In [None]:
# Note: print statements do not work in PyCharm

from sklearn.metrics import accuracy_score, classification_report, f1_score, multilabel_confusion_matrix, precision_recall_fscore_support, precision_score, recall_score, roc_auc_score
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # Not available in newer scikit-learn; nothing in this cell uses it, so do not fail the cell.
    plot_confusion_matrix = None

# Per-category precision / recall / F1 on the training set.
print(f"classification_report:\n{classification_report(y_train_mat, y_pred_mat, target_names=mult_lbl_enc.classes_)}")

print("multilabel_confusion_matrix:")
# One 2x2 confusion matrix per category.
conf_mat = multilabel_confusion_matrix(y_train_mat, y_pred_mat)

for cls, mat in zip(mult_lbl_enc.classes_, conf_mat):
    fig = plt.figure(figsize=(1,1))
    ax = fig.add_subplot()
    ax.set_title(cls)
    sns.heatmap(mat, ax=ax, cmap='Blues', annot=True, fmt="d")
    ax.set_xlabel('pred')
    ax.set_ylabel('true')
    # plt.show() renders the figure through the active backend; Figure.show() only
    # warns on non-GUI (inline) notebook backends.
    plt.show()

For how the `average` parameter combines the per-class metrics into a single score, please see the [scikit-learn documentation](https://scikit-learn.org/stable/modules/model_evaluation.html#from-binary-to-multiclass-and-multilabel).

In [None]:

def show_result(average='weighted'):
    """Print summary metrics for the training-set predictions.

    Args:
        average: averaging strategy passed to f1_score,
            precision_recall_fscore_support and roc_auc_score.
    """
    # measures the subset accuracy (only considered as accurate if the whole set matches)
    subset_accuracy = accuracy_score(y_train_mat, y_pred_mat)
    print(f"accuracy score: {subset_accuracy}")

    f1 = f1_score(y_train_mat, y_pred_mat, average=average)
    print(f"f1_score: {f1}")

    prf_support = precision_recall_fscore_support(y_train_mat, y_pred_mat, average=average)
    print(f"precision_recall_fscore_support:\n{prf_support}")

    roc_auc = roc_auc_score(y_train_mat, y_pred_mat, average=average)
    print(f"roc_auc_score: {roc_auc}")

# Selector over the supported averaging strategies.
interact(show_result, average=['macro', 'weighted', 'micro', 'samples'])