In [1]:
from typing import Union, List

import numpy as np
import pandas as pd
import scipy.sparse
%load_ext autoreload
%autoreload 2

In [2]:
from oneinamillion.pc_consultation import PCConsultation

parser = PCConsultation()  # the only class needed to obtain all PC consultation data-pairs
orig_dataset = parser.get_pd()

orig_dataset.head()  # inspect the original dataset

The current IDs only have record documents.
['011119' '020105' '030311' '030501' '030509' '030510' '050609' '050717'
 '050719' '071010' '071201' '071202' '071203' '071204' '071205' '071206'
 '071207' '071208' '071209' '071210' '071211' '071212' '071213' '071214'
 '081310' '081608' '081610' '091410' '091411' '091416' '101705' '101707'
 '111906' '111908' '112004' '122110']
The current IDs only have transcript documents.
['-081308' '010103' '010105' '020107' '030501(p.2)' '030501(p1)'
 '030509(p.1)' '030509(p.2)' '030510 (p.1)' '030510 (p.2)' '040417'
 '050605' '050609a' '050717 (Dr reads wrong number)' '060806' '060811'
 '060812' '060906' '071002' '071010(a)' '071012' '081310(b)' '081310(c)'
 '081310a' '081601' '081606' '081608 (1 of 2)' '081608 (2 of 2)' '091402'
 '091404' '091405' '091410 & 091411' '091503' '101705 and 101707' '101801'
 '111906 (2of2)' '111906(1 of 2)' '112002' '112004 & 1112005' '112014'
 '200105' '50719']


Unnamed: 0,index,record_id,icpc_codes,pt_records,transcript__start_date,transcript__duration,transcript__conversation
0,0,10112,"['K85', 'P76']","[{'date': datetime.datetime(2014, 8, 13, 0, 0)...",2014-06-16 10:33:28,0:16:17,"[('GP', 'How are you sir?'), ('Patient', ""I'm ..."
1,1,50709,"['R04', 'U04']","[{'date': datetime.datetime(2015, 1, 26, 0, 0)...",2014-11-05 19:40:49,0:13:20,"[('info', 'Oh, can I 0:00:00'), ('GP', ""___. Y..."
2,2,20208,['H82'],"[{'date': datetime.datetime(2014, 11, 3, 0, 0)...",2014-09-08 10:43:09,0:11:33,"[('Doc', 'Morning NAME.'), ('Pat', 'Hi, how yo..."
3,3,50601,"['K85', 'P15']","[{'date': datetime.datetime(2014, 12, 5, 0, 0)...",2014-11-05 09:27:26,0:19:21,"[('GP', ""Thank you very much for this. Right, ..."
4,4,111915,"['D95', 'P74']","[{'date': datetime.datetime(2015, 5, 29, 0, 0)...",2015-04-28 12:17:47,0:19:41,"[('DOC', 'Thank you, what number are you on? S..."


# Data preparation

First split the orig_dataset into train and test set, then we need to
pre-process the transcript data

This includes, cleaning text, stemming and lemmatization

In [3]:
from utils.preprocessing.data import extract_icpc_categories
from utils.transcripts import preprocess_transcripts

orig_dataset['codes'] = orig_dataset['icpc_codes'].apply(extract_icpc_categories)
orig_dataset['transcript__conversation_clean'] = orig_dataset['transcript__conversation'].apply(preprocess_transcripts)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Vico\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
orig_dataset.head()

Unnamed: 0,index,record_id,icpc_codes,pt_records,transcript__start_date,transcript__duration,transcript__conversation,codes,transcript__conversation_clean
0,0,10112,"['K85', 'P76']","[{'date': datetime.datetime(2014, 8, 13, 0, 0)...",2014-06-16 10:33:28,0:16:17,"[('GP', 'How are you sir?'), ('Patient', ""I'm ...","[K, P]","[[GP, How sir], [Patient, Im bad moment actual..."
1,1,50709,"['R04', 'U04']","[{'date': datetime.datetime(2015, 1, 26, 0, 0)...",2014-11-05 19:40:49,0:13:20,"[('info', 'Oh, can I 0:00:00'), ('GP', ""___. Y...","[R, U]","[[info, Oh I 00000], [GP, Youre 050709 So I gi..."
2,2,20208,['H82'],"[{'date': datetime.datetime(2014, 11, 3, 0, 0)...",2014-09-08 10:43:09,0:11:33,"[('Doc', 'Morning NAME.'), ('Pat', 'Hi, how yo...",[H],"[[Doc, Morning NAME], [Pat, Hi], [Doc, How], [..."
3,3,50601,"['K85', 'P15']","[{'date': datetime.datetime(2014, 12, 5, 0, 0)...",2014-11-05 09:27:26,0:19:21,"[('GP', ""Thank you very much for this. Right, ...","[K, P]","[[GP, Thank much Right youre 050601 according]..."
4,4,111915,"['D95', 'P74']","[{'date': datetime.datetime(2015, 5, 29, 0, 0)...",2015-04-28 12:17:47,0:19:41,"[('DOC', 'Thank you, what number are you on? S...","[D, P]","[[DOC, Thank number Sixteen Hi I Doctor Name I..."


In [5]:
orig_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 9 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   index                           241 non-null    int64 
 1   record_id                       241 non-null    int64 
 2   icpc_codes                      241 non-null    object
 3   pt_records                      241 non-null    object
 4   transcript__start_date          241 non-null    object
 5   transcript__duration            241 non-null    object
 6   transcript__conversation        241 non-null    object
 7   codes                           241 non-null    object
 8   transcript__conversation_clean  241 non-null    object
dtypes: int64(2), object(7)
memory usage: 17.1+ KB


In [6]:
from sklearn.preprocessing import MultiLabelBinarizer

y = orig_dataset['codes']
mult_lbl_enc = MultiLabelBinarizer()
y_hot = mult_lbl_enc.fit_transform(y)
print(f"{len(mult_lbl_enc.classes_)} classification categories: {mult_lbl_enc.classes_}")

16 classification categories: ['A' 'B' 'D' 'F' 'H' 'K' 'L' 'N' 'P' 'R' 'S' 'T' 'U' 'W' 'X' 'Y']


In [7]:
from skmultilearn.model_selection import iterative_train_test_split

X = np.arange(orig_dataset['index'].shape[0]).reshape((-1, 1))

X_train, y_train, X_test, y_test = iterative_train_test_split(X, y_hot, test_size=0.2)

train_set = orig_dataset.iloc[X_train.flatten()]
test_set = orig_dataset.iloc[X_test.flatten()]

train_set.head()

Unnamed: 0,index,record_id,icpc_codes,pt_records,transcript__start_date,transcript__duration,transcript__conversation,codes,transcript__conversation_clean
0,0,10112,"['K85', 'P76']","[{'date': datetime.datetime(2014, 8, 13, 0, 0)...",2014-06-16 10:33:28,0:16:17,"[('GP', 'How are you sir?'), ('Patient', ""I'm ...","[K, P]","[[GP, How sir], [Patient, Im bad moment actual..."
2,2,20208,['H82'],"[{'date': datetime.datetime(2014, 11, 3, 0, 0)...",2014-09-08 10:43:09,0:11:33,"[('Doc', 'Morning NAME.'), ('Pat', 'Hi, how yo...",[H],"[[Doc, Morning NAME], [Pat, Hi], [Doc, How], [..."
5,5,101708,"['K84', 'Y08']","[{'date': datetime.datetime(2015, 4, 24, 0, 0)...",2015-02-24 12:39:49,0:18:49,"[('PAT', 'He was supposed to come in and take ...","[K, Y]","[[PAT, He supposed come take], [info, 00001], ..."
6,6,20212,"['L10', 'R05']","[{'date': datetime.datetime(2014, 9, 17, 0, 0)...",2014-09-08 14:24:39,0:07:58,"[('Doc', ""...you have symptoms of acid reflux,...","[L, R]","[[Doc, symptom acid reflux four week trial ant..."
8,8,111911,"['B80', 'D18']","[{'date': datetime.datetime(2015, 7, 21, 0, 0)...",2015-04-28 10:48:20,0:11:01,"[('DOC', 'Hi, I am Doctor Name thanks for wait...","[B, D]","[[DOC, Hi I Doctor Name thanks waiting How I h..."


In [8]:
# from matplotlib import pyplot as plt
#
# fig, (ax1, ax2) = plt.subplots(1,2)
# fig.suptitle('Number of records')
# ax1.set_title('train set')
# ax2.set_title('test set')
#
#
# train_set['codes'].value_counts().sort_index().plot(kind='bar', ax=ax1)
# test_set['codes'].value_counts().sort_index().plot(kind='bar', ax=ax2)

In [9]:
from oneinamillion.clinical_codes.icpc import IcpcParser

icpc_parser = IcpcParser()
icpc_df = icpc_parser.get_pd()

In [10]:
from utils.preprocessing.text import utils_preprocess_text

icpc_df['cat'] = icpc_df['Code'].astype('string').apply(lambda x: x[0].lower())

clean_col = lambda x: utils_preprocess_text(x) if not pd.isna(x) else x
icpc_df['criteria_prepared'] = icpc_df['criteria'].apply(clean_col)
icpc_df['inclusion_prepared'] = icpc_df['inclusion'].apply(clean_col)
icpc_df['preferred_prepared'] = icpc_df['preferred'].apply(clean_col)
icpc_df['keywords'] = icpc_df[['preferred_prepared', 'criteria_prepared', 'inclusion_prepared']].fillna('').agg(
    ' '.join, axis=1)

icpc_corpus = icpc_df[['cat', 'keywords']].groupby('cat').agg(' '.join).iloc[1:, 0]

icpc_corpus = icpc_corpus.iloc[0:-1]

In [11]:
print(f"dataset categories: {np.array([x for x in icpc_corpus.index if x.upper() in mult_lbl_enc.classes_]).astype('str')}")
print(f"icpc descriptions:  {np.array(icpc_corpus.index).astype('str')}")


dataset categories: ['a' 'b' 'd' 'f' 'h' 'k' 'l' 'n' 'p' 'r' 's' 't' 'u' 'w' 'x' 'y']
icpc descriptions:  ['a' 'b' 'd' 'f' 'h' 'k' 'l' 'n' 'p' 'r' 's' 't' 'u' 'w' 'x' 'y']


## Baseline - Tf-idf from ICPC codes

- extract keyword/ keyphrases from ICPC code descriptions
 - use three columns (inclusion/ preferred and criteria)

- OR, with TF-iDF, assign a score to every word (or bigram) in the utterance,
filter those with only high scores

- For each utterance in a transcript, count the number of keyword or phrase matches for each ICPC code.
- If number of matches > threshold, assign ICPC code

First, a CountVectoriser is used to tokenize the ICPC code descriptions, and to create bag-of-word vectors
for every ICPC categories. The vector indicates the presence of word/ bi-gram in each ICPC code description

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Create Bag-of-Words vector from ICPC code descriptions
count_vec = CountVectorizer(binary=True, ngram_range=(1, 2)) # tokenize word/ bi-gram
icpc_count_vec = count_vec.fit_transform(icpc_corpus)
vec_vocab = count_vec.vocabulary_ # dictionary that contain the BOW tokens

print(f"bag-of-word tokens: {', '.join(list(vec_vocab.keys())[:5])}...")

bag-of-word tokens: pain, general, multiple, site, chronic...


This KDTree stores N-dimensional bag-of-words vector for all ICPC categories, and allow quick closest neighbour lookup
using any bag-of-words vector from transcripts.

In [13]:
import scipy
from scipy.spatial import KDTree

lookup_tree = KDTree(icpc_count_vec.todense())

In [14]:
from skmultilearn.base import MLClassifierBase
from sklearn.base import BaseEstimator

class KDTreeMLClassifier(BaseEstimator):
    def __init__(self, feature_vec=None, k=5, distance_upper_bound=50, p_threshold=0.2):
        self._tree = KDTree(feature_vec)
        self.k = k
        self.distance_upper_bound = distance_upper_bound
        self.p_threshold = p_threshold
        super().__init__()

    def fit(self, X, y):
        return self
        pass

    def _bow_vec_2_cat(self, vec):
        """
        :param vec: bag-of-word vector from a transcript
        :return: probability vector of length class
        """
        ds, cs = lookup_tree.query(vec, k=5, distance_upper_bound=self.distance_upper_bound)
        ds = np.array([d for (d,c) in zip(ds, cs) if not np.isinf(d)])
        cs = np.array([c for (d,c) in zip(ds, cs) if not np.isinf(d)])

        ds = (self.distance_upper_bound/ds)  # set distance to class as probability of being that class
        ds = ds/ds.sum()  # normalize to sum up to 1
        # create an output vector of class probabilities
        x = np.zeros(len(self._tree.data))
        for d,c in zip(ds, cs):
            x[c] = d
        x = np.where(x >= self.p_threshold, 1, 0)
        return x

    def predict(self, X):
        """
        Make class predictions for n samples give (n,m) matrix
        :param X: bag-of-word matrix (n,m) for n samples and m features
        :return:
        """
        return np.apply_along_axis(self._bow_vec_2_cat, axis=1, arr=X)

In [15]:
from functools import reduce
from utils.transcripts import apply_to_transcript

def logic_or_seq(xs: Union[List[int], List[bool]]) -> Union[int, bool]:
    """Return the reduced OR value of a sequence of boolean values
    [1,0] -> 1
    [True, False] -> True
    """
    res = reduce((lambda a, b: a or b), xs, False)
    return res

In [16]:
def transcript_to_bow_vec(_dialogue, _vectorizer):
    """Convert transcript to bag-of-word vector"""
    [_, transcript_mat] = apply_to_transcript(_dialogue, fn_utterance=_vectorizer.transform, merge=True)
    return np.apply_along_axis(logic_or_seq, 0, np.array(transcript_mat.todense()))

In [17]:
# # Get bag-of-words count vector on a single transcript
#
# idx = 7  # idx of train set
#
# dialogue = train_set.iloc[idx]['transcript__conversation_clean']
#
# transcript_vec = transcript_to_bow_vec(dialogue, count_vec)
# print(transcript_vec)
# print(f"shape: {transcript_vec.shape}")
# print(f"contain 1s?: {not np.all(transcript_vec == 0)}")

In [18]:
# Now apply the bag-of-word transformation to all samples in train set
transcript_bow_vecs = train_set['transcript__conversation_clean'].apply(lambda x: transcript_to_bow_vec(x, count_vec))
X_train = np.stack(transcript_bow_vecs)  # np 2d array for bag-of-word vectors for all train samples

In [19]:
# def bow_vec_2_cat(vec):
#     distance_upper_bound = 50
#     ds, cs = lookup_tree.query(vec, k=5, distance_upper_bound=distance_upper_bound)
#     ds = np.array([d for (d,c) in zip(ds, cs) if not np.isinf(d)])
#     cs = np.array([c for (d,c) in zip(ds, cs) if not np.isinf(d)])
#
#     ds = (distance_upper_bound/ds)
#     ds = ds/ds.sum()
#
#
#     x = np.zeros(17)
#     for d,c in zip(ds, cs):
#         x[c] = d
#
#     return x
#
# xx = bow_vec_2_cat(transcript_bow_vecs.iloc[idx])
# print(f"predictions: {xx}")

In [20]:
mlc = KDTreeMLClassifier(feature_vec=icpc_count_vec.todense())

In [21]:
y_pred = mlc.predict(X_train)

In [22]:
print(f"y_train: {y_train.shape}")
print(f"y_pred:  {y_pred.shape}")

y_train: (187, 16)
y_pred:  (187, 16)


In [23]:
# Test the performance of the classifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

print(f"f1: {f1_score(y_train, y_pred, average='weighted')}")
print(f"accuracy: {accuracy_score(y_train, y_pred)}")


f1: 0.043760199405360696
accuracy: 0.016042780748663103


In [24]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        'k':5,
        'distance_upper_bound': np.linspace(10, 50, 6),
        'p_threshold': np.linspace(0.1, 0.5, 6),
    }
]

mlc = KDTreeMLClassifier()

grid_search = GridSearchCV(mlc, )

ValueError: data must be 2 dimensions

In [None]:
from sklearn.neighbors import NearestNeighbors