In [None]:
!pip install pyhealth inflect autocorrect torchtext gensim==3.6.0

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [None]:
import numpy as np
import pandas as pd
from pyhealth.medcode import InnerMap
from pyhealth.datasets import MIMIC4Dataset

import nltk
nltk.download('stopwords')
nltk.download('punkt')
import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
import os
import csv
import pickle
import inflect
from autocorrect import spell
from collections import OrderedDict


import gensim
from gensim.models import Word2Vec
import pickle

import torch
import torchtext
from torchtext.data import get_tokenizer
import numpy as np
import statistics
# for progress bar
from tqdm import tqdm_notebook
import random
import json
import tqdm
from sklearn.metrics import *

# Pin one seed for every random number generator used in this notebook
# (Python's `random`, NumPy's legacy global RNG and PyTorch).
seed = 24

# NOTE(review): PYTHONHASHSEED is read when an interpreter starts, so setting it
# here presumably only affects child processes, not this kernel - confirm intent.
os.environ["PYTHONHASHSEED"] = f"{seed}"

for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(seed)

In [None]:
def _load_pickle(path):
    """Return the object unpickled from the file at `path` (opened in binary mode)."""
    # NOTE(review): unpickling can execute arbitrary code - only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Encoded notes (see the 'tokenized_notes' file name).
input_ids = _load_pickle('data/781/embeddings/tokenized_notes.pckl')

# The two pretrained embedding matrices are converted to torch tensors right away.
embedding_matrix_GNV = torch.tensor(_load_pickle('data/781/embeddings/embedding_matrix_GNV.pckl'))
embedding_matrix_w2v = torch.tensor(_load_pickle('data/781/embeddings/embedding_matrix_w2v.pckl'))

# Vocabulary lookup and the stored maximum length.
word2idx = _load_pickle('data/781/embeddings/word_index_eff.pckl')
normal_max_len = _load_pickle('data/781/embeddings/max_len_eff.pckl')

# Labels, the discharge-notes DataFrame pickle, and the cleaned notes.
labels = _load_pickle('data/781/labels.pckl')
df_notes_discharge = _load_pickle('data/781/df_notes_discharge.pckl')
notes = _load_pickle('data/781/cleaned_notes.pckl')

# Not loaded: 'data/embeddings/pretrain.pckl' (was disabled in the original cell).

In [None]:
# Sanity-check summary of the loaded artefacts: one "caption value" line per entry.
summary_rows = [
    ('pretrained GNV num embeddings ', len(embedding_matrix_GNV)),
    ('GNV embedding dimensions ', len(embedding_matrix_GNV[0])),
    ('pretrained w2v num embeddings ', len(embedding_matrix_w2v)),
    ('w2v embedding dimensions ', len(embedding_matrix_w2v[0])),
    ('len encoded notes, or total notes is ', len(input_ids)),
    ('len or word index, or total unique words is ', len(word2idx)),
    ('len of labels is', len(labels)),
    ('len of first note is ', len(input_ids[0])),
    ('max len is ', normal_max_len),
]

for summary_caption, summary_value in summary_rows:
    print(summary_caption, summary_value)


In [None]:
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler, SequentialSampler)
from sklearn.model_selection import train_test_split


def data_loader(x_train, x_test, y_train, y_test, batch_size=8):
    """Wrap the training and held-out splits in PyTorch DataLoaders.

    Every input is first converted with ``torch.tensor``. The training loader
    draws its batches through a ``RandomSampler``; the held-out loader walks
    its data in order through a ``SequentialSampler``.

    Args:
        x_train: training features.
        x_test: held-out features.
        y_train: training labels.
        y_test: held-out labels.
        batch_size: number of samples per batch for both loaders.

    Returns:
        A ``(train_dataloader, val_dataloader)`` pair.
    """
    # Convert each split to a tensor (same order as the arguments).
    train_inputs = torch.tensor(x_train)
    held_out_inputs = torch.tensor(x_test)
    train_targets = torch.tensor(y_train)
    held_out_targets = torch.tensor(y_test)

    # Pair features with their labels.
    train_set = TensorDataset(train_inputs, train_targets)
    held_out_set = TensorDataset(held_out_inputs, held_out_targets)

    # Random order for training, fixed order for evaluation.
    train_batches = DataLoader(
        train_set,
        batch_size=batch_size,
        sampler=RandomSampler(train_set),
    )
    held_out_batches = DataLoader(
        held_out_set,
        batch_size=batch_size,
        sampler=SequentialSampler(held_out_set),
    )
    return train_batches, held_out_batches


 # Train Test Split
batch_size = 8

# Hold out 33% of the encoded notes and their labels; the split is seeded.
x_train, x_test, y_train, y_test = train_test_split(
    input_ids,
    labels,
    test_size=0.33,
    random_state=seed,
)

# Wrap both partitions in PyTorch DataLoaders.
train_dataloader, val_dataloader = data_loader(
    x_train, x_test, y_train, y_test, batch_size=batch_size
)

In [None]:
# Classic machine learning

from sklearn.datasets import load_svmlight_file
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer # add reference

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


def logistic_regression_pred(X_train, Y_train, X_test):
    """Train a logistic-regression text classifier and predict labels for X_test.

    The pipeline is: token counts (CountVectorizer) -> TF-IDF weighting ->
    L2-penalised LogisticRegression. It is fitted on X_train / Y_train and the
    predictions for X_test are returned.
    """
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression(penalty='l2', C=1e5, max_iter=1000, n_jobs=1)),
    ]
    model = Pipeline(steps)
    model.fit(X_train, Y_train)
    return model.predict(X_test)

def svm_pred(X_train, Y_train, X_test):
    """Train a linear SVM text classifier and predict labels for X_test.

    The pipeline is: token counts (CountVectorizer) -> TF-IDF weighting ->
    SGDClassifier with hinge loss. It is fitted on X_train / Y_train and the
    predictions for X_test are returned. The classifier's random_state comes
    from the notebook-level `seed`.
    """
    classifier = SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=seed,
        max_iter=5,
        tol=None,  # no early stopping: always run the full max_iter passes
    )
    model = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', classifier),
    ])
    model.fit(X_train, Y_train)
    return model.predict(X_test)

def naive_bayes_pred(X_train, Y_train, X_test):
    """Train a multinomial Naive Bayes text classifier and predict labels for X_test.

    The pipeline is: token counts (CountVectorizer) -> MultinomialNB(alpha=1).
    It is fitted on X_train / Y_train and the predictions for X_test are returned.
    """
    # Unlike the other baselines there is deliberately no TF-IDF step here.
    # The multinomial distribution normally expects integer counts (fractional
    # counts such as tf-idf may also work), see
    # https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
    # TF-IDF is only a normalisation that dampens very common words, and the
    # original author notes that with it this model predicted all zeros.
    steps = [
        ('vect', CountVectorizer()),
        ('clf', MultinomialNB(alpha=1)),
    ]
    model = Pipeline(steps)
    model.fit(X_train, Y_train)
    return model.predict(X_test)

def classification_metrics(Y_pred, Y_true):
    """Score predicted labels against the true labels.

    Note the argument order: predictions first, ground truth second.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)`` obtained by calling
        accuracy_score, precision_score, recall_score and f1_score with their
        default settings.
    """
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(Y_true, Y_pred) for scorer in scorers)

    
def display_metrics(classifierName, Y_pred, Y_true):
    """Print a small report with the scores of one classifier.

    Args:
        classifierName: name shown in the report header.
        Y_pred: predicted labels.
        Y_true: actual labels.
    """
    rule = "______________________________________________"

    print(rule)
    print("Classifier: " + classifierName)

    # Metrics are computed only after the header has been printed.
    scores = classification_metrics(Y_pred, Y_true)
    prefixes = ("Accuracy: ", "Precision: ", "Recall: ", "F1-score: ")
    for prefix, score in zip(prefixes, scores):
        print(prefix + str(score))

    print(rule)
    print("")


# Split the cleaned note texts (not the encoded ids) for the classic baselines:
# 33% held out, seeded split.
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    notes,
    labels,
    test_size=.33,
    random_state=seed,
)

# Fit each baseline on the training part and report its scores on the held-out part.
baseline_models = (
    ("Logistic Regression", logistic_regression_pred),
    ("SVM", svm_pred),
    ("Naive Bayes", naive_bayes_pred),
)
for baseline_name, baseline_predict in baseline_models:
    display_metrics(baseline_name, baseline_predict(X_train_b, y_train_b, X_test_b), y_test_b)
