In [1]:
!pip install pyhealth inflect autocorrect torchtext gensim==3.6.0

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [12]:
import numpy as np
import pandas as pd
from pyhealth.medcode import InnerMap
from pyhealth.datasets import MIMIC4Dataset

import nltk
nltk.download('stopwords')
nltk.download('punkt')
import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
import os
import csv
import pickle
import inflect
from autocorrect import spell
from collections import OrderedDict


import gensim
from gensim.models import Word2Vec
import pickle

import torch
import torchtext
from torchtext.data import get_tokenizer
import numpy as np
import statistics
# for progress bar
from tqdm import tqdm_notebook
import random
import json
import tqdm
from sklearn.metrics import *

# set seed
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
dataset = MIMIC4Dataset(
        root="data/mimic4_subset",
        tables=["diagnoses_icd", "procedures_icd"],
    )
#dataset.stat()
#dataset.info()

In [4]:
icd9cm = InnerMap.load("ICD9CM")
icd10cm = InnerMap.load("ICD10CM")
#smoker = icd9cm.lookup("V15.82")

counter = 0
patient_dict = dataset.patients
labels = []
for subject_id, patient in patient_dict.items():
    #if counter > 100:
    #    break
    #counter += 1
    tobacco = 0
    visit_dict = patient.visits
    for visit_id, visit in visit_dict.items():
        #print(visit.encounter_time, visit.available_tables)
        events = visit.get_event_list('diagnoses_icd')
        for event in events:
            if event.vocabulary == 'ICD9CM' and event.code in ['V1582', '3051']:
                tobacco = 1
                #explain = icd9cm.lookup(event.code)
                #print(event.patient_id, event.visit_id, visit.encounter_time, event.vocabulary, event.code, explain)
            elif event.vocabulary == 'ICD10CM' and event.code.startswith('F17'):
                tobacco = 1
                #explain = icd10cm.lookup(event.code)
                #print(event.patient_id, event.visit_id, visit.encounter_time, event.vocabulary, event.code, explain)
    labels.append({'subject_id':subject_id,'label':tobacco})


In [5]:
# function that cleans text
# still need to account for contractions, abbreviations, and numbers/fractions
default_stemmer = PorterStemmer()
default_stopwords = stopwords.words('english') # or any other list of your choice
def clean_text(text, replace_numbers = False, remove_rare = False, remove_punctuation = False, stem_text = False, remove_stopwords = False, remove_num = False , spell_check = False, remove_repeat = False):
        def misc_cleaning(text):
                text = re.sub("-([a-zA-Z]+)", r"\1", text) # replaces hyphen with spaces in case of strings
                text = re.sub(' y ', '', text) # gets rid of random y accent stuff scattered through the text
                text = re.sub('yyy', 'y', text)
                text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
                text = re.sub(r"what's", "what is ", text)
                text = re.sub(r"\'s", " ", text)
                text = re.sub(r"\'ve", " have ", text)
                text = re.sub(r"can't", "cannot ", text)
                text = re.sub(r"n't", " not ", text)
                text = re.sub(r"i'm", "i am ", text)
                text = re.sub(r"\'re", " are ", text)
                text = re.sub(r"\'d", " would ", text)
                text = re.sub(r"\'ll", " will ", text)
                text = re.sub(r",", " ", text)
                text = re.sub(r"\.", " ", text)
                text = re.sub(r"!", " ! ", text)
                text = re.sub(r"\/", " ", text)
                text = re.sub(r"\^", " ^ ", text)
                text = re.sub(r"\+", " + ", text)
                text = re.sub(r"\-", " - ", text)
                text = re.sub(r"\=", " = ", text)
                text = re.sub(r"'", " ", text)
                text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
                text = re.sub(r":", " : ", text)
                text = re.sub(r" e g ", " eg ", text)
                text = re.sub(r" b g ", " bg ", text)
                text = re.sub(r" u s ", " american ", text)
                text = re.sub(r"\0s", "0", text)
                text = re.sub(r" 9 11 ", "911", text)
                text = re.sub(r"e - mail", "email", text)
                text = re.sub(r"j k", "jk", text)
                text = re.sub(r"\s{2,}", " ", text)
                return text

        # function to tokenize text which is used in a lot of the later processing
        def tokenize_text(text):
                return [w for s in sent_tokenize(text) for w in word_tokenize(s)]

        text = text.strip(' ') # strip whitespaces
        text = text.lower() # lowercase
        text = misc_cleaning(text) # look at function, random cleaning stuff
        
        if remove_repeat:
                sentences = sent_tokenize(text)
                sentences = list(dict.fromkeys(sentences))
                text = " ".join(sentences)
        
        # removes punctuation
        if remove_punctuation:
                text = text.translate(str.maketrans('', '', string.punctuation))

        # optional: replaces numbers ("3") with their word counterparts ("three")
        if replace_numbers:
                words = word_tokenize(text)
                p = inflect.engine()
                new_words = []
                for word in words:
                        if word.isdigit():
                                new_word = p.number_to_words(word)
                                new_words.append(new_word)
                        else:
                                new_words.append(word)
                text = " ".join(new_words)

        # optional: removes the rarest words in each text --> right now it's 10
        if remove_rare:
                tokens = word_tokenize(text)
                freq_dist = nltk.FreqDist(tokens)
                rarewords = list(freq_dist.keys())[-10:]
                new_words = [word for word in tokens if word not in rarewords]
                text = " ".join(new_words)

        # optional: stems text using Porter Stemmer
        if stem_text:
                stemmer = default_stemmer
                tokens = tokenize_text(text)
                text = " ".join([stemmer.stem(t) for t in tokens])

        # removes stop words such as "a", "the", etc.
        if remove_stopwords:
                stop_words = default_stopwords
                tokens = [w for w in tokenize_text(text) if w not in stop_words]
                text = " ".join(tokens)
        
        # optional: removes numbers completely from the ext
        if remove_num:
                text=text.split()
                text=[x for x in text if not x.isnumeric()]
                text= " ".join(text)
        
        #remove headers from discharge notes
        #name unit admission date discharge date date birth sex service medicine allergies known allergies adverse drug reactions attending chief complaint 
        headers = text.find(" chief complaint ",1,300)
        if headers > -1:
            headers += 17
            text = text[headers:]
        return text


In [6]:
df_notes_discharge = pd.read_csv("data/mimic4_notes/discharge.csv")
#df_notes_discharge_detail = pd.read_csv("data/mimic4_notes/discharge_detail.csv")
#df_notes_radiology = pd.read_csv("data/mimic4_notes/radiology.csv")

print(df_notes_discharge.columns)
print('total len', len(df_notes_discharge))
#df_notes_discharge_detail.columns

#print(df_notes_discharge['subject_id'])
#print(df_notes_radiology['text'][0])

#group by patient and concatenate all notes for one patient
df_notes_discharge = df_notes_discharge.groupby(['subject_id'], as_index = False).agg({'text': ' '.join})
print(df_notes_discharge.columns)
print('len of patients', len(df_notes_discharge))

#limit to 781 patients
df_notes_discharge = df_notes_discharge.head(781)
print('final len', len(df_notes_discharge))

Index(['note_id', 'subject_id', 'hadm_id', 'note_type', 'note_seq',
       'charttime', 'storetime', 'text'],
      dtype='object')
total len 331794
Index(['subject_id', 'text'], dtype='object')
len of patients 145915
final len 781


In [7]:
import dask.dataframe as dd
from dask.multiprocessing import get

#ddf = dd.from_pandas(df_notes_discharge, npartitions=7)
#meta_df = pd.DataFrame(columns=["subject_id", "text", "new_text"], dtype=object)

#ddf['text'] = ddf['text'].apply(lambda text: clean_text(text, remove_punctuation = True, remove_stopwords = True, remove_repeat = True))

#res = ddf.map_partitions(lambda df: df.assign(new_text = clean_text(df['text'], remove_punctuation = True, remove_stopwords = True, remove_repeat = True)), meta=meta_df)
#res.to_csv("data/mimic4_notes/discharge_clean.csv", index=False)

#pandas_df = ddf.compute()
#pandas_df.to_csv("data/mimic4_notes/discharge_clean.csv", index=False)


# without dask
df_notes_discharge['text'] = df_notes_discharge['text'].apply(lambda text: clean_text(text, remove_punctuation = True, remove_stopwords = True, remove_repeat = True, remove_num = True))


#save notes for embeddings
notes = list(df_notes_discharge['text'])     

# save cleaned notes into a pickle file
f = open('data/cleaned_notes.pckl', 'wb')
pickle.dump(notes, f)
f.close()
print("Saved cleansed notes")

f = open('data/df_notes_discharge.pckl', 'wb')
pickle.dump(df_notes_discharge, f)
f.close()
print("Saved cleansed df_notes_discharge ")

# save labels of same size and order

label_df = pd.DataFrame(labels)

label_df["subject_id"] = pd.to_numeric(label_df["subject_id"])
# return just the labels of the patients in the correct order as y, order of left table is maintained.
labels = df_notes_discharge.merge(label_df, on='subject_id', how='inner')['label']
labels = labels.to_numpy()

with open("data/labels.pckl", "wb") as f:
    pickle.dump(labels, f)


Saved cleansed notes
Saved cleansed df_notes_discharge 


In [8]:
# process notes converted to index array of numbers of same length

# transforms text to a sequence of integers padded to same length
#from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.sequence import pad_sequences

def textTokenize(notes):
    """For each patients text, find max length, build a dict of words
    
    Returns:
        tokenized_texts (List[List[str]]): List of list of tokens
        word2idx (Dict): Dict built from the corpus
        max_len (int): max sentence length
    """
    t = get_tokenizer("basic_english")
    lengths = []
    tokenized_texts = []
    word2idx = {}
    # Add padding and unknown tokens to the dictionary
    word2idx['<pad>'] = 0
    word2idx['<unk>'] = 1
    idx = 2
    for text in notes:
        tokenized_text = t(text)
        tokenized_texts.append(tokenized_text)
        # Add new token to `word2idx`
        for token in tokenized_text:
            if token not in word2idx:
                word2idx[token] = idx
                idx += 1
        lengths.append(len(tokenized_text))
    mean_len = np.mean(lengths)
    std_len = np.std(lengths)
    max_len = np.max(lengths)
    return tokenized_texts, word2idx, max_len, mean_len, std_len

def encodeTokenizedText(tokenized_texts, word2idx, normal_max_len):
    """Pad each sentence to the max length and encode tokens to their index in the all words dict.
    Make it more efficient -  instead of max length, make it mean len + 4x std dev, to eliminate few outliers

    Returns:
        input_ids (np.array): Array of token indexes in the vocabulary with
            shape (N, max_len). It will the input to the CNN.
    """
    input_ids = []
    for tokenized_sent in tokenized_texts:
        # Pad sentences to normal_max_len
        if (len(tokenized_sent) > normal_max_len):
            tokenized_sent = tokenized_sent[0:normal_max_len]
        else:
            tokenized_sent += ['<pad>'] * (normal_max_len - len(tokenized_sent))
        if len(tokenized_sent) != normal_max_len:
            print(len(tokenized_sent))
            print(i)
            
        # Encode tokens to input_ids, input_id is just the idx position when it was inserted, so it converts words to numbers
        input_id = [word2idx.get(token) for token in tokenized_sent]
        input_ids.append(input_id)    
    return np.array(input_ids, dtype=int)



tokenized_texts, word2idx, max_len, mean_len, std_len = textTokenize(notes)
normal_max_len = int((mean_len + 4*std_len) + 1)


# input_ids are the input to cnn and rnn models, as the tokenized text
input_ids = encodeTokenizedText(tokenized_texts, word2idx, normal_max_len)

In [9]:
len(input_ids)

781

In [13]:
# MAKE EMBEDDING MATRIX
import gensim.downloader as api

#pretrain = api.load('word2vec-google-news-300')

#f = open('data/embeddings/pretrain.pckl', 'wb')
#pickle.dump(pretrain, f)
#f.close()
#print("Saved pretrain")

with open('data/embeddings/pretrain.pckl', 'rb') as f:
    pretrain = pickle.load(f)

# Make Word2Vec embeddings from the notes themselves
f = open('data/cleaned_notes.pckl', 'rb')
notes = pickle.load(f)
f.close()

def make_w2v_model(notes, window, workers, epochs, vector_size, min_count):
    model = gensim.models.Word2Vec(notes, size=vector_size, window=window, min_count=min_count, workers=workers)
    print('Start training process...') 
    model.train(notes,total_examples=len(notes),epochs=epochs)
    model.save("w2v.model")
    print("Model Saved")

make_w2v_model(notes,  window=5, workers=1, epochs=20, vector_size=300, min_count=2)

def word_Embed_w2v(word_index, model):   
    w2v = model
    #convert pretrained word embedding to a dictionary
    embedding_index=dict()
    print('word vectors len is ',len(w2v.wv.vocab))
    for i in range(len(w2v.wv.vocab)):
        word=w2v.wv.index2word[i]
        if word is not None:
            embedding_index[word]=w2v.wv[word]  
    #extract word embedding for train and test data
    
    # create matrix of shape
    embedding_matrix = np.random.uniform(-0.25, 0.25, (len(word_index), 300))    
    embedding_matrix[word_index['<pad>']] = np.zeros((300,))

    for word, i in word_index.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix


def word_Embed_GNV(word_index):   
    """ Load the pretrained vectors for each token in our vocabulary. 
    For tokens with no pretraiend vectors, we will initialize random word vectors with the same length and variance.
    
     Returns:
        embeddings (np.array): Embedding matrix with shape (N, d) where N is
            the size of word2idx and d is embedding dimension
    """
    #pretrain = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
    # convert pretrained word embedding to a dictionary
    print('pretrain len is ',len(pretrain.wv.vocab))
    # fill embedding_index with every word from the pretrain
    embedding_index=dict()
    for i in range(len(pretrain.wv.vocab)):
        word=pretrain.wv.index2word[i]
        if word is not None:
            embedding_index[word]=pretrain.wv[word] 
            
    # create matrix of shape
    embedding_matrix = np.random.uniform(-0.25, 0.25, (len(word_index), 300))    
    embedding_matrix[word_index['<pad>']] = np.zeros((300,))
    
    for word, i in tqdm_notebook(word_index.items()):
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

w2v_model = Word2Vec.load("w2v.model")
embedding_matrix_w2v = word_Embed_w2v(word2idx, w2v_model)

embedding_matrix_GNV = word_Embed_GNV(word2idx)

Start training process...
Model Saved
word vectors len is  37
pretrain len is  3000000


  print('pretrain len is ',len(pretrain.wv.vocab))
  for i in range(len(pretrain.wv.vocab)):
  word=pretrain.wv.index2word[i]
  embedding_index[word]=pretrain.wv[word]
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for word, i in tqdm_notebook(word_index.items()):


  0%|          | 0/29898 [00:00<?, ?it/s]

In [14]:
#dump encoded notes and embeddings

f = open('data/embeddings/tokenized_notes.pckl', 'wb')
pickle.dump(input_ids, f)
f.close()
print("Saved Tokenized Notes")

f = open('data/embeddings/embedding_matrix_GNV.pckl', 'wb')
pickle.dump(embedding_matrix_GNV, f)
f.close()
print("Saved Google Vector Word Embedding Matrix")

f = open('data/embeddings/embedding_matrix_w2v.pckl', 'wb')
pickle.dump(embedding_matrix_w2v, f)
f.close()
print("Saved Word 2 Vector Embedding Matrix")

f = open('data/embeddings/word_index_eff.pckl', 'wb')
pickle.dump(word2idx, f)
f.close()
print("Saved Word Indices")

f = open('data/embeddings/max_len_eff.pckl', 'wb')
pickle.dump(normal_max_len, f)
f.close()
print("Saved Maximum Length of One Patient's Notes")

Saved Tokenized Notes
Saved Google Vector Word Embedding Matrix
Saved Word 2 Vector Embedding Matrix
Saved Word Indices
Saved Maximum Length of One Patient's Notes


In [17]:
with open('data/embeddings/tokenized_notes.pckl', 'rb') as f:
    input_ids = pickle.load(f)

with open('data/embeddings/embedding_matrix_GNV.pckl', 'rb') as f:
    embedding_matrix_GNV = pickle.load(f)
    embedding_matrix_GNV = torch.tensor(embedding_matrix_GNV)

with open('data/embeddings/embedding_matrix_w2v.pckl', 'rb') as f:
    embedding_matrix_w2v = pickle.load(f)
    embedding_matrix_w2v = torch.tensor(embedding_matrix_w2v)

with open('data/embeddings/word_index_eff.pckl', 'rb') as f:
    word2idx = pickle.load(f)

with open('data/embeddings/max_len_eff.pckl', 'rb') as f:
    normal_max_len = pickle.load(f)

with open('data/labels.pckl', 'rb') as f:
    labels = pickle.load(f)

with open('data/df_notes_discharge.pckl', 'rb') as f:
    df_notes_discharge = pickle.load(f)
    
#with open('data/embeddings/pretrain.pckl', 'rb')
#    pretrain = pickle.load(f)

In [18]:
print('pretrained GNV num embeddings ', len(embedding_matrix_GNV))
print('GNV embedding dimensions ', len(embedding_matrix_GNV[0]))

print('pretrained w2v num embeddings ', len(embedding_matrix_w2v))
print('w2v embedding dimensions ', len(embedding_matrix_w2v[0]))

print('len encoded notes, or total notes is ', len(input_ids))
print('len of first note is ', len(input_ids[0]))
print('max len is ', normal_max_len)
print('len or word index, or total unique words is ', len(word2idx))

pretrained GNV num embeddings  29898
GNV embedding dimensions  300
pretrained w2v num embeddings  29898
w2v embedding dimensions  300
len encoded notes, or total notes is  781
len of first note is  16410
max len is  16410
len or word index, or total unique words is  29898


In [19]:
#embeddings = torch.tensor(embeddings)

#from pyhealth.datasets.splitter import split_by_patient
#from pyhealth.datasets import split_by_patient, get_dataloader
# data split
#train_dataset, val_dataset, test_dataset = split_by_patient(dataset, [0.8, 0.1, 0.1])
# create dataloaders (they are <torch.data.DataLoader> object)
#train_loader = get_dataloader(train_dataset, batch_size=64, shuffle=True)
#val_loader = get_dataloader(val_dataset, batch_size=64, shuffle=False)
#test_loader = get_dataloader(test_dataset, batch_size=64, shuffle=False)

In [20]:
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler, SequentialSampler)
from sklearn.model_selection import train_test_split


def data_loader(x_train, x_test, y_train, y_test, batch_size=8):
    """Convert train and test sets to tensors and load them to a dataLoader
    """

    # Convert data type to torch.Tensor
    x_train, x_test, y_train, y_test = tuple(torch.tensor(data) for data in [x_train, x_test, y_train, y_test])

    # Create DataLoader for training data
    train_data = TensorDataset(x_train, y_train)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    # Create DataLoader for validation data
    val_data = TensorDataset(x_test, y_test)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

    return train_dataloader, val_dataloader


#train_size = int(len(total) * 2/3)
#test_size = len(total) - train_size
#train_df = total.head(train_size)
#test_df = total.tail(test_size)

#train = train_df.to_numpy()
#test = test_df.to_numpy()

#y_train = train[0]
#x_train = train[1]

#y_test = test[0]
#x_test = test[1]


    
 # Train Test Split
x_train, x_test, y_train, y_test = train_test_split(input_ids, labels, test_size=0.33, random_state=seed)

batch_size=8
# Load data to PyTorch DataLoader
train_dataloader, val_dataloader = data_loader(x_train, x_test, y_train, y_test, batch_size=batch_size)

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self,
                 pretrained_embedding=None,
                 freeze_embedding=False,
                 vocab_size=None,
                 embed_dim=300,
                 num_classes=2,
                 dropout=0.1):
        
        super(CNN, self).__init__()
                
        # embeddings
        if pretrained_embedding is not None:
            self.vocab_size, self.embed_dim = pretrained_embedding.shape
            self.embedding = nn.Embedding.from_pretrained(pretrained_embedding, freeze=freeze_embedding)
        else:
            self.embed_dim = embed_dim
            self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.embed_dim, padding_idx=0)
            
        self.filters_out = 128
        # Conv Network
        self.conv1 =  nn.Conv1d(self.embed_dim, self.filters_out, kernel_size=5)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(self.filters_out,10)
        self.dropout = nn.Dropout(p=dropout)

        #output layer
        if num_classes == 2:
            #binary
            self.fc2 = nn.Linear(10, 1)
            self.out = nn.Sigmoid()
        else:
            self.fc2 = nn.Linear(10, num_classes)
            self.out = nn.Softmax()         
                
        nn.init.kaiming_normal_(self.fc1.weight, nonlinearity='relu')
        nn.init.kaiming_normal_(self.fc2.weight, nonlinearity='relu')
        

    def forward(self, input_ids):
        
        # get embeddings. Output shape: (b, max_len, embed_dim)
        x = self.embedding(input_ids).float()
        #print(x)
        # Permute to match input shape requirement of nn.Conv1d. Output shape: (b, embed_dim, max_len)
        x = x.permute(0, 2, 1)
        x = F.relu(self.conv1(x))

        #Max pooling. Output shape: (b, self.filters_out, 1)
        #input is a,b,in output is a,b,out
        x = self.pool(x)
        
        # Output shape: (b, self.filters_out)
        x = x.squeeze()
        x = F.relu(self.fc1(x))
        #x = self.dropout(x)
        x = self.fc2(x)
        # final output activation function
        x = self.out(x)
        
        return x


In [22]:
model = CNN(pretrained_embedding=embedding_matrix_GNV)
criterion = torch.nn.BCELoss()
#optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), weight_decay=0.0, amsgrad=False)

In [23]:
def eval_model(model, dataloader):
    model.eval()
    Y_pred = []
    Y_true = []
    with torch.no_grad():
        for data, target in dataloader:
            # your code here
            Y_true.append(target)
            Y_scores = model(data).squeeze()
            # the class with the highest energy is what we choose as prediction
            predicted = (Y_scores > .5).int()
            Y_pred.append(predicted)
        Y_pred = np.concatenate(Y_pred, axis=0)
        Y_true = np.concatenate(Y_true, axis=0)
        #f1 precision recall accuracy
        p, r, f, _ = precision_recall_fscore_support(Y_true, Y_pred, average='weighted')
        a = accuracy_score(Y_true, Y_pred)
    
    return p, r, f, a

In [24]:
n_epochs = 10

def train_model(model, train_dataloader, n_epoch=n_epochs, optimizer=optimizer, criterion=criterion):

    model.train() # prep model for training
    
    for epoch in range(n_epoch):
        curr_epoch_loss = []
        #count = 0
        for data, target in train_dataloader:
            #if count > 1:
            #    break
            #count = 1
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = model(data)
            outputs = outputs.squeeze()
            loss = criterion(outputs, target.float())
            loss.backward()
            optimizer.step()

            curr_epoch_loss.append(loss.cpu().data.numpy())
        print(f"Epoch {epoch + 1}: curr_epoch_loss={np.mean(curr_epoch_loss)}")
        p, r, f, a = eval_model(model, val_dataloader)
        print('Epoch: {} \t Validation precision: {:.2f}, recall:{:.2f}, f1_score: {:.2f}, accuracy: {:.2f}'.format(epoch + 1, p, r, f, a))
    return model

In [25]:
model = train_model(model, train_dataloader)

Epoch 1: curr_epoch_loss=0.6217027306556702
Epoch: 1 	 Validation precision: 0.68, recall:0.65, f1_score: 0.66, accuracy: 0.65
Epoch 2: curr_epoch_loss=0.4941687285900116
Epoch: 2 	 Validation precision: 0.69, recall:0.65, f1_score: 0.66, accuracy: 0.65
Epoch 3: curr_epoch_loss=0.3355555534362793
Epoch: 3 	 Validation precision: 0.81, recall:0.74, f1_score: 0.63, accuracy: 0.74
Epoch 4: curr_epoch_loss=0.17250755429267883
Epoch: 4 	 Validation precision: 0.70, recall:0.71, f1_score: 0.70, accuracy: 0.71
Epoch 5: curr_epoch_loss=0.06368406862020493
Epoch: 5 	 Validation precision: 0.72, recall:0.71, f1_score: 0.71, accuracy: 0.71
Epoch 6: curr_epoch_loss=0.01895950548350811
Epoch: 6 	 Validation precision: 0.75, recall:0.77, f1_score: 0.75, accuracy: 0.77
Epoch 7: curr_epoch_loss=0.006719441153109074
Epoch: 7 	 Validation precision: 0.75, recall:0.77, f1_score: 0.74, accuracy: 0.77
Epoch 8: curr_epoch_loss=0.004408594686537981
Epoch: 8 	 Validation precision: 0.75, recall:0.77, f1_score

In [26]:
p, r, f, a = eval_model(model, val_dataloader)
print('Epoch: {} \t Validation precision: {:.2f}, recall:{:.2f}, f1_score: {:.2f}, accuracy: {:.2f}'.format(n_epochs, p, r, f, a))

Epoch: 10 	 Validation precision: 0.76, recall:0.78, f1_score: 0.75, accuracy: 0.78


In [29]:
# Classic machine learning

from sklearn.datasets import load_svmlight_file
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer # add reference

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


RANDOM_STATE = seed

def logistic_regression_pred(X_train, Y_train, X_test):
    
    """
    logistic regression classifier using X_train and Y_train to predict labels of X_train
    """
    #clf = LogisticRegression(random_state=RANDOM_STATE).fit(X_train, Y_train)
    #Y_pred = clf.predict(X_train)
    #return Y_pred
    logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
    logreg.fit(X_train, Y_train)
    Y_pred = logreg.predict(X_test)
    return Y_pred

def svm_pred(X_train, Y_train, X_test):
    
    """
    SVM classifier using X_train and Y_train to predict labels of X_train
    """
    #clf = LinearSVC(random_state=RANDOM_STATE).fit(X_train, Y_train)
    #Y_pred = clf.predict(X_train)
    #return Y_pred
    sgd = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
                ])
    sgd.fit(X_train, Y_train)
    Y_pred = sgd.predict(X_test)
    return Y_pred

def naive_bayes_pred(X_train, Y_train, X_test):
    
    """
    Naive Bayes using X_train and Y_train to predict labels of X_train
    """
    nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
    nb.fit(X_train, Y_train)
    Y_pred = nb.predict(X_test)
    return Y_pred

def classification_metrics(Y_pred, Y_true):
    
    accuracy = accuracy_score(Y_true, Y_pred)
    precision = precision_score(Y_true, Y_pred)
    recall = recall_score(Y_true, Y_pred)
    f1 = f1_score(Y_true, Y_pred)
    
    #tn, fp, fn, tp = confusion_matrix(y_true, y_pred)
    return accuracy, precision, recall, f1

    
#input: Name of classifier, predicted labels, actual labels
def display_metrics(classifierName, Y_pred, Y_true):
    print("______________________________________________")
    print(("Classifier: "+classifierName))
    acc, precision, recall, f1score = classification_metrics(Y_pred,Y_true)
    print(("Accuracy: "+str(acc)))
    print(("Precision: "+str(precision)))
    print(("Recall: "+str(recall)))
    print(("F1-score: "+str(f1score)))
    print("______________________________________________")
    print("")


#x_train, x_test, y_train, y_test
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(notes, labels, test_size=0.33, random_state = seed)

display_metrics("Logistic Regression", logistic_regression_pred(X_train_b, y_train_b, X_test_b), y_test_b)
display_metrics("SVM",svm_pred(X_train_b, y_train_b, X_test_b),y_test_b)
display_metrics("Naive Bayes", naive_bayes_pred(X_train_b, y_train_b, X_test_b), y_test_b)


______________________________________________
Classifier: Logistic Regression
Accuracy: 0.7596899224806202
Precision: 0.5777777777777777
Recall: 0.37681159420289856
F1-score: 0.45614035087719296
______________________________________________

______________________________________________
Classifier: SVM
Accuracy: 0.7596899224806202
Precision: 0.6521739130434783
Recall: 0.21739130434782608
F1-score: 0.32608695652173914
______________________________________________

______________________________________________
Classifier: Naive Bayes
Accuracy: 0.7325581395348837
Precision: 0.0
Recall: 0.0
F1-score: 0.0
______________________________________________



  _warn_prf(average, modifier, msg_start, len(result))
