By Guillaume Thibault - Matricule 1948612

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from sklearn.neighbors import KNeighborsClassifier
data_path = './medical_dataset/'
submit_path = './hwk3_submit/'

# Medical Text Classification

In this assignment, we will build a classifier with medical-NLP corpus. This is a classification
task with the input as a medical transcription (text) and the output as the corresponding
medical transcript type. This is a clinical dataset which consists of a medical transcript from
one of the 4 classes {Surgery - 1 , Medical Records - 2, Internal Medicine - 3 and Other - 4}
as an input. The task is to classify the transcript (text) to the corresponding classes i.e the
transcript type. The dataset consists of 4000 transcripts in the training set and 500 in each
of the validation and test sets.

In [2]:
# Import the data
pre_train = pd.read_csv(data_path + 'train.csv', sep=",")
pre_valid = pd.read_csv(data_path + 'valid.csv', sep=",")
pre_test = pd.read_csv(data_path + 'test.csv', sep=",")

## 1. Vectorisation of natural language text [10 marks]  

Most of the algorithms described in the class take the input as a vector. However, the
reviews are natural language text of varying number of words. The first step would be to
convert this varying-length movie review to a fixed-length vector representation. We will
consider two different ways of vectorizing the natural language text: binary bag-of-
words representation and frequency bag-of-words representation

Instructions for dataset submission are given in the end of the assignment (do not include the dataset in the report). 


In [3]:
import string
import re
exclude = set(string.punctuation)

def add_word_occ_to_dict(words: str, occ_dict: dict) -> None:
    """
    Add word occurence to a dictionnary
    @params:
        - words (str): conatining the words separeted by ' '
        - occ_dict (dict): dictionnay containg the words as the keys and the occurence number as a value
    """
    word = words.split(' ')   
    sorted_= sorted(set(word))
    for val_sort in sorted_:
        if val_sort in occ_dict.keys():
            occ_dict[val_sort] += word.count(val_sort) 
        else:
            occ_dict[val_sort] = word.count(val_sort) 


def remove_ponctuaion(text: str) -> str:
    """
    Remove all ponctuations from a string
    @params: 
        - string (str): string to remove ponct.
    """
    text = re.sub(' +', ' ', text)
    return text.translate(str.maketrans('', '', string.punctuation))


def sort_tuple(tup, most_freq=True): 
    """
    key is set to sort using second element of sublist lambda has been used 
    """
    tup.sort(key = lambda x: x[1], reverse=most_freq) 
    return tup


def get_top_n_occ(df, n: int)-> list:
    occ = {}
    for index, row in df.iterrows():
        text = remove_ponctuaion(row[1]).lower() # Lower case all char
        add_word_occ_to_dict(text, occ)
    sorted_tup = sort_tuple([(k, v) for k, v in occ.items()])
    return [sorted_tup[i] for i in range(0, n)]

### 1.1 Binary Bag of words 

In [4]:
def binary_bag_of_word(df, n) -> pd.DataFrame:
    # Step 1 and 2
    top_n_words_tup = get_top_n_occ(df, n)
    cols = [k for k, v in top_n_words_tup]
    
    
    # Step 3
    bow_df = pd.DataFrame(columns=["label"] + cols)
    for index, row in df.iterrows():
        bow_row = [row[0]]
        text = remove_ponctuaion(row[1]).lower()
        word_set = sorted(set(text.split(' ')))
        for word in cols:
            if word in word_set:
                bow_row.append(1)
            else:
                bow_row.append(0)
        bow_df.loc[len(bow_df)] = bow_row
    return bow_df

In [5]:
start_time = time.time()

# Process dataset - Binary Bag of word
n = 10000
b_test = binary_bag_of_word(pre_test, n)  
b_train = binary_bag_of_word(pre_train, n)  
b_valid = binary_bag_of_word(pre_valid, n)  

print(f"--- {(time.time() - start_time)}s seconds ---")

# Save bag of words dataframe
b_test.to_csv(submit_path + 'b_test.csv')
b_valid.to_csv(submit_path + 'b_valid.csv')
b_train.to_csv(submit_path + 'b_train.csv')

--- 919.4922947883606s seconds ---


In [6]:
# Extract bag of words dataframe from file
b_test = pd.read_csv(submit_path + 'b_test.csv', sep=",")
b_valid = pd.read_csv(submit_path + 'b_valid.csv', sep=",")
b_train = pd.read_csv(submit_path + 'b_train.csv', sep=",")

### 1.2 Frequency Bag of words

In [7]:
def frequency_bag_of_word(df, n) -> pd.DataFrame:
    # Step 1 and 2
    top_n_words_tup = get_top_n_occ(df, n)
    cols = [k for k, v in top_n_words_tup]
    
    # Step 3
    bow_df = pd.DataFrame(columns=["label"] + cols)
    for index, row in df.iterrows():
        bow_row = [row[0]]
        text = remove_ponctuaion(row[1]).lower()
        word_set = sorted(set(text.split(' ')))
        for word_tup in top_n_words_tup:
            bow_row.append(text.count(word_tup[0]) / word_tup[1])
        bow_df.loc[len(bow_df)] = bow_row
        
    return bow_df

In [8]:
start_time = time.time()

# Process dataset - Frequency Bag of word
n = 10000
f_test = frequency_bag_of_word(pre_test, n)  
f_valid = frequency_bag_of_word(pre_valid, n)  
f_train = frequency_bag_of_word(pre_train, n) 

print(f"--- {(time.time() - start_time)}s seconds ---")

# Save bag of words dataframe
f_test.to_csv(submit_path + 'f_test.csv')
f_valid.to_csv(submit_path + 'f_valid.csv')
f_train.to_csv(submit_path + 'f_train.csv')

--- 585.1088128089905s seconds ---


In [9]:
# Extract bag of words dataframe from file
f_test = pd.read_csv(submit_path + 'f_test.csv', sep=",")
f_valid = pd.read_csv(submit_path + 'f_valid.csv', sep=",")
f_train = pd.read_csv(submit_path + 'f_train.csv', sep=",")

### 1.3 Submission

In [18]:
def submit_vocab(vocab):
    with open('medical_nlp-vocab.txt', 'w') as f:
        for i in range(1, len(vocab) + 1):
            f.writelines(f"{vocab[i-1][0]}\t{i}\t{vocab[i-1][1]}\n")
            
def submit_set(prefix, df, vocab):
    cols = [k for k, v in vocab]
    
    with open(f'medical {prefix}-traint.txt', 'w') as f:
        for index, row in df.iterrows():
            for i in range(len(cols)):
                if row[i+1] != 0:
                    f.write(f'{i+1} ')
            f.write(f'\t{row[0]}\n')

In [19]:
n = 10000

submit_vocab(get_top_n_occ(pre_train, n))
submit_set('test', b_test, get_top_n_occ(pre_test, n)) 
submit_set('valid', b_valid, get_top_n_occ(pre_valid, n)) 
submit_set('train', b_train, get_top_n_occ(pre_train, n)) 

## 2 2. Models using binary bag-of-words [19 marks]

For this question, we will focus on the Medical-NLP dataset with binary bag-of-words
(BBoW) representation. We will use the F1-score as the evaluation metric for the entire
assignment. 

### (a) Random classifier [2 marks]
As a baseline, report the performance of the random classifier (a classifier which
classifies a review into a uniformly random class) and the majority-class classifier
(a classifier which computes the majority class in the training set and classifies all
test instances as that majority class). 

In [40]:
from sklearn.metrics import f1_score

In [74]:
# 1. Random classifier
def random_model_f1(df):
    n_value = len(df)
    possible_value = df['label'].unique()
    pred_values = [ possible_value[int(np.random.uniform(low=0.0, high=0.9999, size=None) * len(possible_value))] for _ in range(n_value)]
    pred_values = np.array(pred_values)
    real_values = df['label'].to_numpy()
    f1 = f1_score(real_values, pred_values, labels=possible_value, average='macro')
    return f1
    
def most_freq_model_f1(df):
    real_values = df['label'].to_numpy()
    counts = np.bincount(real_values)
    most_freq_value = np.argmax(counts)
    pred_values = [most_freq_value] * len(real_values)
    f1 = f1_score(real_values, pred_values, labels=df['label'].unique(), average='macro')
    return f1

def test_model(model, fn, df):
    print(f"f1 {model}_set score: {fn(df)}")
    

In [77]:
print(" ~~ Random Model ~~")
test_model('train', random_model_f1, b_train)
test_model('valid', random_model_f1, b_valid)
test_model('test', random_model_f1, b_test)

print("\n")

print(" ~~ Most Frequence Model ~~")
test_model('train', most_freq_model_f1, b_train)
test_model('valid', most_freq_model_f1, b_valid)
test_model('test', most_freq_model_f1, b_test)

 ~~ Random Model ~~
f1 train_set on random_model score: 0.2552066930419413
f1 valid_set on random_model score: 0.2574430583248011
f1 test_set on random_model score: 0.2635717389954678


 ~~ Most Frequence Model ~~
f1 train_set on random_model score: 0.120996778472617
f1 valid_set on random_model score: 0.12424698795180723
f1 test_set on random_model score: 0.14183381088825217


### (b) Naive Bayes, Decision Trees, Logistic regression and Linear SVM [8 marks]
Now train Naive Bayes, Decision Trees, Logistic regression and Linear SVM for this
task. 

[Note: You should do a thorough hyper-parameter tuning by using the given
validation set. Also, note that you should use the appropriate naive Bayes classifier
for binary input features (also called Bernoulli naive Bayes).] 

In [195]:
def extract_data(df):
    X = df.drop(columns=['label']).to_numpy()
    y = df['label'].to_numpy()
    return X, y

def f1_bench(clf, df):
    # Prediction
    X, real_values = extract_data(df)
    pred_values = clf.predict(X)
    
    # f1 score
    f1 = f1_score(real_values, pred_values, labels=df_test['label'].unique(), average='macro')
    print(f"f1 score: {f1}")

In [196]:
from sklearn.naive_bayes import BernoulliNB

In [216]:
def bernoulli_naive_bayes(df, **kwargs):
    X, y = extract_data(df)
    return BernoulliNB(**kwargs).fit(X, y)

clf = bernoulli_naive_bayes(b_train)

f1_bench(clf, b_valid)

f1 score: 0.19590225563909774


  'precision', 'predicted', average, warn_for)


In [115]:
from sklearn.tree import DecisionTreeClassifier

In [245]:
def decision_tree(df, **kwargs):
    X, y = extract_data(df)
    return DecisionTreeClassifier(**kwargs).fit(X, y)

clf = decision_tree(b_train)

f1_bench(clf, b_valid)

f1 score: 0.3159825974867466


In [170]:
from sklearn.linear_model import LogisticRegression

In [211]:
def logistic_regression(df, **kwargs):
    X, y = extract_data(df)
    return LogisticRegression(**kwargs).fit(X, y)

clf = logistic_regression(b_train)

f1_bench(clf, b_valid)



f1 score: 0.2677315221096108


In [125]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [215]:
def SVM(df, **kwargs):
    X, y = extract_data(df)
    clf = make_pipeline(StandardScaler(), SVC(**kwargs))
    clf.fit(X, y)
    return clf

clf = SVM(b_train)

f1_bench(clf, b_valid)

f1 score: 0.22382720790368255


### (c) Report the list of hyper-parameters [3 marks]
Report the list of hyper-parameters you considered for each classifier, their range,
as well as the best values for these hyper-parameters, chosen based on the validation
set performance1.

In [238]:
# Naive bayes
hyper_params = {
    'alpha': 1.0e-10,
    'binarize': 0.0,
    'fit_prior': True
}
clf = bernoulli_naive_bayes(b_train, **hyper_params)

f1_bench(clf, b_valid)

f1 score: 0.21121649184149183


In [308]:
hyper_params = {
    'criterion':"gini",
    'splitter':"best", 
    'max_depth': 4000,
    'min_samples_split': 2, 
    'min_samples_leaf' : 10,
    'min_weight_fraction_leaf': 0.0,
    'max_features': None,
    'random_state': 1000,
    'max_leaf_nodes': 100,
    'min_impurity_decrease': 0.0,
    'class_weight': None
}
clf = decision_tree(b_train, **hyper_params)

f1_bench(clf, b_valid)

f1 score: 0.3613305487071276


In [316]:
hyper_params = { 
    'solver':'newton-cg', 
    'penalty': 'l2', 
    'C':1.0, 
    'dual': False,
    'tol': 1e-4,
    'fit_intercept': True,
    'random_state': None,
    'intercept_scaling': 1,
    'max_iter':100, 
    'multi_class':'multinomial'
}
clf = logistic_regression(b_train, **hyper_params)

f1_bench(clf, b_valid)

f1 score: 0.262772782271148


In [324]:
hyper_params = {
    'gamma':'auto',
    'kernel': 'precomputed',
    'degree': 3,
    'coef0': 0.0,
    'shrinking': True,
    'probability': False,
    'tol': 1e-3,
    'cache_size': 200,
    'max_iter': -1,
    'random_state': None
}
clf = SVM(b_train, **hyper_params)

f1_bench(clf, b_valid)

ValueError: X.shape[0] should be equal to X.shape[1]

### (d) F1-score [3 marks]
Report the training, validation, and test F1-score for all the classifiers  (with best hyper-parameter configuration). 

In [None]:
linear: 0.23550706333938282
sigmoid: 0.2426287107763022

### (e) Comment on the performance of different classifiers. [3 marks]

Why did a particular classifier
perform better than the rest? What was the role of the hyper-parameters in finding
the best results. 

## 3. Model using frequency bag-of-words [21 marks]

Now we will repeat question 2 but with frequency bag-of-words (FBoW) representation.

### (a) Naive Bayes, Decision Tree, Logistic regression and Linear SVM  [8 marks]
Train Naive Bayes, Decision Tree, Logistic regression and Linear SVM for this task.

[Note: Again, you should do a thorough hyper-parameter tuning by using the given
validation set. Also, note that you should use the appropriate naive Bayes classifier
for real valued input features (also called Gaussian naive Bayes).] [8 marks]

In [220]:
from sklearn.naive_bayes import GaussianNB

def gaussian_naive_bayes(df, **kwargs):
    X, y = extract_data(df)
    return BernoulliNB(**kwargs).fit(X, y)

In [222]:
clf = bernoulli_naive_bayes(f_train)
f1_bench(clf, f_valid)

clf = decision_tree(f_train)
f1_bench(clf, f_valid)

clf = logistic_regression(f_train)
f1_bench(clf, f_valid)

clf = SVM(f_train)
f1_bench(clf, f_valid)

  'precision', 'predicted', average, warn_for)


f1 score: 0.10999999999999999
f1 score: 0.22872563336704382




f1 score: 0.1757513807786575
f1 score: 0.090311986863711


  'precision', 'predicted', average, warn_for)


### (b) Report the list of hyper-parameters [3 marks]
Report the list of hyper-parameters you considered for each classifier, their range,
as well as the best values for these hyper-parameters, chosen based on the validation
set performance. 

In [None]:
hyper_params = {
    'min_samples_split':10, 
    'splitter':"random", 
    'criterion':"gini"
}
clf = decision_tree(b_train, **hyper_params)

f1_bench(clf, b_valid)

hyper_params = { 
    'penalty':'l2', 
    'C':0.01, 
    'solver':'newton-cg', 
    'max_iter':1000, 
    'multi_class':'multinomial'
}
clf = logistic_regression(b_train, **hyper_params)

f1_bench(clf, b_valid)

hyper_params = {
    'gamma':'auto'
}
clf = SVM(b_train, **hyper_params)

f1_bench(clf, b_valid)

### (c) F1-score [3 marks]
Report the training, validation, and test F1-score for all the classifiers (with best
hyper-parameter configuration). 

### (d) Comment on the performance of different classifiers [3 mark]
Why did a particular classifier perform better than the rest? What was the role of the hyper-parameters in finding
the best results. 

### (e) Compare the performance with the binary bag-of-words based classifiers. [2 mark]

Why is
there a difference in the performance? Give a brief explanation comparing BBoW
Naive Bayes and FBoW Naive Bayes and similarly for other models. 

### (f)  Comment on the representation [2 mark]
Which representation is better? Why?

In [None]:
start_time = time.time()
print(f"--- {(time.time() - start_time)}s seconds ---")