By Guillaume Thibault - Matricule 1948612

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from sklearn.neighbors import KNeighborsClassifier
data_path = './medical_dataset/'
submit_path = './hwk3_submit/'

# Medical Text Classification

In this assignment, we will build a classifier with medical-NLP corpus. This is a classification
task with the input as a medical transcription (text) and the output as the corresponding
medical transcript type. This is a clinical dataset which consists of a medical transcript from
one of the 4 classes {Surgery - 1 , Medical Records - 2, Internal Medicine - 3 and Other - 4}
as an input. The task is to classify the transcript (text) to the corresponding classes i.e the
transcript type. The dataset consists of 4000 transcripts in the training set and 500 in each
of the validation and test sets.

In [2]:
# Import the data
pre_train = pd.read_csv(data_path + 'train.csv', sep=",")
pre_valid = pd.read_csv(data_path + 'valid.csv', sep=",")
pre_test = pd.read_csv(data_path + 'test.csv', sep=",")
pre_train.head()

Unnamed: 0,label,text
0,2,"2-D STUDY,1. Mild aortic stenosis, widely calc..."
1,1,"PREOPERATIVE DIAGNOSES: , Dysphagia and esopha..."
2,2,"CHIEF COMPLAINT:, The patient comes for three..."
3,1,"PROCEDURE: , Bilateral L5, S1, S2, and S3 radi..."
4,2,"DISCHARGE DIAGNOSES:,1. Chronic obstructive pu..."


## 1. Vectorisation of natural language text [10 marks]  

Most of the algorithms described in the class take the input as a vector. However, the
reviews are natural language text of varying number of words. The first step would be to
convert this varying-length movie review to a fixed-length vector representation. We will
consider two different ways of vectorizing the natural language text: binary bag-of-
words representation and frequency bag-of-words representation

Instructions for dataset submission are given in the end of the assignment (do not include the dataset in the report). 


In [3]:
import string
import re
exclude = set(string.punctuation)

def add_word_occ_to_dict(words: str, occ_dict: dict) -> None:
    """
    Add word occurence to a dictionnary
    @params:
        - words (str): conatining the words separeted by ' '
        - occ_dict (dict): dictionnay containg the words as the keys and the occurence number as a value
    """
    word = words.split(' ')   
    sorted_= sorted(set(word))
    for val_sort in sorted_:
        if val_sort in occ_dict.keys():
            occ_dict[val_sort] += word.count(val_sort) 
        else:
            occ_dict[val_sort] = word.count(val_sort) 


def remove_ponctuaion(text: str) -> str:
    """
    Remove all ponctuations from a string
    @params: 
        - string (str): string to remove ponct.
    """
    text = re.sub(' +', ' ', text)
    return text.translate(str.maketrans('', '', string.punctuation))


def sort_tuple(tup, most_freq=True): 
    """
    key is set to sort using second element of sublist lambda has been used 
    """
    tup.sort(key = lambda x: x[1], reverse=most_freq) 
    return tup


def get_top_n_occ(df, n: int)-> list:
    occ = {}
    for index, row in df.iterrows():
        text = remove_ponctuaion(row[1]).lower() # Lower case all char
        add_word_occ_to_dict(text, occ)
    sorted_tup = sort_tuple([(k, v) for k, v in occ.items()])
    return [sorted_tup[i] for i in range(0, n)]

### 1.1 Binary Bag of words 

In [4]:
def binary_bag_of_word(df, n) -> pd.DataFrame:
    # Step 1 and 2
    top_n_words_tup = get_top_n_occ(df, n)
    cols = [k for k, v in top_n_words_tup]
    
    
    # Step 3
    bow_df = pd.DataFrame(columns=["label"] + cols)
    for index, row in df.iterrows():
        bow_row = [row[0]]
        text = remove_ponctuaion(row[1]).lower()
        word_set = sorted(set(text.split(' ')))
        for word in cols:
            if word in word_set:
                bow_row.append(1)
            else:
                bow_row.append(0)
        bow_df.loc[len(bow_df)] = bow_row
    return bow_df

In [5]:
start_time = time.time()

# Process dataset - Binary Bag of word
n = 10000
b_test = binary_bag_of_word(pre_test, n)  
b_train = binary_bag_of_word(pre_train, n)  
b_valid = binary_bag_of_word(pre_valid, n)  

b_train = b_train.loc[:, ~b_train.columns.str.contains('^Unnamed')]
b_test = b_test.loc[:, ~b_test.columns.str.contains('^Unnamed')]
b_valid = b_valid.loc[:, ~b_valid.columns.str.contains('^Unnamed')]

print(f"--- {(time.time() - start_time)}s seconds ---")

# Save bag of words dataframe
b_test.to_csv(submit_path + 'b_test.csv')
b_valid.to_csv(submit_path + 'b_valid.csv')
b_train.to_csv(submit_path + 'b_train.csv')

--- 919.4922947883606s seconds ---


In [5]:
# Extract bag of words dataframe from file
b_test = pd.read_csv(submit_path + 'b_test.csv', sep=",")
b_valid = pd.read_csv(submit_path + 'b_valid.csv', sep=",")
b_train = pd.read_csv(submit_path + 'b_train.csv', sep=",")

b_train = b_train.loc[:, ~b_train.columns.str.contains('^Unnamed')]
b_test = b_test.loc[:, ~b_test.columns.str.contains('^Unnamed')]
b_valid = b_valid.loc[:, ~b_valid.columns.str.contains('^Unnamed')]

### 1.2 Frequency Bag of words

In [6]:
def frequency_bag_of_word(df, n) -> pd.DataFrame:
    # Step 1 and 2
    top_n_words_tup = get_top_n_occ(df, n)
    cols = [k for k, v in top_n_words_tup]
    
    # Step 3
    bow_df = pd.DataFrame(columns=["label"] + cols)
    for index, row in df.iterrows():
        bow_row = [row[0]]
        text = remove_ponctuaion(row[1]).lower()
        word_set = sorted(set(text.split(' ')))
        for word_tup in top_n_words_tup:
            bow_row.append(text.count(word_tup[0]) / word_tup[1])
        bow_df.loc[len(bow_df)] = bow_row
        
    return bow_df

In [8]:
start_time = time.time()

# Process dataset - Frequency Bag of word
n = 10000
f_test = frequency_bag_of_word(pre_test, n)  
f_valid = frequency_bag_of_word(pre_valid, n)  
f_train = frequency_bag_of_word(pre_train, n)

f_train = f_train.loc[:, ~f_train.columns.str.contains('^Unnamed')]
f_test = f_test.loc[:, ~f_test.columns.str.contains('^Unnamed')]
f_valid = f_valid.loc[:, ~f_valid.columns.str.contains('^Unnamed')]

print(f"--- {(time.time() - start_time)}s seconds ---")

# Save bag of words dataframe
f_test.to_csv(submit_path + 'f_test.csv')
f_valid.to_csv(submit_path + 'f_valid.csv')
f_train.to_csv(submit_path + 'f_train.csv')

--- 585.1088128089905s seconds ---


In [7]:
# Extract bag of words dataframe from file
f_test = pd.read_csv(submit_path + 'f_test.csv', sep=",")
f_valid = pd.read_csv(submit_path + 'f_valid.csv', sep=",")
f_train = pd.read_csv(submit_path + 'f_train.csv', sep=",")

f_train = f_train.loc[:, ~f_train.columns.str.contains('^Unnamed')]
f_test = f_test.loc[:, ~f_test.columns.str.contains('^Unnamed')]
f_valid = f_valid.loc[:, ~f_valid.columns.str.contains('^Unnamed')]

### 1.3 Submission

In [8]:
def submit_vocab(vocab):
    with open('medical_nlp-vocab.txt', 'w') as f:
        for i in range(1, len(vocab) + 1):
            f.writelines(f"{vocab[i-1][0]}\t{i}\t{vocab[i-1][1]}\n")
            
def submit_set(prefix, df, vocab):
    cols = [k for k, v in vocab]
    
    with open(f'medical {prefix}-traint.txt', 'w') as f:
        for index, row in df.iterrows():
            for i in range(len(cols)):
                if row[i+1] != 0:
                    f.write(f'{i+1} ')
            f.write(f'\t{row[0]}\n')

In [19]:
n = 10000

submit_vocab(get_top_n_occ(pre_train, n))
submit_set('test', b_test, get_top_n_occ(pre_test, n)) 
submit_set('valid', b_valid, get_top_n_occ(pre_valid, n)) 
submit_set('train', b_train, get_top_n_occ(pre_train, n)) 

## 2 2. Models using binary bag-of-words [19 marks]

For this question, we will focus on the Medical-NLP dataset with binary bag-of-words
(BBoW) representation. We will use the F1-score as the evaluation metric for the entire
assignment. 

### (a) Random classifier [2 marks]
As a baseline, report the performance of the random classifier (a classifier which
classifies a review into a uniformly random class) and the majority-class classifier
(a classifier which computes the majority class in the training set and classifies all
test instances as that majority class). 

In [15]:
from sklearn.metrics import f1_score

In [18]:
# 1. Random classifier
def random_model_f1(df):
    n_value = len(df)
    possible_value = df['label'].unique()
    print(possible_value)
    pred_values = [ possible_value[int(np.random.uniform(low=0.0, high=0.9999, size=None) * len(possible_value))] for _ in range(n_value)]
    pred_values = np.array(pred_values)
    print(pred_values)
    real_values = df['label'].to_numpy()
    print(real_values)
    f1 = f1_score(real_values, pred_values, labels=possible_value, average='macro')
    return f1
    
def most_freq_model_f1(df):
    real_values = df['label'].to_numpy()
    counts = np.bincount(real_values)
    most_freq_value = np.argmax(counts)
    pred_values = [most_freq_value] * len(real_values)
    f1 = f1_score(real_values, pred_values, labels=df['label'].unique(), average='macro')
    return f1

def test_model(model, fn, df):
    print(f"f1 {model}_set score: {fn(df)}")
    

In [19]:
print(" ~~ Random Model ~~")
test_model('train', random_model_f1, b_train)
test_model('valid', random_model_f1, b_valid)
test_model('test', random_model_f1, b_test)

print("\n")

print(" ~~ Most Frequence Model ~~")
test_model('train', most_freq_model_f1, b_train)
test_model('valid', most_freq_model_f1, b_valid)
test_model('test', most_freq_model_f1, b_test)

 ~~ Random Model ~~
[2 1 3 4]
[1 4 2 ... 2 3 3]
[2 1 2 ... 1 4 1]
f1 train_set score: 0.23985281411732642
[2 3 1 4]
[4 3 1 2 4 4 3 3 2 4 2 2 1 3 3 3 1 3 1 1 4 1 4 4 2 2 2 3 2 2 3 1 3 4 1 2 4
 2 4 3 4 3 4 2 3 1 4 2 1 2 3 1 3 4 4 3 3 3 4 2 2 2 3 3 2 1 2 3 1 3 2 1 3 3
 2 4 2 4 4 2 2 4 4 1 4 1 2 4 4 4 2 2 4 3 4 1 3 2 4 4 1 3 1 4 3 3 2 1 1 4 1
 1 4 2 2 4 1 1 1 3 1 3 1 3 2 4 3 1 2 2 2 1 3 4 2 3 4 1 2 2 4 1 3 2 2 2 4 3
 4 1 3 3 4 3 4 4 3 1 1 1 1 1 4 3 3 3 1 3 2 4 2 3 3 4 1 1 1 3 1 4 1 1 3 2 2
 3 1 2 4 4 2 3 4 2 2 2 3 4 4 3 2 4 1 2 4 4 4 2 3 2 4 4 1 3 3 4 2 1 4 3 4 4
 4 4 4 1 2 3 2 1 2 3 1 2 1 3 3 2 4 2 1 3 1 2 3 2 4 3 1 2 3 1 3 4 2 4 3 4 1
 2 3 2 1 1 2 1 2 2 3 3 1 4 4 2 3 2 2 2 4 1 1 4 2 4 1 4 4 4 3 2 2 3 4 2 1 4
 4 2 4 1 4 2 2 4 3 3 4 1 4 3 1 1 2 3 3 4 3 3 3 4 3 2 4 3 2 3 3 3 2 1 4 3 2
 3 2 1 1 4 2 3 2 1 3 1 1 2 4 1 3 2 4 1 1 4 2 3 3 2 1 2 4 4 2 3 2 1 2 1 3 4
 3 1 1 3 3 4 2 2 4 4 2 3 4 1 1 4 4 1 2 4 1 4 2 2 2 3 4 1 4 1 1 3 2 2 4 1 1
 3 1 4 1 1 4 4 2 4 4 1 1 3 2 1 3 3 3 1 3 4 4 4 3 1 1 4 2 3 

  'precision', 'predicted', average, warn_for)


In [22]:
import sklearn
print(sklearn.__version__)

0.21.3


### (b) Naive Bayes, Decision Trees, Logistic regression and Linear SVM [8 marks]
Now train Naive Bayes, Decision Trees, Logistic regression and Linear SVM for this
task. 

[Note: You should do a thorough hyper-parameter tuning by using the given
validation set. Also, note that you should use the appropriate naive Bayes classifier
for binary input features (also called Bernoulli naive Bayes).] 

In [10]:
def extract_data(df):
    X = df.drop(columns=['label']).to_numpy()
    y = df['label'].to_numpy()
    return X, y

def f1_bench(clf, df, print_=True):
    # Prediction
    X, real_values = extract_data(df)
    pred_values = clf.predict(X)
    
    # f1 score
    f1 = f1_score(real_values, pred_values, labels=df['label'].unique(), average='macro')
    if print_:
        print(f"f1 score: {f1}")
    else:
        return f1
    

In [11]:
from sklearn.naive_bayes import BernoulliNB

In [14]:
X, y = extract_data(b_test)
print(type(y))

<class 'numpy.ndarray'>


In [17]:
def bernoulli_naive_bayes(df, **kwargs):
    X, y = extract_data(df)
    return BernoulliNB(**kwargs).fit(X, y)

clf = bernoulli_naive_bayes(b_train)

print(f"train : {f1_bench(clf, b_train, False)}")
print(f"valid : {f1_bench(clf, b_valid, False)}")
print(f"test : {f1_bench(clf, b_test, False)}")

train : 0.5225228733709496
valid : 0.19589150434516062
test : 0.18533185585758225


  'precision', 'predicted', average, warn_for)


In [18]:
from sklearn.tree import DecisionTreeClassifier

In [19]:
def decision_tree(df, **kwargs):
    X, y = extract_data(df)
    return DecisionTreeClassifier(**kwargs).fit(X, y)

clf = decision_tree(b_train)

print(f"train : {f1_bench(clf, b_train, False)}")
print(f"valid : {f1_bench(clf, b_valid, False)}")
print(f"test : {f1_bench(clf, b_test, False)}")

train : 0.8172281157316901
valid : 0.340993471532188
test : 0.33200323263278014


In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
def logistic_regression(df, **kwargs):
    X, y = extract_data(df)
    return LogisticRegression(**kwargs).fit(X, y)

clf = logistic_regression(b_train)

print(f"train : {f1_bench(clf, b_train, False)}")
print(f"valid : {f1_bench(clf, b_valid, False)}")
print(f"test : {f1_bench(clf, b_test, False)}")



train : 0.8254875067208343
valid : 0.25581763529845875
test : 0.256838068595216


In [22]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [23]:
def SVM(df, **kwargs):
    X, y = extract_data(df)
    clf = make_pipeline(StandardScaler(), SVC(**kwargs))
    clf.fit(X, y)
    return clf

clf = SVM(b_train)

print(f"train : {f1_bench(clf, b_train, False)}")
print(f"valid : {f1_bench(clf, b_valid, False)}")
print(f"test : {f1_bench(clf, b_test, False)}")

KeyboardInterrupt: 

### (c) Report the list of hyper-parameters [3 marks]
Report the list of hyper-parameters you considered for each classifier, their range,
as well as the best values for these hyper-parameters, chosen based on the validation
set performance1.

In [7]:
from sklearn.model_selection import GridSearchCV
X, y = extract_data(b_train)
print(y)


NameError: name 'extract_data' is not defined

In [411]:
# Naive bayes
X, y = extract_data(f_train)
param_DT_bbow = {
    'alpha': [1, 0.1, 0.01, 0.001, 1.0e-6, 1.0e-10],
}

DT_grid_search = GridSearchCV(estimator=BernoulliNB(), param_grid=param_DT_bbow, refit=False, verbose=3, n_jobs=-1)
DT_grid_search.fit(X, y)

print(DT_grid_search.best_params_)

clf = bernoulli_naive_bayes(f_train, **DT_grid_search.best_params_)
print("Naive Bayes")
print(f"train : {f1_bench(clf, f_train, False)}")
print(f"valid : {f1_bench(clf, f_valid, False)}")
print(f"test : {f1_bench(clf, f_test, False)}")

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  18 | elapsed:   10.5s remaining:    8.4s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   12.4s finished


{'alpha': 1}
Naive Bayes
train : 0.5171964299159018
valid : 0.10999999999999999
test : 0.11841285063533924


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [423]:
X, y = extract_data(f_train)
param_DT_bbow = {
    'criterion': ["gini", "entropy"],
    'splitter': ["best", "random"],
    'max_depth': [None, 4000],
    'min_samples_split': [2, 4, 10]
}

DT_grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_DT_bbow, refit=False, verbose=3)
DT_grid_search.fit(X, y)

print(DT_grid_search.best_params_)

clf = decision_tree(b_train, **DT_grid_search.best_params_)
print("Decision tree")
print(f"train : {f1_bench(clf, b_train, False)}")
print(f"valid : {f1_bench(clf, b_valid, False)}")
print(f"test : {f1_bench(clf, b_test, False)}")

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] criterion=gini, max_depth=None, min_samples_split=2, splitter=best 
[CV]  criterion=gini, max_depth=None, min_samples_split=2, splitter=best, score=0.688, total=   5.5s
[CV] criterion=gini, max_depth=None, min_samples_split=2, splitter=best 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.5s remaining:    0.0s


[CV]  criterion=gini, max_depth=None, min_samples_split=2, splitter=best, score=0.692, total=   4.9s
[CV] criterion=gini, max_depth=None, min_samples_split=2, splitter=best 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   10.5s remaining:    0.0s


[CV]  criterion=gini, max_depth=None, min_samples_split=2, splitter=best, score=0.692, total=   5.1s
[CV] criterion=gini, max_depth=None, min_samples_split=2, splitter=random 
[CV]  criterion=gini, max_depth=None, min_samples_split=2, splitter=random, score=0.699, total=   4.8s
[CV] criterion=gini, max_depth=None, min_samples_split=2, splitter=random 
[CV]  criterion=gini, max_depth=None, min_samples_split=2, splitter=random, score=0.695, total=   4.7s
[CV] criterion=gini, max_depth=None, min_samples_split=2, splitter=random 
[CV]  criterion=gini, max_depth=None, min_samples_split=2, splitter=random, score=0.696, total=   4.8s
[CV] criterion=gini, max_depth=None, min_samples_split=4, splitter=best 
[CV]  criterion=gini, max_depth=None, min_samples_split=4, splitter=best, score=0.691, total=   5.5s
[CV] criterion=gini, max_depth=None, min_samples_split=4, splitter=best 
[CV]  criterion=gini, max_depth=None, min_samples_split=4, splitter=best, score=0.682, total=   5.2s
[CV] criterion=gi

[CV]  criterion=entropy, max_depth=None, min_samples_split=10, splitter=best, score=0.677, total=   4.4s
[CV] criterion=entropy, max_depth=None, min_samples_split=10, splitter=best 
[CV]  criterion=entropy, max_depth=None, min_samples_split=10, splitter=best, score=0.677, total=   4.2s
[CV] criterion=entropy, max_depth=None, min_samples_split=10, splitter=best 
[CV]  criterion=entropy, max_depth=None, min_samples_split=10, splitter=best, score=0.688, total=   4.1s
[CV] criterion=entropy, max_depth=None, min_samples_split=10, splitter=random 
[CV]  criterion=entropy, max_depth=None, min_samples_split=10, splitter=random, score=0.708, total=   4.2s
[CV] criterion=entropy, max_depth=None, min_samples_split=10, splitter=random 
[CV]  criterion=entropy, max_depth=None, min_samples_split=10, splitter=random, score=0.736, total=   3.9s
[CV] criterion=entropy, max_depth=None, min_samples_split=10, splitter=random 
[CV]  criterion=entropy, max_depth=None, min_samples_split=10, splitter=random, 

[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:  5.5min finished


Decision tree
train : 0.7899652463968564
valid : 0.29670788763116773
test : 0.30538861473283374


In [None]:
X, y = extract_data(f_train)
# param_DT_bbow = {
#     'solver': ['newton-cg'], 
#     'penalty': ['l2', 'none'], 
#     'C': [1.0, 0.1], 
#     'dual': [False],
#     'multi_class': ['multinomial']
# }

# DT_grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param_DT_bbow, refit=False, verbose=3)
# DT_grid_search.fit(X, y)

# print(DT_grid_search.best_params_)

# clf = logistic_regression(b_train, **DT_grid_search.best_params_)
# print("Decision tree")
# print(f"train : {f1_bench(clf, b_train, False)}")
# print(f"valid : {f1_bench(clf, b_valid, False)}")
# print(f"test : {f1_bench(clf, b_test, False)}")

param_DT_bbow = {
#     'solver': ['lbfgs'], 
#     'penalty': ['l2', 'elasticnet', 'none'], 
#     'C': [1.0, 0.1], 
#     'dual': [False],
#     'multi_class': ['multinomial']
    'C':0.1, 'multi_class':'multinomial', 'penalty':'none', 'solver':'newton-cg'
}

clf = logistic_regression(b_train, **param_DT_bbow)
print("Decision tree")
print(f"train : {f1_bench(clf, b_train, False)}")
print(f"valid : {f1_bench(clf, b_valid, False)}")
print(f"test : {f1_bench(clf, b_test, False)}")

  "Setting penalty='none' will ignore the C and l1_ratio "


In [332]:
hyper_params = {
    'gamma':'auto',
    'kernel': 'sigmoid',
    'degree': 5,
    'coef0': 1.0,
    'shrinking': True,
    'probability': False,
    'tol': 1e-3,
    'cache_size': 200,
    'max_iter': -1,
    'random_state': None
}
clf = SVM(b_train, **hyper_params)

print("SVM")
print(f"train : {f1_bench(clf, b_train, False)}")
print(f"valid : {f1_bench(clf, b_valid, False)}")
print(f"test : {f1_bench(clf, b_test, False)}")

f1 score: 0.2560252834861072


### (d) F1-score [3 marks]
Report the training, validation, and test F1-score for all the classifiers  (with best hyper-parameter configuration). 

In [346]:
# Naive bayes
hyper_params = {
    'alpha': 1.0e-10,
    'binarize': 0.0,
    'fit_prior': True
}
clf = bernoulli_naive_bayes(b_train, **hyper_params)

print("Naive Bayes")
print(f"train : {f1_bench(clf, b_train, False)}")
print(f"valid : {f1_bench(clf, b_valid, False)}")
print(f"test : {f1_bench(clf, b_test, False)}")

hyper_params = {
    'criterion':"gini",
    'splitter':"best", 
    'max_depth': 4000,
    'min_samples_split': 2, 
    'min_samples_leaf' : 10,
    'min_weight_fraction_leaf': 0.0,
    'max_features': None,
    'random_state': 1000,
    'max_leaf_nodes': 100,
    'min_impurity_decrease': 0.0,
    'class_weight': None
}
clf = decision_tree(b_train, **hyper_params)

print("\nDecision Tree")
print(f"train : {f1_bench(clf, b_train, False)}")
print(f"valid : {f1_bench(clf, b_valid, False)}")
print(f"test : {f1_bench(clf, b_test, False)}")

hyper_params = { 
    'solver':'newton-cg', 
    'penalty': 'l2', 
    'C':1.0, 
    'dual': False,
    'tol': 1e-4,
    'fit_intercept': True,
    'random_state': None,
    'intercept_scaling': 1,
    'max_iter':100, 
    'multi_class':'multinomial'
}
clf = logistic_regression(b_train, **hyper_params)

print("\nLog Reg")
print(f"train : {f1_bench(clf, b_train, False)}")
print(f"valid : {f1_bench(clf, b_valid, False)}")
print(f"test : {f1_bench(clf, b_test, False)}")

hyper_params = {
    'gamma':'auto',
    'kernel': 'sigmoid',
    'degree': 5,
    'coef0': 1.0,
    'shrinking': True,
    'probability': False,
    'tol': 1e-3,
    'cache_size': 200,
    'max_iter': -1,
    'random_state': None
}
clf = SVM(b_train, **hyper_params)

print("\nSVM")
print(f"train : {f1_bench(clf, b_train, False)}")
print(f"valid : {f1_bench(clf, b_valid, False)}")
print(f"test : {f1_bench(clf, b_test, False)}")

Naive Bayes
train : 0.5761029784481652
valid : 0.21121649184149183
test : 0.20312014064619555
Decision Tree
train : 0.6956109917518675
valid : 0.3613305487071276
test : 0.26362123164636053
Log Reg
train : 0.8307231436953693
valid : 0.262772782271148
test : 0.25139234113604436
SVM
train : 0.5018864077080809
valid : 0.2560252834861072
test : 0.28256980681798927


### (e) Comment on the performance of different classifiers. [3 marks]

Why did a particular classifier
perform better than the rest? What was the role of the hyper-parameters in finding
the best results. 

## 3. Model using frequency bag-of-words [21 marks]

Now we will repeat question 2 but with frequency bag-of-words (FBoW) representation.

### (a) Naive Bayes, Decision Tree, Logistic regression and Linear SVM  [8 marks]
Train Naive Bayes, Decision Tree, Logistic regression and Linear SVM for this task.

[Note: Again, you should do a thorough hyper-parameter tuning by using the given
validation set. Also, note that you should use the appropriate naive Bayes classifier
for real valued input features (also called Gaussian naive Bayes).] [8 marks]

In [351]:
from sklearn.naive_bayes import GaussianNB

def gaussian_naive_bayes(df, **kwargs):
    X, y = extract_data(df)
    return GaussianNB(**kwargs).fit(X, y)

In [353]:
clf = bernoulli_naive_bayes(f_train)
print("Naivie bayes")
print(f"train : {f1_bench(clf, f_train, False)}")
print(f"valid : {f1_bench(clf, f_valid, False)}")
print(f"test : {f1_bench(clf, f_test, False)}")

clf = decision_tree(f_train)
print("Decision Tree")
print(f"train : {f1_bench(clf, f_train, False)}")
print(f"valid : {f1_bench(clf, f_valid, False)}")
print(f"test : {f1_bench(clf, f_test, False)}")

clf = logistic_regression(f_train)
print("Log Reg")
print(f"train : {f1_bench(clf, f_train, False)}")
print(f"valid : {f1_bench(clf, f_valid, False)}")
print(f"test : {f1_bench(clf, f_test, False)}")

clf = SVM(f_train)
print("SVM")
print(f"train : {f1_bench(clf, f_train, False)}")
print(f"valid : {f1_bench(clf, f_valid, False)}")
print(f"test : {f1_bench(clf, f_test, False)}")

Naivie bayes
train : 0.5171964299159018
valid : 0.10999999999999999
test : 0.11841285063533924


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Decision Tree
train : 1.0
valid : 0.21554428491806374
test : 0.17420969070172534




Log Reg
train : 0.6488389753831039
valid : 0.1757513807786575
test : 0.07911457265833162


  'precision', 'predicted', average, warn_for)


SVM
train : 0.6669423820274355


  'precision', 'predicted', average, warn_for)


valid : 0.090311986863711
test : 0.07698815566835872


  'precision', 'predicted', average, warn_for)


### (b) Report the list of hyper-parameters [3 marks]
Report the list of hyper-parameters you considered for each classifier, their range,
as well as the best values for these hyper-parameters, chosen based on the validation
set performance. 

In [358]:
X, y = extract_data(f_train)
param_DT_bbow = {
    'criterion': ["gini", 'entropy'],
    'splitter': ["best", "random"],
    'max_depth': [None, 1000, 4000],
    'min_samples_split': [2, 10], 
    'min_samples_leaf' : [1, 10],
    'max_features': [None, 1000],
    'random_state': [None, 1000]
}

DT_grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_DT_bbow, refit=False, verbose=3, n_jobs=-1)
DT_grid_search.fit(X, y)

print(DT_grid_search.best_params_)
clf = decision_tree(f_train, **DT_grid_search.best_params_)
print(f"train : {f1_bench(clf, f_train, False)}")
print(f"valid : {f1_bench(clf, f_valid, False)}")
print(f"test : {f1_bench(clf, f_test, False)}")

Fitting 3 folds for each of 192 candidates, totalling 576 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   36.3s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 576 out of 576 | elapsed: 11.0min finished


{'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 10, 'min_samples_split': 2, 'random_state': None, 'splitter': 'random'}
train : 0.8129893829433428
valid : 0.1966441340149671
test : 0.20046829222817086


In [None]:
model_visualization(X_train_clf,y_train_clf, DT_grid_search,'Logistic Regression-Training')

In [None]:
train = f_train
valid = f_valid
test = f_test

# Naive bayes
hyper_params = {
    'alpha': 1.0e-10,
    'binarize': 0.0,
    'fit_prior': True
}
clf = bernoulli_naive_bayes(f_train, **hyper_params)

print("Naive Bayes")
print(f"train : {f1_bench(clf, b_train, False)}")
print(f"valid : {f1_bench(clf, b_valid, False)}")
print(f"test : {f1_bench(clf, b_test, False)}")

hyper_params = {
    'criterion':"gini",
    'splitter':"best", 
    'max_depth': 4000,
    'min_samples_split': 2, 
    'min_samples_leaf' : 10,
    'min_weight_fraction_leaf': 0.0,
    'max_features': None,
    'random_state': 1000,
    'max_leaf_nodes': 100,
    'min_impurity_decrease': 0.0,
    'class_weight': None
}
clf = decision_tree(b_train, **hyper_params)

print("\nDecision Tree")
print(f"train : {f1_bench(clf, b_train, False)}")
print(f"valid : {f1_bench(clf, b_valid, False)}")
print(f"test : {f1_bench(clf, b_test, False)}")

hyper_params = { 
    'solver':'newton-cg', 
    'penalty': 'l2', 
    'C':1.0, 
    'dual': False,
    'tol': 1e-4,
    'fit_intercept': True,
    'random_state': None,
    'intercept_scaling': 1,
    'max_iter':100, 
    'multi_class':'multinomial'
}
clf = logistic_regression(b_train, **hyper_params)

print("\nLog Reg")
print(f"train : {f1_bench(clf, b_train, False)}")
print(f"valid : {f1_bench(clf, b_valid, False)}")
print(f"test : {f1_bench(clf, b_test, False)}")

hyper_params = {
    'gamma':'auto',
    'kernel': 'sigmoid',
    'degree': 5,
    'coef0': 1.0,
    'shrinking': True,
    'probability': False,
    'tol': 1e-3,
    'cache_size': 200,
    'max_iter': -1,
    'random_state': None
}
clf = SVM(b_train, **hyper_params)

print("\nSVM")
print(f"train : {f1_bench(clf, b_train, False)}")
print(f"valid : {f1_bench(clf, b_valid, False)}")
print(f"test : {f1_bench(clf, b_test, False)}")

### (c) F1-score [3 marks]
Report the training, validation, and test F1-score for all the classifiers (with best
hyper-parameter configuration). 

### (d) Comment on the performance of different classifiers [3 mark]
Why did a particular classifier perform better than the rest? What was the role of the hyper-parameters in finding
the best results. 

### (e) Compare the performance with the binary bag-of-words based classifiers. [2 mark]

Why is
there a difference in the performance? Give a brief explanation comparing BBoW
Naive Bayes and FBoW Naive Bayes and similarly for other models. 

### (f)  Comment on the representation [2 mark]
Which representation is better? Why?

In [None]:
start_time = time.time()
print(f"--- {(time.time() - start_time)}s seconds ---")

Unnamed: 0.1,Unnamed: 0,label,the,and,was,of,to,a,with,in,...,arthritis4,junctions,amoxil,prevented,3000,hypertensionallergies,approval,25mm,tapers,c5c6postoperative
0,0,2.0,1.7e-05,3e-05,0.0,2.1e-05,7.4e-05,0.00128,3.5e-05,0.000192,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1.0,0.000342,0.000135,0.000285,0.000186,0.000394,0.004675,0.000211,0.000882,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2.0,9.4e-05,0.000255,3.6e-05,0.000145,0.000296,0.005776,0.000141,0.001764,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,1.0,0.000461,0.000359,0.000374,0.00029,0.000542,0.007176,0.000423,0.002493,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,2.0,0.00012,0.000195,0.000125,0.000414,0.000517,0.004883,0.000388,0.000997,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [341]:
param_DT_bbow = {
    'max_depth': [None, 100, 4000],
    'min_samples_split': [2, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf' : [10, 100],
    'random_state': [10, 100, 1000],
    'max_leaf_nodes': [10, 100, 1000]
}

DT_grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_DT_bbow, refit=False, verbose=3, n_jobs=-1)
DT_grid_search.fit(X, y)

print(DT_grid_search.best_params_)
clf = decision_tree(b_train, **DT_grid_search.best_params_)
f1_bench(clf, b_valid)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  9.1min finished


{'max_depth': None, 'max_leaf_nodes': 100, 'min_samples_leaf': 100, 'min_samples_split': 2, 'random_state': 10}
f1 score: 0.32989399205709746


In [6]:
a = [1,2,3,4]
print(sum(a))
a = a / sum(a)
print(a)

10


TypeError: unsupported operand type(s) for /: 'list' and 'int'