# Classification using NB, SVM, RF

In [None]:
import pandas as pd
import numpy as np
import utipy as ut
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import ComplementNB, BernoulliNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn import naive_bayes
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score, GridSearchCV

In [None]:
# Upper bound of the n-gram range used by the test-cell CountVectorizer and
# embedded in the name of the exported predictions file.
ngrams_upper_limit = 3
# When True, clean_sentence() expands every word into subword tokens.
use_subwords = True

In [None]:
project_path = 
# Folder holding the preprocessed CSV files. Paths are built by plain string
# concatenation, so project_path must end with a path separator.
dpath = project_path + "data/preprocessed/"
# Dataset variant to load; the prefix is also reused in the output file name.
prefix = "upsampled_"  # alternatives: "", "upsampled_", "iscontrol_downsampled_"
data = pd.read_csv(dpath + prefix + "grouped_for_tf.csv")
# Stop words: first column of a header-less text file, as a plain Python list.
stopwords = list(pd.read_csv(project_path+"stopwords_list_DK.txt", header=None)[0])

In [None]:
# Binary target: 1 where Diagnosis equals "Control", 0 otherwise.
data["isControl"] = (data["Diagnosis"] == "Control").astype(int)

In [None]:
# Preview the first rows of the loaded data.
data.head()

In [None]:
# Peek at the first eight stop words.
stopwords[0:8]

In [None]:
def flatten(l):
    """Concatenate the items of every sub-iterable in ``l`` into one flat list.

    Only one level of nesting is removed.

    Args:
        l: An iterable of iterables.

    Returns:
        A new list with the items of each sub-iterable, in order.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

## Subword utils

In [None]:
def split_to_subwords(w, lower_limit, upper_limit):
    """Split a word into character-window subword tokens.

    Windows are produced with ``ut.window`` for every size from
    ``lower_limit`` up to ``min(upper_limit, len(w))``, plus one extra pass
    with the full word length if that size is not already included. Every
    piece that differs from the whole word is prefixed with ``"##"``.

    Args:
        w: The word to split.
        lower_limit: Smallest window size.
        upper_limit: Largest window size (capped at the word length).

    Returns:
        A list of subword strings; an empty list when ``w`` is empty.
    """
    n_chars = len(w)
    if n_chars < 1:
        return []

    # A window can never be longer than the word itself.
    largest = min(upper_limit, n_chars)
    window_sizes = list(range(lower_limit, largest + 1))
    if n_chars not in window_sizes:
        # Make sure the complete word is emitted as well.
        window_sizes.append(n_chars)

    pieces = []
    for size in window_sizes:
        # NOTE(review): element [0] is taken to be the list of windows
        # returned by utipy's window() - confirm against the utipy docs.
        windows = ut.window(list(w), size=size, discard_shorts=False)[0]
        for chars in windows:
            piece = "".join(chars)
            pieces.append(piece if piece == w else "##" + piece)
    return pieces

# Quick manual check of the subword splitter (window sizes 2 to 3).
split_to_subwords("monster", 2, 3)

In [None]:
def subword_tokenizer(x, lower_limit=2, upper_limit=3):
    """Turn a sentence (or a list of words) into a flat list of subwords.

    Args:
        x: Either a list of words, or a string that is split on single
            spaces to obtain the words.
        lower_limit: Smallest window size passed to ``split_to_subwords``.
        upper_limit: Largest window size passed to ``split_to_subwords``.

    Returns:
        The subwords of every word, concatenated in word order.
    """
    words = x if isinstance(x, list) else x.split(" ")
    subwords = []
    for word in words:
        subwords.extend(split_to_subwords(word, lower_limit, upper_limit))
    return subwords

# Quick manual check of the tokenizer with its default window sizes.
subword_tokenizer("Jeg er en dejlig kat !")
        

## Preprocessing

In [None]:
def clean_sentence(x, stop_words=None, add_subwords=False, add_padding=False, pad_to=25):
    """Lowercase a sentence, drop unwanted tokens and join it back together.

    The sentence is lowercased and split on single spaces. Tokens that are
    one of a fixed set of punctuation marks, or that appear in
    ``stop_words``, are removed. Stop words are compared as given, i.e. they
    are not lowercased here.

    Args:
        x: The sentence to clean.
        stop_words: Optional iterable of (hashable) tokens to remove.
        add_subwords: If True, replace the remaining tokens with the output
            of ``subword_tokenizer`` (window sizes 2 to 3).
        add_padding: If True, append ``"PAD"`` tokens until the sentence has
            ``pad_to`` tokens. Longer sentences are left untouched.
        pad_to: Target number of tokens when padding.

    Returns:
        The remaining tokens joined with single spaces.
    """
    # A set gives constant-time membership tests; with a list every token
    # was compared against the complete stop-word list.
    disallowed_tokens = {".", "?", "!", ",", "-", "..."}
    if stop_words is not None:
        disallowed_tokens.update(stop_words)

    tokens = [t for t in x.lower().split(" ") if t not in disallowed_tokens]
    if add_subwords:
        tokens = subword_tokenizer(tokens, 2, 3)
    if add_padding:
        # A negative repeat count yields an empty list, so nothing is
        # truncated when the sentence is already long enough.
        tokens += ["PAD"] * (pad_to - len(tokens))
    return " ".join(tokens)

In [None]:
# Preprocess each sentence: overwrite the transcript column in place with its
# cleaned version (stop words removed, optionally expanded into subwords).
data["Transcript.Split"] = [clean_sentence(trscpt, stop_words=stopwords, add_subwords=use_subwords) \
                            for trscpt in data["Transcript.Split"]]
    

## Tests

In [None]:
# For testing functions: hold out fold 1 as the test split and use every
# other fold as the training split.
train_data = data[data["Fold"] != 1]
test_data = data[data["Fold"] == 1]

# Show the first three cleaned training transcripts.
train_data["Transcript.Split"].head(3)

In [None]:
# Cleaned transcript strings of each split as NumPy arrays.
X_train = np.asarray(train_data["Transcript.Split"])
# Fixed: the test features used to be read from train_data, which made the
# "test" set a copy of the training set and misaligned it with y_test.
X_test = np.asarray(test_data["Transcript.Split"])


In [None]:
# Inspect the first three test transcripts.
X_test[:3]

In [None]:
# Bag-of-words features: the vocabulary (at most 50000 n-grams of length
# 1..ngrams_upper_limit) is fitted on the training split only and then reused
# to encode the test split.
# NOTE(review): CountVectorizer's default token_pattern appears to keep only
# runs of two or more word characters, so the "##" subword prefix is probably
# stripped at this point - confirm against the scikit-learn documentation.
count_vec = CountVectorizer(max_features=50000, ngram_range=(1, ngrams_upper_limit))
X_train_bow = count_vec.fit_transform(X_train)
X_test_bow = count_vec.transform(X_test)
# Both matrices should have the same number of columns (the vocabulary size).
print(X_train_bow.shape)
print(X_test_bow.shape)

In [None]:
# Diagnosis labels of each split, converted to strings.
y_train = np.asarray(list(map(str, train_data["Diagnosis"])))
y_test = np.asarray(list(map(str, test_data["Diagnosis"])))

In [None]:
# Fit a multinomial naive Bayes model on the bag-of-words training features.
MNBclf_bow = MultinomialNB()
MNBclf_bow.fit(X_train_bow, y_train)

In [None]:
# Fit a complement naive Bayes model on the same training features.
CNBclf_bow = ComplementNB()
CNBclf_bow.fit(X_train_bow, y_train)

In [None]:
def cv_single(data, current_fold, fold_col="Fold", classifiers=None,
              max_features=10000, ngram_range=(1, 1)):
    """Fit and evaluate every classifier on a single cross-validation fold.

    Rows whose ``fold_col`` value equals ``current_fold`` form the test set;
    all remaining rows form the training set. The ``"Transcript.Split"``
    column is expected to be cleaned already. Each classifier is fitted
    twice on bag-of-words counts: once on the ``"Diagnosis"`` labels and once
    on the binary ``"isControl"`` labels.

    The module-level ``use_subwords`` flag is read and stored in the
    ``"Subwords"`` column of the result.

    Args:
        data: DataFrame with the columns ``fold_col``, ``"Transcript.Split"``,
            ``"Diagnosis"`` and ``"isControl"``.
        current_fold: Fold value to hold out as the test set.
        fold_col: Name of the column holding the fold assignment.
        classifiers: Mapping from a classifier name to a zero-argument
            callable (a class or a lambda) returning a fresh, unfitted
            estimator.
        max_features: Vocabulary size limit passed to ``CountVectorizer``.
        ngram_range: N-gram range passed to ``CountVectorizer``.

    Returns:
        A DataFrame with one row per test sample and classifier, holding the
        fold, classifier name, both predictions and both targets.

    Raises:
        ValueError: If ``classifiers`` is None or empty.
    """
    # Fail early with a clear message; an empty mapping would otherwise only
    # surface as an opaque error from pd.concat() after all the fitting work.
    if not classifiers:
        raise ValueError(
            "cv_single() needs at least one classifier factory in `classifiers`.")

    # Split data in train/test
    train_data = data[data[fold_col] != current_fold]
    test_data = data[data[fold_col] == current_fold]

    # Sentences were cleaned outside (stop words and some punctuation removed)
    X_train = np.asarray(train_data["Transcript.Split"])
    X_test = np.asarray(test_data["Transcript.Split"])

    # Vectorize features (tokens); the vocabulary is learned on the training
    # split only so nothing leaks from the held-out fold.
    count_vec = CountVectorizer(max_features=max_features, ngram_range=ngram_range)
    X_train_bow = count_vec.fit_transform(X_train)
    X_test_bow = count_vec.transform(X_test)

    # Prepare labels for diagnosis classification
    y_train_diagnosis = np.asarray([str(label) for label in train_data["Diagnosis"]])
    y_test_diagnosis = np.asarray([str(label) for label in test_data["Diagnosis"]])

    # Prepare labels for isControl classification
    y_train_iscontrol = np.asarray(list(train_data["isControl"]))
    y_test_iscontrol = np.asarray(list(test_data["isControl"]))

    # Fit each classifier; every factory is called once per task so the two
    # tasks never share an estimator instance.
    fitted_models_diagnosis = {key: make_clf().fit(X_train_bow, y_train_diagnosis)
                               for key, make_clf in classifiers.items()}
    fitted_models_iscontrol = {key: make_clf().fit(X_train_bow, y_train_iscontrol)
                               for key, make_clf in classifiers.items()}

    # Predict test set with each classifier
    per_classifier = [
        pd.DataFrame({
            "Fold": current_fold,
            "Classifier": key,
            "DiagnosisPrediction": diagnosis_model.predict(X_test_bow),
            "IsControlPrediction": fitted_models_iscontrol[key].predict(X_test_bow),
            "Target": y_test_diagnosis,
            "isControl": y_test_iscontrol,
            "Subwords": int(use_subwords),
        })
        for key, diagnosis_model in fitted_models_diagnosis.items()
    ]

    return pd.concat(per_classifier)
    
    
    
    

In [None]:
# Smoke test on a single fold (fold 1). Every value is a zero-argument
# factory: either the estimator class itself or a lambda with fixed settings.
cv_single(data, current_fold=1, fold_col="Fold", classifiers={"ComplementNB": ComplementNB,
                                                              "MultinomialNB": MultinomialNB,
                                                              "BernoulliNB": BernoulliNB,
                                                              "RandomForestClassifier": RandomForestClassifier,
                                                              "LinearSVC":lambda : LinearSVC(dual=False, max_iter=3000)})

In [None]:
def cross_validate(data, fold_col="Fold", classifiers=None):
    """Run ``cv_single`` for every fold and stack the predictions.

    Args:
        data: DataFrame passed through to ``cv_single``.
        fold_col: Name of the column holding the fold assignment. Each
            distinct value is held out once as the test fold.
        classifiers: Mapping from a classifier name to a zero-argument
            callable returning an unfitted estimator. Defaults to a single
            ``ComplementNB`` classifier.

    Returns:
        The per-fold prediction frames concatenated into one DataFrame with
        a fresh 0..n-1 index.
    """
    # A None sentinel avoids sharing one mutable default dict between calls.
    if classifiers is None:
        classifiers = {"ComplementNB": ComplementNB}

    fold_predictions = [
        cv_single(data, current_fold=current_fold,
                  fold_col=fold_col, classifiers=classifiers)
        for current_fold in np.unique(data[fold_col])
    ]
    return pd.concat(fold_predictions).reset_index(drop=True)

In [None]:
# Full cross-validation over all folds. Lambdas are used where an estimator
# needs non-default settings, because each value must be callable with no
# arguments.
cv_results = cross_validate(data, fold_col="Fold", 
                            classifiers={"ComplementNB": ComplementNB,
                                         "MultinomialNB": MultinomialNB,
                                         "BernoulliNB": BernoulliNB,
                                         "RandomForestClassifier": lambda : RandomForestClassifier(n_estimators=100),
                                         "LinearSVC":lambda : LinearSVC(dual=False, max_iter=3000)
                                         # "MLPClassifier": MLPClassifier # Too slow
                                        })

In [None]:
# Encode the run settings (dataset prefix, n-gram limit, subword flag) in the
# name of the exported predictions file.
# NOTE(review): ngrams_upper_limit is part of the file name, but cv_single()
# builds its CountVectorizer without an ngram_range argument, so the saved
# predictions may not actually use that n-gram limit - confirm.
sbw = "_subwords_" if use_subwords else "_"
cv_results.to_csv(project_path+prefix+"ngrams_"+str(ngrams_upper_limit)+sbw+"NB_predictions.csv")