In [1]:
import numpy as np
import pandas as pd

from glob import glob

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
datasets = dict()
for d in ('', '_basic', '_classic', '_classic_nos_nol'):
    train = pd.read_pickle(f'../data/datasets/train{d}.pkl')
    val = pd.read_pickle(f'../data/datasets/val{d}.pkl')
    d = 'nopreproc' if d == '' else d[1:]
    datasets[d] = (train, val)

In [3]:
names = [
    "Logistic Regression", "SGD Classifier", "KNN", "Linear SVM", "RBF SVM",
    "Decision Tree", "Random Forest", "AdaBoost", "Naive Bayes",
    "QDA"
]

classifiers = [
    LogisticRegression(),
    SGDClassifier(alpha=.0001, max_iter=100),
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()
]

In [4]:
encoder = LabelEncoder()
lbencoder = LabelBinarizer()
vectorizer = TfidfVectorizer(
    sublinear_tf = True, max_df = 0.5,
    stop_words = None)

In [None]:
scores = list()
i = 1
for dataname, data in datasets.items():
    y_train, x_train = data[0].label, data[0].tweet
    y_val, x_val = data[1].label, data[1].tweet
    
    y_train = encoder.fit_transform(y_train)
    y_val = encoder.transform(y_val)
    
    x_train = vectorizer.fit_transform(x_train)
    x_val = vectorizer.transform(x_val)
    
    for name, clf in zip(names, classifiers):
        print(f'{i} of 40: {dataname} with {name}' , end = '\r', flush = True)
        try:
            clf.fit(x_train, y_train)
            y_pred = clf.predict(x_val)
        except TypeError:
            clf.fit(x_train.toarray(), y_train)
            y_pred = clf.predict(x_val.toarray())
        scores.append({
            "data": dataname,
            "model": name,
            "accuracy": accuracy_score(y_val, y_pred),
            "f1_score": f1_score(y_val, y_pred, average = "macro"),
            "auc": roc_auc_score(
                lbencoder.fit_transform(y_val),
                lbencoder.fit_transform(y_pred),
                average = "macro", multi_class = 'ovr')
        })
        i += 1

In [6]:
scores = pd.DataFrame(scores).sort_values(by = ['f1_score', 'auc'], ascending = False)

In [8]:
scores.sort_values(by = ['f1_score', 'accuracy', 'auc'], ascending = False)

Unnamed: 0,data,model,accuracy,f1_score,auc
21,classic,SGD Classifier,0.709375,0.608302,0.717033
1,nopreproc,SGD Classifier,0.671094,0.582889,0.697898
31,classic_nos_nol,SGD Classifier,0.675781,0.573454,0.693037
11,basic,SGD Classifier,0.671875,0.56049,0.690356
20,classic,Logistic Regression,0.698438,0.517371,0.662689
30,classic_nos_nol,Logistic Regression,0.675,0.494052,0.646813
12,basic,KNN,0.610938,0.487661,0.654789
2,nopreproc,KNN,0.610938,0.48692,0.659867
22,classic,KNN,0.609375,0.477684,0.646555
32,classic_nos_nol,KNN,0.610156,0.475462,0.645948
