In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

# Load the training data and build the two classification targets.
npf = pd.read_csv("data/npf_train.csv")

# Binary target: 0 for "nonevent" rows, 1 for every other class4 label.
# Derived from the label text rather than from a factorize code, because
# pd.factorize numbers labels in order of first appearance and the code
# assigned to "nonevent" would change if the rows were reordered.
is_nonevent = npf["class4"] == "nonevent"
if not is_nonevent.any():
    raise ValueError('no "nonevent" rows found in class4; check the label spelling')
npf["class2"] = (~is_nonevent).astype(int)

# Multiclass target: integer codes for the class4 labels.
npf["class4"], _ = pd.factorize(npf["class4"])
y2 = npf["class2"].values
y4 = npf["class4"].values

# Remove identifier/metadata columns and the targets themselves so that only
# feature columns remain.
remove = ["id", "date", "event", "partlybad", "class2", "class4"]
columns = [column for column in npf.columns if column not in remove]

# Force every feature column to numeric; unparseable entries become NaN.
npf = npf[columns].apply(pd.to_numeric, errors='coerce')
X = npf.values

# 80/20 split into rest/test, then 75/25 of the rest into train/validate
# (i.e. 60/20/20 overall). Both splits are seeded for reproducibility.
X_rest, X_test, y2_rest, y2_test, y4_rest, y4_test = train_test_split(X, y2, y4, test_size = 0.2, random_state = 42)
X_train, X_validate, y2_train, y2_validate, y4_train, y4_validate = train_test_split(X_rest, y2_rest, y4_rest, train_size= 0.75, random_state= 42)

# Candidate classifiers, in the same order as the display names used when the
# results table is built. Keep the two lists in sync when adding a model.
models = [
    # Baseline: always predicts the most frequent training label.
    DummyClassifier(strategy= "most_frequent"),
    # Seeded so the reported scores are reproducible across runs, matching the
    # fixed random_state used for the train/validate/test splits.
    RandomForestClassifier(random_state= 42),
    # MultinomialNB rejects negative feature values, so features are rescaled
    # to [0, 1] first.
    Pipeline([
        ('minmax', MinMaxScaler()),
        ('naive_bayes', MultinomialNB()),
    ]),
    SVC(),
    KNeighborsClassifier(10),
    # Standardize features before logistic regression; max_iter is raised well
    # above the default to give the solver room to converge.
    Pipeline([
        ('scaler', StandardScaler()),
        ('lr', LogisticRegression(max_iter= 10000)),
    ]),
]

def get_accuracy2(model):
    """Return the validation accuracy of `model` on the binary (class2) target.

    The model is fitted in place on the module-level training split
    (X_train, y2_train) and then scored against y2_validate using its
    predictions for X_validate.
    """
    model.fit(X_train, y2_train)
    predicted_labels = model.predict(X_validate)
    score = accuracy_score(y2_validate, predicted_labels)
    return score

def get_accuracy4(model):
    """Return the validation accuracy of `model` on the multiclass (class4) target.

    The model is fitted in place on the module-level training split
    (X_train, y4_train) and then scored against y4_validate using its
    predictions for X_validate.
    """
    model.fit(X_train, y4_train)
    predicted_labels = model.predict(X_validate)
    score = accuracy_score(y4_validate, predicted_labels)
    return score


# Build the comparison table: one row per model, with hold-out validation
# accuracy and mean 5-fold cross-validated accuracy for both targets.
# The display names must stay in the same order as the entries of `models`.
results = (
    pd.DataFrame({
        "model": ["Dummy", "RFC", "NB", "SVC", "KNN10", "LR"],
        "accuracy2": [get_accuracy2(clf) for clf in models],
        "accuracy4": [get_accuracy4(clf) for clf in models],
        "cv2": [
            cross_val_score(clf, X, y2, cv=5, scoring="accuracy").mean()
            for clf in models
        ],
        "cv4": [
            cross_val_score(clf, X, y4, cv=5, scoring="accuracy").mean()
            for clf in models
        ],
    })
    .set_index("model")
    .sort_values("accuracy2", ascending= False)  # best binary accuracy first
)

results

model,accuracy2,accuracy4,cv2,cv4
NB,0.903226,0.591398,0.799392,0.583988
RFC,0.892473,0.677419,0.864095,0.644367
LR,0.88172,0.645161,0.840439,0.612132
SVC,0.827957,0.494624,0.771318,0.55374
KNN10,0.827957,0.602151,0.801636,0.590416
Dummy,0.569892,0.430108,0.495699,0.5
