In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

# Read data and split into train and test sets

In [None]:
# Load the preprocessed feature table.
X = pd.read_csv(
    'processed_data.csv',
    index_col=0,  # first CSV column becomes the row index
)

In [None]:
# Load the label table that accompanies the features.
y = pd.read_csv(
    'training_set_labels.csv',
    index_col=0,  # first CSV column becomes the row index
)

In [None]:
# Preview the first rows of the label table to see which columns it contains.
y.head()

In [None]:
# Remove the 'h1n1_vaccine' column from the label table.
# errors='ignore' keeps this cell idempotent: because the result is assigned
# back to `y`, re-running the cell would otherwise raise KeyError once the
# column has already been dropped.
y = y.drop(columns='h1n1_vaccine', errors='ignore')

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Test various models

#### Model performs reasonably considering no optimisations. There are 2 large variations in the cross-val scores, but the others are very stable

In [None]:
# Baseline: decision tree with default hyperparameters, scored with 10-fold CV.
# random_state is fixed because the tree permutes candidate features at each
# split, so an unseeded tree gives different scores on every run.
tree_model = DecisionTreeClassifier(random_state=42)
# Pass the labels as a 1-D array, matching the other model cells in this notebook.
cross_val_score(tree_model, X_train, y_train.values.ravel(), cv=10)

#### Model performs quite well but is unstable. Might improve once hyperparameters are tuned and features pruned

In [None]:
# Linear SVM with default hyperparameters, scored with 10-fold CV.
# random_state seeds the solver's data shuffling (used when the dual problem
# is solved), so repeated runs of this cell give the same scores.
svm_model = LinearSVC(random_state=42)
cross_val_score(svm_model, X_train, y_train.values.ravel(), cv=10)

#### All the tree-ensemble models perform well and are stable. AdaBoost slightly edges the others out

In [None]:
# Shallow random forest (trees capped at depth 4), scored with 10-fold CV.
# random_state fixes the bootstrap samples and per-split feature subsets;
# without it the scores change on every Restart & Run All.
forest_model = RandomForestClassifier(max_depth=4, random_state=42)
cross_val_score(forest_model, X_train, y_train.values.ravel(), cv=10)

In [None]:
# Gradient boosting with 100 stages and no shrinkage (learning_rate=1.0),
# scored with 10-fold CV.
# random_state seeds the individual tree estimators so results are repeatable.
gbm_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    random_state=42,
)
cross_val_score(gbm_model, X_train, y_train.values.ravel(), cv=10)

In [None]:
# AdaBoost with 100 boosting rounds, scored with 10-fold CV.
# random_state seeds the base estimators so results are repeatable across runs.
ada_model = AdaBoostClassifier(n_estimators=100, random_state=42)
cross_val_score(ada_model, X_train, y_train.values.ravel(), cv=10)

# Final tests

In [None]:
# Refit the linear SVM on the full training split, then report its score on
# the held-out test split (fit() returns the estimator, so the calls chain).
svm_model.fit(X_train, y_train.values.ravel()).score(X_test, y_test)

In [None]:
# Refit AdaBoost on the full training split, then report its score on the
# held-out test split (fit() returns the estimator, so the calls chain).
ada_model.fit(X_train, y_train.values.ravel()).score(X_test, y_test)

In [None]:
# Hard class-label predictions from the AdaBoost model on the test split.
y_pred = ada_model.predict(X_test)

In [None]:
# ROC AUC needs a continuous score per sample, not hard class labels: with
# predict() output the ROC curve collapses to a single operating point and the
# AUC is understated. Use the predicted probability of the positive class
# (column 1 of predict_proba) instead.
# The score is taken directly from ada_model rather than the shared `y_pred`
# variable, so this cell gives the same answer regardless of which model last
# wrote to `y_pred`.
roc_auc_score(y_test.values.ravel(), ada_model.predict_proba(X_test)[:, 1])

In [None]:
# Hard class-label predictions from the linear SVM on the test split.
# Note: this rebinds `y_pred`, replacing the AdaBoost predictions stored earlier.
y_pred = svm_model.predict(X_test)

In [None]:
# ROC AUC needs a continuous score per sample, not hard class labels.
# LinearSVC has no predict_proba, so rank the samples by their signed distance
# to the separating hyperplane (decision_function).
# The score is taken directly from svm_model rather than the shared `y_pred`
# variable, so this cell does not depend on which model last wrote to it.
roc_auc_score(y_test.values.ravel(), svm_model.decision_function(X_test))

# Will proceed with adaboosted tree and return to SVC in the future