# Model selection x challenge

Model selection using the term project challenge setup. So basically models are trained using entire training data and after that, evaluated using test data. Using three different scorings, which are categories in term project challenge. Model scoring methods are: binary accuracy, multi-class accuracy and perplexity.

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.manifold import Isomap
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA, KernelPCA

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE

## Functions

In [2]:
def lablesToBinary(multi_labels):
    """Transform multi-class labels of NPF data to binary labels

    Keyword arguments:
    multi_labels -- Labels to transform as pandas.Series
    
    Return:
    Equal size pandas.Series containing binary labels
    """
    return multi_labels.apply(lambda x: "nonevent" if x == "nonevent" else "event")

In [3]:
def accuracy_score(y, y_pred):
    """Compute accuracy score of the model

    Keyword arguments:
    y -- Real labels
    y_pred -- Predicted labels
    
    Return:
    Accuracy score as float number
    """
    return np.mean(y == y_pred)

In [4]:
def perplexity_score(true_labels, pred_p):
    """Compute perplexity score of the model

    Keyword arguments:
    true_labels -- Real labels
    pred_p -- Predicted probabilities of the model
    
    Return:
    Perplexity score as float number
    """
    cond_result = []

    for i in range(0, true_labels.size):
        if true_labels[i] == "nonevent":
            cond_result.append(1 - pred_p[i])
        else:
            cond_result.append(pred_p[i])

    return np.exp( -np.mean( np.log( np.array(cond_result) ) ) )

In [5]:
def predict_npf(classifier, X_tr, y, X_te):
    """Train ML model and predict test data

    Keyword arguments:
    classifier -- Intance of scikit-learn Pipeline
    X_tr -- Training data
    y -- Labels of the training data
    X_te -- Test data
    
    Return:
    Pandas.DataFrame containing predicted classes and their probabilities
    """
    classifier.fit(X_tr, y)

    classes = pd.Series(classifier.predict(X_te))
    p = pd.DataFrame(classifier.predict_proba(X_te))
    
    results = []

    for i in range(0, classes.size):
        label = classes[i]
        p_value = 1 - p.loc[i, label]
        results.append([label, p_value])
    
    answers = pd.DataFrame(results, columns=['class4', 'p'])
    answers['class4'] = mapping_class4[answers['class4']]
    
    return answers

## Data preperation

In [6]:
df = pd.read_csv('../data/npf_train.csv', index_col='id')
df.drop(['date', 'partlybad'], axis=1, inplace=True)

class2 = df['class4'].copy()
class2[class2 != 'nonevent'] = 'event'
df['class2'] = class2

df['class4'], mapping_class4 = df['class4'].astype('category').factorize()
df['class2'], mapping_class2 = df['class2'].astype('category').factorize()

In [7]:
df_test = pd.read_csv('../data/npf_test.csv', index_col='id')
test_data = df_test.drop(['date', 'partlybad', 'class4'], axis=1)

In [8]:
train_data = df.drop(['class4', 'class2'], axis=1)

scaled = StandardScaler().fit_transform(pd.concat([train_data, test_data]))

X_train = pd.DataFrame(scaled[0:430], columns=train_data.columns)
X_test = pd.DataFrame(scaled[430:], columns=train_data.columns)

y_class2 = df['class2']
y_class4 = df['class4']

y_test, mapping_test = df_test['class4'].astype('category').factorize()
y_test = mapping_test[y_test]

y_test_binary = lablesToBinary(pd.Series(y_test))

In [9]:
classifiers = [
    ('logistic', LogisticRegression()),
    ('kNeighbour', KNeighborsClassifier(3)),
    ('svcLinear', SVC(kernel="linear", C=0.025, probability=True)),
    ('svc', SVC(gamma=2, C=1, probability=True)),
    ('gaussian', GaussianProcessClassifier(1.0 * RBF(1.0))),
    ('decissionTree', DecisionTreeClassifier(max_depth=5)),
    ('rfc', RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ('mlp', MLPClassifier(alpha=1, max_iter=1000)),
    ('ada', AdaBoostClassifier()),
    ('gaussianNB', GaussianNB()),
    ('qda', QuadraticDiscriminantAnalysis())]

In [10]:
p = 0.7
variance = p * (1 - p)

In [11]:
dimension_reductions = [
    ('iso', Isomap(n_components=70)),
    ('lle', LocallyLinearEmbedding(n_components=50)), 
    ('llemodified', LocallyLinearEmbedding(n_components=50, method='modified', n_neighbors=90)),
    ('svd', TruncatedSVD(n_components=60)),
    ('lda', LinearDiscriminantAnalysis(n_components=2)),
    ('pca', PCA()),
    ('kpca', KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=1)),
    ('sel', VarianceThreshold(threshold=variance)),
    ('kbest', SelectKBest(f_classif, k=20)), 
    ('kbestmutual', SelectKBest(mutual_info_classif, k=40)),
    ('select', SelectFromModel(LinearSVC(penalty="l2"))),
    ('selecttree', SelectFromModel(ExtraTreesClassifier(n_estimators=70))),
    ('rfe', RFE(estimator=DecisionTreeClassifier(), n_features_to_select=50))]

## Experiment

In [12]:
statistics = pd.DataFrame(index = ['accuracy_binary', 'accuracy_multi', 'peplexity'])

In [13]:
import warnings
warnings.filterwarnings('ignore')

for model_used in classifiers:
    pred = predict_npf(Pipeline([model_used]), X_train, y_class4, X_test)

    acc_binary = accuracy_score(y_test_binary, lablesToBinary(pred['class4']))
    acc_multi = accuracy_score(y_test, pred['class4'])
    perp = perplexity_score(y_test, pred['p'])

    statistics[ model_used[0] ] = [acc_binary, acc_multi, perp]

    for feature_selection in dimension_reductions:
        pipe = Pipeline([feature_selection, model_used])
        pred = predict_npf(pipe, X_train, y_class4, X_test)

        acc_binary = accuracy_score(y_test_binary, lablesToBinary(pred['class4']))
        acc_multi = accuracy_score(y_test, pred['class4'])
        perp = perplexity_score(y_test, pred['p'])

        column = model_used[0] + '_' + feature_selection[0]
        statistics[ column ] = [acc_binary, acc_multi, perp]

## Evalution

In [14]:
result = statistics.transpose(copy=True)
result

Unnamed: 0,accuracy_binary,accuracy_multi,peplexity
logistic,0.875648,0.663212,1.938318
logistic_iso,0.803109,0.583420,inf
logistic_lle,0.533679,0.530570,1.757439
logistic_llemodified,0.555440,0.541969,1.751146
logistic_svd,0.872539,0.662176,1.934030
...,...,...,...
qda_kbest,0.794819,0.585492,inf
qda_kbestmutual,0.825907,0.640415,inf
qda_select,0.797927,0.617617,inf
qda_selecttree,0.840415,0.652850,inf


In [15]:
result.describe()

Unnamed: 0,accuracy_binary,accuracy_multi,peplexity
count,154.0,154.0,154.0
mean,0.76254,0.604953,inf
std,0.123666,0.072376,
min,0.474611,0.204145,1.418967
25%,0.751554,0.548187,1.741961
50%,0.819689,0.629534,1.995068
75%,0.843264,0.653886,inf
max,0.88601,0.696373,inf


In [16]:
result.sort_values(['accuracy_binary'], ascending = False)[0:10]

Unnamed: 0,accuracy_binary,accuracy_multi,peplexity
mlp_sel,0.88601,0.688083,2.051158
mlp_svd,0.88601,0.694301,1.975621
mlp,0.882902,0.694301,2.027226
mlp_pca,0.881865,0.696373,2.039675
gaussian_pca,0.878756,0.694301,1.630433
gaussian_svd,0.878756,0.694301,1.6304
gaussian,0.878756,0.694301,1.630433
gaussian_sel,0.878756,0.694301,1.630433
mlp_selecttree,0.876684,0.684974,1.740673
logistic,0.875648,0.663212,1.938318


In [17]:
result.sort_values(['accuracy_multi'], ascending = False)[0:10]

Unnamed: 0,accuracy_binary,accuracy_multi,peplexity
mlp_pca,0.881865,0.696373,2.039675
gaussian,0.878756,0.694301,1.630433
gaussian_sel,0.878756,0.694301,1.630433
gaussian_pca,0.878756,0.694301,1.630433
mlp,0.882902,0.694301,2.027226
mlp_svd,0.88601,0.694301,1.975621
gaussian_svd,0.878756,0.694301,1.6304
mlp_rfe,0.86943,0.689119,1.896612
mlp_sel,0.88601,0.688083,2.051158
mlp_selecttree,0.876684,0.684974,1.740673


In [22]:
result.sort_values(['peplexity'])[0:10]

Unnamed: 0,accuracy_binary,accuracy_multi,peplexity
svcLinear_llemodified,0.527461,0.527461,1.418967
svcLinear_lle,0.527461,0.527461,1.484711
svcLinear_svd,0.858031,0.675648,1.596527
svcLinear,0.858031,0.674611,1.60466
svcLinear_selecttree,0.855959,0.667358,1.604852
svcLinear_sel,0.858031,0.674611,1.606007
svc_llemodified,0.826943,0.664249,1.619814
svcLinear_kbest,0.809326,0.633161,1.623401
svcLinear_rfe,0.845596,0.672539,1.623538
svcLinear_kbestmutual,0.835233,0.660104,1.628935


## Save results

In [23]:
result.to_csv('../data/models_agaisnt_test_data.csv', index_label="model_name")