# Code for predicting

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.decomposition import PCA

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## Functions

In [2]:
def k_fold_cross_validation(ml_pipeline, X, y, n=5, k=10, score='accuracy'):
    """Perform N repeated K-fold cross-validation

    Keyword arguments:
    ml_pipeline -- Intance of scikit-learn's Pipeline
    X -- Data to perform cross-validation
    y -- Labels of the data
    n -- Amount of times cross-validation is repeated (default is 5)
    k -- Amount of folds that the data is splitted to perform 
         cross-validation (default is 10)
    score -- Scoring type as a string for scikit-learn's 
             cross_val_score method (default is accuracy)
    
    Return:
    Two element numpy array where first value is mean of cross-validation scores
    and second is standard deviation of cross-validation scores.
    """
    cv = RepeatedStratifiedKFold(n_splits = n, 
                                 n_repeats = k, 
                                 random_state = 1)
    n_scores = cross_val_score(ml_pipeline, X, y, 
                               scoring = score, cv = cv, 
                               n_jobs = -1)
    
    return(np.array([np.mean(n_scores), np.std(n_scores)]))

In [3]:
def lablesToBinary(multi_labels):
    """Transform multi-class labels of NPF data to binary labels

    Keyword arguments:
    multi_labels -- Labels to transform as pandas.Series
    
    Return:
    Equal size pandas.Series containing binary labels
    """
    return multi_labels.apply(lambda x: "nonevent" if x == "nonevent" else "event")

In [4]:
def accuracy_score(y, y_pred):
    """Compute accuracy score of the model

    Keyword arguments:
    y -- Real labels
    y_pred -- Predicted labels
    
    Return:
    Accuracy score as float number
    """
    return np.mean(y == y_pred)

In [5]:
def perplexity_score(true_labels, pred_p):
    """Compute perplexity score of the model

    Keyword arguments:
    true_labels -- Real labels
    pred_p -- Predicted probabilities of the model
    
    Return:
    Perplexity score as float number
    """
    cond_result = []

    for i in range(0, true_labels.size):
        if true_labels[i] == "nonevent":
            cond_result.append(1 - pred_p[i])
        else:
            cond_result.append(pred_p[i])

    return np.exp( -np.mean( np.log( np.array(cond_result) ) ) )

In [6]:
def predict_npf(classifier, X_tr, y, X_te):
    """Train ML model and predict test data

    Keyword arguments:
    classifier -- Intance of scikit-learn Pipeline
    X_tr -- Training data
    y -- Labels of the training data
    X_te -- Test data
    
    Return:
    Pandas.DataFrame containing predicted classes and their probabilities
    """
    classifier.fit(X_tr, y)

    classes = pd.Series(classifier.predict(X_te))
    p = pd.DataFrame(classifier.predict_proba(X_te))

    results = []

    for i in range(0, classes.size):
        label = classes[i]
        p_value = p.loc[i, label]
        results.append([label, p_value])

    answers = pd.DataFrame(results, columns=['class4', 'p'])
    answers['class4'] = mapping_class4[answers['class4']]

    return answers

## Data preperation

In [7]:
df = pd.read_csv('../data/npf_train.csv', index_col='id')
df.drop(['date', 'partlybad'], axis=1, inplace=True)

class2 = df['class4'].copy()
class2[class2 != 'nonevent'] = 'event'
df['class2'] = class2

df['class4'], mapping_class4 = df['class4'].astype('category').factorize()
df['class2'], mapping_class2 = df['class2'].astype('category').factorize()

In [8]:
df_test = pd.read_csv('../data/npf_test.csv', index_col='id')
test_data = df_test.drop(['date', 'partlybad', 'class4'], axis=1)

In [9]:
train_data = df.drop(['class4', 'class2'], axis=1)

scaled = StandardScaler().fit_transform(pd.concat([train_data, test_data]))

X_train = pd.DataFrame(scaled[0:430], columns=train_data.columns)
X_test = pd.DataFrame(scaled[430:], columns=train_data.columns)

y_class2 = df['class2']
y_class4 = df['class4']

y_test, mapping_test = df_test['class4'].astype('category').factorize()
y_test = mapping_test[y_test]

y_test_binary = lablesToBinary(pd.Series(y_test))

## Process

In [10]:
kbest = ('kbestmutual', SelectKBest(mutual_info_classif, k=90))
selecttree = ('selecttree', SelectFromModel(ExtraTreesClassifier(n_estimators=70)))

p = 0.7
var = p * (1 - p)

pca = ('pca', PCA())
sel = ('sel', VarianceThreshold(threshold=var))

gaussian = ('model', GaussianProcessClassifier(1.0 * RBF(1.0)))
logistic = ('model', LogisticRegression())

In [11]:
binary = Pipeline([kbest, gaussian])
multi = Pipeline([selecttree, logistic])

main = Pipeline([gaussian])

main_alter1 = Pipeline([pca, gaussian])
main_alter2 = Pipeline([sel, gaussian])

In [12]:
pred_df = predict_npf(main, X_train, y_class4, X_test)

acc_binary = accuracy_score(y_test_binary, lablesToBinary(pred_df['class4']))
acc_multi = accuracy_score(y_test, pred_df['class4'])
perp = perplexity_score(y_test, pred_df['p'])

In [13]:
print(acc_binary)
print(acc_multi)
print(perp)

0.8787564766839379
0.694300518134715
3.526965890904833


In [14]:
pred_df.to_csv('../results/answers.csv', index=False)