# Multiclass Classification

In [124]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load the training and test tables from the local `data/` directory.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [98]:
# Encode the species column as integer class ids.
from sklearn.preprocessing import LabelEncoder

# Keep the fitted encoder in a variable: once the column is overwritten with
# integer ids, `species_encoder.classes_` / `species_encoder.inverse_transform`
# are the only way to map an id back to its species name.
# NOTE: re-running this cell after the column has been encoded refits the
# encoder on the ids; reload `train` first if the names are needed again.
species_encoder = LabelEncoder()
train["species"] = species_encoder.fit_transform(train["species"])
train.head()

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,1,3,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391
1,2,49,0.005859,0.0,0.03125,0.015625,0.025391,0.001953,0.019531,0.0,...,0.000977,0.0,0.0,0.000977,0.023438,0.0,0.0,0.000977,0.039062,0.022461
2,3,65,0.005859,0.009766,0.019531,0.007812,0.003906,0.005859,0.068359,0.0,...,0.1543,0.0,0.005859,0.000977,0.007812,0.0,0.0,0.0,0.020508,0.00293
3,5,94,0.0,0.003906,0.023438,0.005859,0.021484,0.019531,0.023438,0.0,...,0.0,0.000977,0.0,0.0,0.020508,0.0,0.0,0.017578,0.0,0.047852
4,6,84,0.005859,0.003906,0.048828,0.009766,0.013672,0.015625,0.005859,0.0,...,0.09668,0.0,0.021484,0.0,0.0,0.0,0.0,0.0,0.0,0.03125


In [99]:
# Separate the target (the species column) from the feature matrix.
# The `id` and `species` columns are excluded from the features.

train_label = train["species"]
train_feature = train.drop(columns=["id", "species"])

In [100]:
# Standardisation of the training features
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only; the fitted `std_scale` object
# is kept so the same transformation can be applied to other data later.
std_scale = StandardScaler()
std_scale.fit(train_feature)

# The scaler returns a plain array, so the resulting frame has default
# integer row and column labels.
train_scaled = pd.DataFrame(std_scale.transform(train_feature))

In [101]:
# With a large number of classes (99) a plain random split is not enough:
# the split is stratified so that each class keeps (approximately) the same
# proportion of samples in the training part and in the held-out part.
from sklearn.model_selection import StratifiedShuffleSplit

# Bind the splitter to its own name so the imported class is not shadowed
# by one of its instances.
stratified_splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

# Every iteration overwrites the previous assignment, so only the last of the
# 10 generated splits is kept in x_train / x_test / y_train / y_test.
for train_index, test_index in stratified_splitter.split(train_scaled, train_label):
    x_train, x_test = train_scaled.iloc[train_index], train_scaled.iloc[test_index]
    y_train, y_test = train_label.iloc[train_index], train_label.iloc[test_index]
    


In [102]:
# Cross-validated hyper-parameter search for an SVM and a k-NN classifier,
# followed by an accuracy / log-loss report on the training and held-out sets.

from sklearn import neighbors, metrics
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, log_loss

# An SVM (the kernel is part of the search, not only the linear one) and a k-NN classifier.
classifiers = [SVC(), KNeighborsClassifier()]
# Hyper-parameter grids, in the same order as `classifiers`.
# `probability=True` is required so that the SVC exposes `predict_proba`.
params = [
    {'kernel': ('linear', 'poly', 'sigmoid', 'rbf'), 'C': [0.01, 0.05, 0.025, 0.07, 0.09, 1.0], 'probability': [True]},
    {'n_neighbors': [3, 5, 7, 9]},
]
# 10 random 80/20 shuffle splits of the training data used to score each candidate.
cv = ShuffleSplit(n_splits=10, test_size=0.20, random_state=42)


def report_split(estimator, features, labels, title):
    """Print accuracy and log loss of a fitted classifier on one data split.

    Parameters
    ----------
    estimator : fitted classifier exposing `predict`, `predict_proba` and `classes_`.
    features : feature matrix of the split to evaluate.
    labels : true class labels of the split.
    title : header line printed before the metrics.
    """
    print(title)
    predictions = estimator.predict(features)
    accuracy = accuracy_score(labels, predictions)
    print("Accuracy: {:.4%}".format(accuracy))
    probabilities = estimator.predict_proba(features)
    # Pass the estimator's class list explicitly: the probability columns follow
    # `estimator.classes_`, and without `labels=` log_loss infers the classes from
    # the true labels, which breaks as soon as a split misses one class.
    ll = log_loss(labels, probabilities, labels=estimator.classes_)
    print("Log Loss: {}".format(ll))


# Find the best estimator of each classifier family over its own grid.
best_estimators = []
for classifier, param in zip(classifiers, params):
    grid = GridSearchCV(classifier, param, cv=cv)
    grid = grid.fit(x_train, y_train)  # optimise this classifier on the training set
    # With the default refit=True, best_estimator_ is already refit on the
    # whole of (x_train, y_train), so no additional fit is needed afterwards.
    best_estimators.append(grid.best_estimator_)

for estimator in best_estimators:
    name = estimator.__class__.__name__

    print("="*30)
    print(name)

    print('****Results****')
    report_split(estimator, x_train, y_train, '**Training set**')
    report_split(estimator, x_test, y_test, '**Test set**')

print("="*30)


SVC
****Results****
**Training set**
Accuracy: 100.0000%
Log Loss: 2.1804077853761648
**Test set**
Accuracy: 99.4949%
Log Loss: 2.293574416509179
KNeighborsClassifier
****Results****
**Training set**
Accuracy: 98.2323%
Log Loss: 0.041433802638993475
**Test set**
Accuracy: 98.4848%
Log Loss: 0.0412192836531407


In [125]:
# Work on a copy of the test table and keep its `id` column as a separate Series.
test_copy = test.copy()
test_index = test_copy["id"]

In [130]:
# Compute the scaled test features in this cell so that displaying them does
# not depend on a cell located further down the notebook (a fresh
# top-to-bottom run would otherwise stop here with a NameError).
# The scaler fitted on the training features is reused; `id` is not a feature.
test_features_scale = std_scale.transform(test.drop('id', axis=1))
test_features_scale

array([[ 0.10738699, -0.48340807,  1.7859236 , ..., -0.52001246,
        -0.36877633,  1.50689382],
       [-0.48661074, -0.58401248,  1.25669405, ..., -0.49497401,
         2.03998637,  1.12066917],
       [-0.88257543, -0.73488047, -1.16261896, ..., -0.11957666,
        -0.65214311, -0.85338743],
       ...,
       [ 0.00839582,  0.01951098, -0.6333894 , ..., -0.52001246,
         2.46510908, -0.5529856 ],
       [-0.18958653, -0.48340807,  1.10549666, ..., -0.52001246,
         0.19802977, -0.03800476],
       [-0.88257543,  2.28273681, -1.23821765, ..., -0.11957666,
        -0.65214311, -0.08093814]])

In [134]:
# Scale the test features with the scaler fitted on the training data.
test_features = test.drop('id', axis=1)
test_features_scale = std_scale.transform(test_features)

pred = best_estimators[1].predict_proba(test_features_scale)  # second entry: the KNeighbors classifier model

# `train['species']` is overwritten with integer ids earlier in the notebook, so
# fitting a new encoder on it would label the columns 0..n-1 instead of with the
# species names. Recover the names from the raw file instead; LabelEncoder sorts
# them, which is the same order as the encoded class ids.
species_raw = pd.read_csv("data/train.csv", usecols=["species"])["species"]
classes = LabelEncoder().fit(species_raw).classes_
assert pred.shape[1] == len(classes), "one probability column is expected per species"

# Index rows by the test ids taken straight from `test`, because the name
# `test_index` is also used as a loop variable by the train/validation split.
df_pred = pd.DataFrame(pred, index=test["id"], columns=classes)  # Insight dataframe