## Chargements des données

In [1]:
import data.data as data
import utils.model_utils as model_utils
import utils.kaggle_submit as kaggle_submit
from sklearn.model_selection import train_test_split

In [2]:
files = {"train": "./res/train.csv", "test": "./res/test.csv"}
files["train"] = data.open_otto_csv(files["train"])
files["test"] = data.open_otto_csv(files["test"])

In [3]:
train_x, train_y = data.get_x_y(files["train"])
nb_class = len(train_y.unique())
print(nb_class)
testx = files["test"].drop(columns=["id"])

9


## Séparation ensemble apprentissage et validation

In [4]:
data_train, data_valid, labels_train, labels_valid = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

## Comparatif sur quelques classifiers différents

### Classifier : K-NN

In [7]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(data_train, labels_train)

y_pred3 = knn.predict(data_valid)
y_pred3_probas = knn.predict_proba(data_valid)

model_utils.evaluate_model(labels_valid, y_pred3, y_pred3_probas)

f1_score precision : (version sklearn) 0.7739562191425366
La log-loss est de :  2.3086776076579794
evaluation (version Sam) : 
9662 / 12376
Precision : 0.780705


In [8]:
kaggle_submit.make_csv_soumission(knn, testx, "res_knn.csv", nb_class)

### Classifier : Naive Bayes

In [5]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(data_train, labels_train)

y_pred4 = nb.predict(data_valid)
y_pred4_probas = nb.predict_proba(data_valid)

model_utils.evaluate_model(labels_valid, y_pred4, y_pred4_probas)

f1_score precision : (version sklearn) 0.6286302230265768
La log-loss est de :  7.251880092416661
evaluation (version Sam) : 
7684 / 12376
Precision : 0.620879


In [6]:
kaggle_submit.make_csv_soumission(nb, testx, "res_GaussNaiveBayes.csv", nb_class)

### Classifier : Perceptron

In [35]:
from sklearn.linear_model import Perceptron

perceptron_clf = Perceptron(max_iter = 1000)
perceptron_clf.fit(data_train, labels_train)

y_pred5 = perceptron_clf.predict(data_valid)

model_utils.evaluate_model_Acc(labels_valid, y_pred5)

f1_score precision : (version sklearn) 0.690788007968747
evaluation (version Sam) : 
8718 / 12376
Precision : 0.704428


### Classifier : Regression Logistique

In [21]:
from sklearn.linear_model import LogisticRegression

logistic_clf = LogisticRegression(max_iter = 1000)
logistic_clf.fit(data_train, labels_train)

y_pred6 = logistic_clf.predict(data_valid)
y_pred6_probas = logistic_clf.predict_proba(data_valid)

model_utils.evaluate_model(labels_valid, y_pred6, y_pred6_probas)

f1_score precision : (version sklearn) 0.7343083104433648
La log-loss est de :  0.6699890545002816
evaluation (version Sam) : 
9367 / 12376
Precision : 0.756868


### Classifier : SGDClassifier = modèle linéaire + descente de gradient stochastique

In [16]:
from sklearn.linear_model import SGDClassifier

SGD_clf = SGDClassifier(max_iter=1000)
SGD_clf.fit(data_train, labels_train)

y_predSGD = SGD_clf.predict(data_valid)

model_utils.evaluate_model_Acc(labels_valid, y_predSGD)

f1_score precision : (version sklearn) 0.7216904073127184
evaluation (version Sam) : 
9295 / 12376
Precision : 0.751050


### Classifier : MLP Classifier : Multi-Layers-Perceptron

In [34]:
from sklearn.neural_network import MLPClassifier

mlp_clf = MLPClassifier(solver = 'sgd', hidden_layer_sizes = (30, 15), learning_rate = 'adaptive')
mlp_clf.fit(data_train, labels_train)

y_pred8 = mlp_clf.predict(data_valid)
y_pred8_probas = mlp_clf.predict_proba(data_valid)

model_utils.evaluate_model(labels_valid, y_pred8, y_pred8_probas)

f1_score precision : (version sklearn) 0.7776234266031896
La log-loss est de :  0.5616724859868261
evaluation (version Sam) : 
9718 / 12376
Precision : 0.785229




### Classifier : Random Forest

In [33]:
from sklearn.ensemble import RandomForestClassifier
from numpy.core.umath_tests import inner1d

rnd_f = RandomForestClassifier(n_estimators = 250)
rnd_f.fit(data_train, labels_train)

y_pred9 = rnd_f.predict(data_valid)
y_pred9_probas = rnd_f.predict_proba(data_valid)

model_utils.evaluate_model(labels_valid, y_pred9, y_pred9_probas)

f1_score precision : (version sklearn) 0.8015505959705683
La log-loss est de :  0.5783165926416591
evaluation (version Sam) : 
10039 / 12376
Precision : 0.811167


In [35]:
kaggle_submit.make_csv_soumission(rnd_f, testx, "res_rndF.csv", nb_class) # 0.56

### Classifier : XGBoost

In [38]:
from xgboost.sklearn import XGBClassifier

xgb = XGBClassifier(gamma = 0.03, learning_rate = 0.08, max_depth = 7, n_estimators = 250)
print("train du xgb...")
xgb.fit(train_x, train_y)

print("predict du xgb...")
# y_pred10 = xgb.predict(data_valid)
y_pred10_probas = xgb.predict_proba(testx)

# model_utils.evaluate_model(labels_valid, y_pred10, y_pred10_probas)

train du xgb...
predict du xgb...


In [40]:
kaggle_submit.make_csv_soumission2(y_pred10_probas, 'xgb.csv', nb_class) # 0.48

In [17]:
mlp_Optim_clf = MLPClassifier(solver = 'sgd', hidden_layer_sizes = (50,), learning_rate = 'constant')
mlp_clf.fit(data_train, labels_train)

y_predOptimMLP = mlp_clf.predict(data_valid)
y_predOptimMLP_proba = mlp_clf.predict_proba(data_valid)

model_utils.evaluate_model(labels_valid, y_predOptimMLP, y_predOptimMLP_proba)

f1_score precision : (version sklearn) 0.7793973092566393
evaluation (version Sam) : 
9706 / 12376
Precision : 0.784260




## Mélange : MLPClassifier & RandomForest

In [27]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from numpy.core.umath_tests import inner1d

mlp_clf = MLPClassifier(solver = 'sgd', hidden_layer_sizes = (30, 15), learning_rate = 'adaptive')
mlp_clf.fit(train_x, train_y)

rnd_f = RandomForestClassifier(n_estimators = 250)
rnd_f.fit(train_x, train_y)

y_predRndOptim = rnd_f.predict_proba(testx)
y_predMlpOptim = mlp_clf.predict_proba(testx)

y = 0.5*y_predRndOptim + 0.5*y_predMlpOptim

print(y.shape)



(144368, 9)


In [40]:
kaggle_submit.make_csv_soumission2(y, 'mlpRnd.csv', nb_class) # Kaggle : 0.51

## Mélange : MLP & Random Forest + Calibrage

In [20]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from numpy.core.umath_tests import inner1d

mlp_clf = MLPClassifier(solver = 'sgd', hidden_layer_sizes = (30, 15), learning_rate = 'adaptive')
rnd_f = RandomForestClassifier(n_estimators = 250)

calibrated_mlp_clf = CalibratedClassifierCV(mlp_clf, method='isotonic', cv = 2)
calibrated_rnd_f = CalibratedClassifierCV(rnd_f, method = 'isotonic', cv = 2)

calibrated_mlp_clf.fit(train_x, train_y)
calibrated_rnd_f.fit(train_x, train_y)

y_predMlpCalibrated = calibrated_mlp_clf.predict_proba(testx)
y_predRndFCalibrated = calibrated_rnd_f.predict_proba(testx)

y = 0.5*y_predMlpCalibrated + 0.5*y_predRndFCalibrated

# evaluate_model_prob(labels_valid, y)



In [24]:
y2 = 0.2*y_predMlpCalibrated + 0.8*y_predRndFCalibrated
print(y2.shape)

(144368, 9)


In [26]:
kaggle_submit.make_csv_soumission2(y, 'mlpRndCalibrated.csv', nb_class) # 0.49

In [None]:
kaggle_submit.make_csv_soumission2(y2, 'mlpRndCalibrated.csv', nb_class) # 0.48

## Mélange : Random Forest + MLP + XGBoost

In [36]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from numpy.core.umath_tests import inner1d

mlp_clf = MLPClassifier(solver = 'sgd', hidden_layer_sizes = (30, 15), learning_rate = 'adaptive')
rnd_f = RandomForestClassifier(n_estimators = 250)
xgb = XGBClassifier(gamma = 0.03, learning_rate = 0.08, max_depth = 7, n_estimators = 250)

calibrated_mlp_clf = CalibratedClassifierCV(mlp_clf, method='isotonic', cv = 2)
calibrated_rnd_f = CalibratedClassifierCV(rnd_f, method = 'isotonic', cv = 2)
calibrated_xgb = CalibratedClassifierCV(xgb, method = 'isotonic', cv = 2)

print("train du MLP...")
calibrated_mlp_clf.fit(train_x, train_y)
print("train du Random Forest...")
calibrated_rnd_f.fit(train_x, train_y)
print("train du XGBoost...")
calibrated_xgb.fit(train_x, train_y)

print("prédiction MLP...")
y_predMlpCalibrated = calibrated_mlp_clf.predict_proba(testx)
print("prédiction Random Forest...")
y_predRndFCalibrated = calibrated_rnd_f.predict_proba(testx)
print("prédiction XGBoost...")
y_predXGBCalibrated = calibrated_xgb.predict_proba(testx)

y = 0.2*y_predMlpCalibrated + 0.4*y_predRndFCalibrated + 0.4*y_predXGBCalibrated

train du MLP...




train du Random Forest...
train du XGBoost...
prédiction MLP...
prédiction Random Forest...
prédiction XGBoost...


In [37]:
kaggle_submit.make_csv_soumission2(y, 'RndF+MLP+XGBoost.csv', nb_class) # 0.47