In [1]:
import pandas as pd
import numpy as np
import data_sampling as ds
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print(np.shape(train))

(595212, 59)


In [2]:
y = train['target']
train = train.drop(columns = ['target', 'id'])

In [3]:
all_models = [GaussianNB(), 
              LogisticRegression(random_state=2019, solver='lbfgs', multi_class='multinomial', 
                                 n_jobs=-1, max_iter=256, verbose=2), 
              DecisionTreeClassifier()]

In [4]:
kf = KFold(n_splits=5, shuffle=True, random_state=2019)    

In [5]:
#For each model:
for model in all_models:
    print('\n\n======= Model: ', model, '=======')
    # for each fold:
    accuracies = list()
    for i, (train_index, test_index) in enumerate(kf.split(train)):
        #Builds the train and validation dataset, according to the current fold:
        y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
        X_train, X_valid = train.iloc[train_index,:].copy(), train.iloc[test_index,:].copy()
        print("Fold", i)
        model.fit(X_train, y_train)
        pred = model.predict(X_valid)
        acc = np.equal(pred, y_valid).astype(float)
        acc = np.mean(acc)
        print('Accuracy:', acc)
        accuracies.append(acc)
    print('Final results: \nMean accuracy:', np.mean(accuracies))



Fold 0
Accuracy: 0.902942634174206
Fold 1
Accuracy: 0.9074032072444411
Fold 2
Accuracy: 0.9027654105273769
Fold 3
Accuracy: 0.9075116345491507
Fold 4
Accuracy: 0.9037650577107239
Final results: mean accuracy: 0.9048775888411799


          intercept_scaling=1, max_iter=256, multi_class='multinomial',
          n_jobs=-1, penalty='l2', random_state=2019, solver='lbfgs',
Fold 0


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  3.8min finished


Accuracy: 0.9635257848004503
Fold 1


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  3.4min finished


Accuracy: 0.9630973681778853
Fold 2


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  3.4min finished


Accuracy: 0.9632818669041179
Fold 3


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  3.3min finished


Accuracy: 0.9642731136909662
Fold 4


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  3.6min finished


Accuracy: 0.9635842811780716
Final results: mean accuracy: 0.9635524829502984


            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
Fold 0
Accuracy: 0.917542400645145
Fold 1
Accuracy: 0.9181640247641608
Fold 2
Accuracy: 0.918751365064431
Fold 3
Accuracy: 0.9183649468254902
Fold 4
Accuracy: 0.9194065959913308
Final results: mean accuracy: 0.9184458666581115
