## Récupération des données

In [66]:
import os
import urllib.request

if("training_labels.txt" not in os.listdir(".")):
    print("Training labels are not in the root directory. Downloading...")
    urllib.request.urlretrieve("https://dl.dropboxusercontent.com/s/l0f9z08rysp0kjy/training_labels.txt", "training_labels.txt")
    print("Done.")
if("training_templates.csv" not in os.listdir(".")):
    print("Training data is not in the root directory. Downloading... this may take a while...")
    urllib.request.urlretrieve("https://dl.dropboxusercontent.com/s/dqudxed82ljnxa8/training_templates.csv", "training_templates.csv")
    print("Done.")
if("testing_templates.csv" not in os.listdir(".")):
    print("Testing data are not in the root directory. Downloading... this may take a while...")
    urllib.request.urlretrieve("https://dl.dropboxusercontent.com/s/syrry7miykrmjz0/testing_templates.csv", "testing_templates.csv")
    print("Done.")

import pandas as pd
import numpy as np


data = pd.read_csv("training_templates.csv", header=None)
X = np.array(data)
y = np.loadtxt("training_labels.txt")

ds = X,y #saving dataset

In [67]:
print("(Samples, features) = ",X.shape, sep=" ")
print(len(y))
print(np.unique(y))

(Samples, features) =  (105600, 128)
105600
[-1.  1.]


# Prédiction naïve

In [93]:
X,y = ds

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB

# Shuffling data
permut = np.random.permutation(len(X))
X,y = X[permut], y[permut]

X = StandardScaler().fit_transform(X)
# # Diminishing number of features:
# pca = PCA(n_components=0.9, svd_solver='full')
# X = pca.fit_transform(X, y=y)
# print("New shape:",X.shape)
# X = StandardScaler().fit_transform(X)


In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

clf = GaussianNB()
print("Fitting classifier...")
clf.fit(X_train[:10000], y_train[:10000])

# Performance
def compute_pred_score(y_true, y_pred):
    y_pred_unq =  np.unique(y_pred)
    for i in y_pred_unq:
        if((i != -1) & (i!= 1) & (i!= 0) ):
            raise ValueError('The predictions can contain only -1, 1, or 0!')
    y_comp = y_true * y_pred
    score = float(10*np.sum(y_comp == -1) + np.sum(y_comp == 0))
    score /= y_comp.shape[0]
    return score

print("Predicting test samples...")
y_pred = clf.predict(X_test)
print("Computing score...")
s = compute_pred_score(y_test, y_pred)

print("Score on", len(y_pred), "samples:", s)

Fitting classifier...
Predicting test samples...
Computing score...
Score on 10560 samples: 1.1865530303030303


In [95]:
print("Predicting only confident results:")
T_pred = clf.predict_proba(X_test)


Predicting only confident results:


In [96]:
print(T_pred.shape)
print(T_pred[0:30,...])
print(y_test[0:30])
print("Selecting the confidence threshold")
for confidence in np.linspace(0,0.3,20):
    y_pred = []
    for (p1, p2) in T_pred:
        if p1 <= confidence:
            y_pred.append(1)
        elif p2 <= confidence:
            y_pred.append(-1)
        else:
            y_pred.append(0)

    print("Confidence threshold:", confidence, "score:", compute_pred_score(y_pred, y_test))

print(y_pred[0:30])

(10560, 2)
[[  9.71862612e-01   2.81373876e-02]
 [  9.68599671e-01   3.14003291e-02]
 [  3.73211579e-01   6.26788421e-01]
 [  9.89365300e-01   1.06347001e-02]
 [  9.90819431e-01   9.18056874e-03]
 [  8.22395806e-02   9.17760419e-01]
 [  6.99954017e-01   3.00045983e-01]
 [  7.40901703e-04   9.99259098e-01]
 [  9.70184964e-03   9.90298150e-01]
 [  7.06357847e-01   2.93642153e-01]
 [  5.08872312e-03   9.94911277e-01]
 [  9.71554906e-01   2.84450936e-02]
 [  9.75586809e-03   9.90244132e-01]
 [  2.43167847e-01   7.56832153e-01]
 [  9.97230996e-01   2.76900405e-03]
 [  9.97997937e-01   2.00206269e-03]
 [  9.76515978e-01   2.34840218e-02]
 [  3.37326283e-01   6.62673717e-01]
 [  1.30822176e-01   8.69177824e-01]
 [  5.06083242e-01   4.93916758e-01]
 [  2.02378622e-02   9.79762138e-01]
 [  8.98926291e-01   1.01073709e-01]
 [  6.90260361e-04   9.99309740e-01]
 [  4.02995491e-01   5.97004509e-01]
 [  1.09112655e-01   8.90887345e-01]
 [  2.28406957e-01   7.71593043e-01]
 [  7.84901591e-01   2.1509

In [97]:
for i in range(len(y_pred)):
     if(y_pred[i] != y_test[i]):
            print("Wrong prediction for i =",i)
            print("Predicted ",y_pred[i],", proba were ", T_pred[i])

Wrong prediction for i = 2
Predicted  0 , proba were  [ 0.37321158  0.62678842]
Wrong prediction for i = 6
Predicted  0 , proba were  [ 0.69995402  0.30004598]
Wrong prediction for i = 17
Predicted  0 , proba were  [ 0.33732628  0.66267372]
Wrong prediction for i = 19
Predicted  0 , proba were  [ 0.50608324  0.49391676]
Wrong prediction for i = 23
Predicted  0 , proba were  [ 0.40299549  0.59700451]
Wrong prediction for i = 30
Predicted  0 , proba were  [ 0.30762899  0.69237101]
Wrong prediction for i = 35
Predicted  0 , proba were  [ 0.46296622  0.53703378]
Wrong prediction for i = 42
Predicted  -1 , proba were  [ 0.89736246  0.10263754]
Wrong prediction for i = 43
Predicted  0 , proba were  [ 0.67592565  0.32407435]
Wrong prediction for i = 67
Predicted  1 , proba were  [ 0.00437399  0.99562601]
Wrong prediction for i = 74
Predicted  0 , proba were  [ 0.34888666  0.65111334]
Wrong prediction for i = 77
Predicted  0 , proba were  [ 0.40423459  0.59576541]
Wrong prediction for i = 78
P

## Prédiction :

In [105]:
testing = pd.read_csv("testing_templates.csv", header=None)
X_unknown = np.array(testing)
# print("Before dim reduction:",X_unknown.shape)
# X_unknown = pca.transform(X_unknown)
# print("After dim reduction:",X_unknown.shape)
X_unknown = StandardScaler().fit_transform(X_unknown)
# print("Fitting on the entire known dataset")
# clf.fit(X,y=y)

print("Predicting for the unknown dataset")
T_pred = clf.predict_proba(X_unknown)
print(T_pred.shape)
# print(T_pred[0:30,...])
# print(T_pred[-30:,...])
confidence = 0.2
y_pred = []
unsure = 0
print("Selecting the best results")
for proba in T_pred:
    p1, p2 = proba[0], proba[1]
    if p1 <= confidence:
        y_pred.append(1)
    elif p2 <= confidence:
        y_pred.append(-1)
    else:
#         print("Not sure for sample", len(y_pred))
        y_pred.append(0)
        unsure += 1
print(np.unique(y_pred))
print("Unsure predictions: ", unsure,"/",len(y_pred))

Predicting for the unknown dataset
(8496, 2)
Selecting the best results
[-1  0  1]
Unsure predictions:  3139 / 8496


In [106]:
np.savetxt('y_pred.txt', y_pred, fmt='%d')

In [86]:
#TODO: Réduire dimensions


In [87]:
#TODO: Essayer avec Gaussian Naive Bayes

In [88]:
print(len(y_pred))

8496
