## Récupération des données

In [54]:
import os
import urllib.request

if("training_labels.txt" not in os.listdir(".")):
    print("Training labels are not in the root directory. Downloading...")
    urllib.request.urlretrieve("https://dl.dropboxusercontent.com/s/l0f9z08rysp0kjy/training_labels.txt", "training_labels.txt")
    print("Done.")
if("training_templates.csv" not in os.listdir(".")):
    print("Training data is not in the root directory. Downloading... this may take a while...")
    urllib.request.urlretrieve("https://dl.dropboxusercontent.com/s/dqudxed82ljnxa8/training_templates.csv", "training_templates.csv")
    print("Done.")
if("testing_templates.csv" not in os.listdir(".")):
    print("Testing data are not in the root directory. Downloading... this may take a while...")
    urllib.request.urlretrieve("https://dl.dropboxusercontent.com/s/syrry7miykrmjz0/testing_templates.csv", "testing_templates.csv")
    print("Done.")

import pandas as pd
import numpy as np


data = pd.read_csv("training_templates.csv", header=None)
X = np.array(data)
y = np.loadtxt("training_labels.txt")

ds = X,y #saving dataset

In [55]:
print("(Samples, features) = ",X.shape, sep=" ")
print(len(y))
print(np.unique(y))

(Samples, features) =  (105600, 128)
105600
[-1.  1.]


# Prédiction naïve

In [74]:
X,y = ds

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier

# Shuffling data
permut = np.random.permutation(len(X))
X,y = X[permut], y[permut]

X = StandardScaler().fit_transform(X)
# # Diminishing number of features:
# pca = PCA(n_components=0.5, svd_solver='full')
# X = pca.fit_transform(X, y=y)
# print("New shape:",X.shape)
# X = StandardScaler().fit_transform(X)


In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

clf = KNeighborsClassifier()
print("Fitting classifier...")
clf.fit(X_train[:], y_train[:])

# Performance
def compute_pred_score(y_true, y_pred):
    y_pred_unq =  np.unique(y_pred)
    for i in y_pred_unq:
        if((i != -1) & (i!= 1) & (i!= 0) ):
            raise ValueError('The predictions can contain only -1, 1, or 0!')
    y_comp = y_true * y_pred
    score = float(10*np.sum(y_comp == -1) + np.sum(y_comp == 0))
    score /= y_comp.shape[0]
    return score

print("Predicting test samples...")
y_pred = clf.predict(X_test)
print("Computing score...")
s = compute_pred_score(y_test, y_pred)

print("Score on", len(y_pred), "samples:", s)

Fitting classifier...
Predicting test samples...
Computing score...
Score on 10560 samples: 0.23390151515151514


In [77]:
print("Predicting only confident results:")
T_pred = clf.predict_proba(X_test)


Predicting only confident results:


In [78]:
print(T_pred.shape)
# print(T_pred[0:30,...])
# print(y_test[0:30])
print("Selecting the confidence threshold")
for confidence in np.linspace(0,0.5,10):
    y_pred = []
    for (p1, p2) in T_pred:
        if p1 <= confidence:
            y_pred.append(1)
        elif p2 <= confidence:
            y_pred.append(-1)
        else:
            y_pred.append(0)

    print("Confidence threshold:", confidence, "score:", compute_pred_score(y_pred, y_test))

print(y_pred[0:30])

(10560, 2)
Selecting the confidence threshold
Confidence threshold: 0.0 score: 0.1434659090909091
Confidence threshold: 0.0555555555556 score: 0.1434659090909091
Confidence threshold: 0.111111111111 score: 0.1434659090909091
Confidence threshold: 0.166666666667 score: 0.1434659090909091
Confidence threshold: 0.222222222222 score: 0.14696969696969697
Confidence threshold: 0.277777777778 score: 0.14696969696969697
Confidence threshold: 0.333333333333 score: 0.14696969696969697
Confidence threshold: 0.388888888889 score: 0.14696969696969697
Confidence threshold: 0.444444444444 score: 0.23390151515151514
Confidence threshold: 0.5 score: 0.23390151515151514
[-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1]


## Prédiction :

In [79]:
testing = pd.read_csv("testing_templates.csv", header=None)
X_unknown = np.array(testing)
# print("Before dim reduction:",X_unknown.shape)
# X_unknown = pca.transform(X_unknown)
# print("After dim reduction:",X_unknown.shape)
X_unknown = StandardScaler().fit_transform(X_unknown)
print("Predicting for the unknown dataset")
T_pred = clf.predict_proba(X_unknown)
print(T_pred.shape)
# print(T_pred[0:30,...])
# print(T_pred[-30:,...])
for confidence in np.linspace(0.01, 0.45, 20):
    y_pred = []
    unsure = 0
    print("Selecting the best results with threshold = ",confidence)
    for proba in T_pred:
        p1, p2 = proba[0], proba[1]
        if p1 <= confidence:
            y_pred.append(1)
        elif p2 <= confidence:
            y_pred.append(-1)
        else:
    #         print("Not sure for sample", len(y_pred))
            y_pred.append(0)
            unsure += 1
    print(np.unique(y_pred))
    print("Unsure predictions: ", unsure,"/",len(y_pred))
    np.savetxt('y_pred'+str(confidence)+'.txt', y_pred, fmt='%d')

Predicting for the unknown dataset
(8496, 2)
Selecting the best results
[-1  0  1]
Unsure predictions:  2439 / 8496
Selecting the best results
[-1  0  1]
Unsure predictions:  2439 / 8496
Selecting the best results
[-1  0  1]
Unsure predictions:  2439 / 8496
Selecting the best results
[-1  0  1]
Unsure predictions:  2439 / 8496
Selecting the best results
[-1  0  1]
Unsure predictions:  2439 / 8496
Selecting the best results
[-1  0  1]
Unsure predictions:  2439 / 8496
Selecting the best results
[-1  0  1]
Unsure predictions:  2439 / 8496
Selecting the best results
[-1  0  1]
Unsure predictions:  2439 / 8496
Selecting the best results
[-1  0  1]
Unsure predictions:  2439 / 8496
Selecting the best results
[-1  0  1]
Unsure predictions:  892 / 8496
Selecting the best results
[-1  0  1]
Unsure predictions:  892 / 8496
Selecting the best results
[-1  0  1]
Unsure predictions:  892 / 8496
Selecting the best results
[-1  0  1]
Unsure predictions:  892 / 8496
Selecting the best results
[-1  0  1

In [80]:
np.savetxt('y_pred.txt', y_pred, fmt='%d')

In [81]:
#TODO: Réduire dimensions


In [82]:
#TODO: Essayer avec Gaussian Naive Bayes

In [83]:
print(len(y_pred))

8496
