In [1]:
# Standard library
import time

# Third-party: array / file handling
import h5py
import numpy as np

# Third-party: scikit-learn utilities
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
# cross_val_score is imported from sklearn.model_selection: the older
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20, so importing from it fails on current releases.
from sklearn.model_selection import cross_val_score, train_test_split

# Third-party: scikit-learn classifiers compared in this notebook
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

  from ._conv import register_converters as _register_converters
  from numpy.core.umath_tests import inner1d


# Lectura de Datos

In [2]:
def _read_hdf5_array(path, dataset='dataset_1'):
    """Read one dataset from an HDF5 file fully into memory.

    Parameters
    ----------
    path : str
        HDF5 file to open in read-only mode.
    dataset : str, optional
        Name of the dataset inside the file (defaults to 'dataset_1').

    Returns
    -------
    The complete contents of the dataset (an ndarray for array datasets).

    Raises
    ------
    KeyError
        If ``dataset`` is not present in the file.
    """
    # The context manager closes the file even when the read raises,
    # which the manual open/close pair did not guarantee.
    with h5py.File(path, 'r') as h5_file:
        # [()] reads the whole dataset; it replaces Dataset.value, which is
        # deprecated and no longer available in h5py 3.x.
        return h5_file[dataset][()]


# Feature matrix and label vector live in two separate HDF5 files.
X = _read_hdf5_array('data_100_ytf.hdf5')
y = _read_hdf5_array('labels_100_ytf.hdf5')
print(X.shape, y.shape)

(100, 2048) (100,)


# Train Test Split

In [3]:
# Hold out 33% of the samples for testing; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=6,
)
# Shapes are reported in the order: train features, train labels,
# test features, test labels.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

(67, 2048) (67,) (33, 2048) (33,)


# Prueba de n_neighbors

In [4]:
# Sweep the neighbour count of a KNN classifier and record, for each value,
# the mean accuracy over 10-fold cross-validation on the full X, y.

# Candidate neighbour counts: 1 through 90 inclusive.
k_range = range(1, 91)

# One mean accuracy per candidate, in the same order as k_range.
k_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=n_neighbors),
        X,
        y,
        cv=10,
        scoring='accuracy',
    ).mean()
    for n_neighbors in k_range
]


print(k_scores)

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.99, 0.9800000000000001, 0.97, 0.96, 0.95, 0.9400000000000001, 0.9200000000000002, 0.9100000000000001, 0.9, 0.9, 0.89, 0.86, 0.8400000000000001, 0.8300000000000001, 0.8, 0.7900000000000001, 0.74, 0.74, 0.71, 0.7, 0.6900000000000001, 0.65, 0.62, 0.5900000000000001, 0.5700000000000001, 0.54, 0.5400000000000001, 0.53, 0.5000000000000001, 0.51, 0.5, 0.48, 0.45999999999999996, 0.45, 0.47000000000000003, 0.45999999999999996, 0.45999999999999996, 0.45, 0.44000000000000006, 0.43, 0.43, 0.43, 0.43, 0.42000000000000004, 0.42000000000000004, 0.4, 0.4, 0.38, 0.37, 0.35, 0.35, 0.35, 0.33999999999999997, 0.33999999999999997, 0.32999999999999996, 0.31999999999999995, 0.30999999999999994, 0.29999999999999993, 0.27999999999999997, 0.26999999999999996, 0.25, 0.22000000000000003, 0.21000000000000002, 0.21000000000000002, 0.18, 0.16000000000000003, 0.12000000000000002, 0.11000000000000001, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]


In [5]:
# Build a KNN classifier with 6 neighbours; in the k_scores printed by the
# search cell, k=6 is the largest k whose mean CV accuracy is still 1.0.
neigh = KNeighborsClassifier(n_neighbors=6)
# Fit on the complete dataset (not the train split). Leaving the call as the
# cell's last expression makes the notebook display the fitted estimator.
neigh.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='uniform')

# KNN

In [6]:
# KNN (6 neighbours, p=1): time one fit on the training split plus a 10-fold
# cross-validation on the full dataset, and print per-fold and mean accuracy.
# NOTE(review): the k search cell used the default p, while this model sets
# p=1 -- confirm that k=6 is still the intended choice for this metric.

# perf_counter() is a monotonic high-resolution clock, so the measured
# interval cannot be distorted by system clock adjustments as with time.time().
start_time = time.perf_counter()
clf = KNeighborsClassifier(n_neighbors=6, p=1)
# This fit is part of the timed region. cross_val_score below works on clones
# of clf fitted per fold, so this fitted state does not influence the scores.
clf.fit(X_train, y_train)

scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print(scores)
print(scores.mean())

# Elapsed time converted from seconds to milliseconds.
print("%s ms" % ((time.perf_counter() - start_time)*1000))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
1.0
60.059547424316406 ms


# Decision Tree

In [7]:
# Decision tree: time one fit on the training split plus a 10-fold
# cross-validation on the full dataset, and print per-fold and mean accuracy.

# perf_counter() is a monotonic high-resolution clock, so the measured
# interval cannot be distorted by system clock adjustments as with time.time().
start_time = time.perf_counter()
# random_state is pinned (same seed as the train/test split) so that re-running
# the cell reproduces the same tree and therefore the same scores.
clf = DecisionTreeClassifier(max_depth=250, min_samples_split=2, random_state=6)
# This fit is part of the timed region. cross_val_score below works on clones
# of clf fitted per fold, so this fitted state does not influence the scores.
clf.fit(X_train, y_train)

scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print(scores)
print(scores.mean())

# Elapsed time converted from seconds to milliseconds.
print("%s ms" % ((time.perf_counter() - start_time)*1000))

[0.8 1.  0.9 0.9 0.9 0.8 0.7 0.9 0.7 0.9]
0.85
752.4216175079346 ms


# Random Forest

In [8]:
# Random forest (200 trees, 5 candidate features per split): time one fit on
# the training split plus a 10-fold cross-validation on the full dataset, and
# print per-fold and mean accuracy.

# perf_counter() is a monotonic high-resolution clock, so the measured
# interval cannot be distorted by system clock adjustments as with time.time().
start_time = time.perf_counter()
# random_state is pinned (same seed as the train/test split) so the bootstrap
# samples and feature subsets, and hence the scores, repeat across runs.
clf = RandomForestClassifier(
    max_depth=250,
    n_estimators=200,
    max_features=5,
    random_state=6,
)
# This fit is part of the timed region. cross_val_score below works on clones
# of clf fitted per fold, so this fitted state does not influence the scores.
clf.fit(X_train, y_train)

scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print(scores)
print(scores.mean())

# Elapsed time converted from seconds to milliseconds.
print("%s ms" % ((time.perf_counter() - start_time)*1000))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
1.0
2584.2485427856445 ms


# Red Neuronal

In [9]:
# Neural network (MLP with L2 penalty alpha=3.1): time one fit on the training
# split plus a 10-fold cross-validation on the full dataset, and print
# per-fold and mean accuracy.

# perf_counter() is a monotonic high-resolution clock, so the measured
# interval cannot be distorted by system clock adjustments as with time.time().
start_time = time.perf_counter()
# random_state is pinned (same seed as the train/test split) so weight
# initialisation, and hence the scores, repeat across runs.
clf = MLPClassifier(alpha=3.1, random_state=6)
# This fit is part of the timed region. cross_val_score below works on clones
# of clf fitted per fold, so this fitted state does not influence the scores.
clf.fit(X_train, y_train)

scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print(scores)
print(scores.mean())

# Elapsed time converted from seconds to milliseconds.
print("%s ms" % ((time.perf_counter() - start_time)*1000))




[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
1.0
18689.08166885376 ms




# AdaboostClassifier

In [10]:
# AdaBoost (100 estimators, learning rate 0.5): time one fit on the training
# split plus a 10-fold cross-validation on the full dataset, and print
# per-fold and mean accuracy.

# perf_counter() is a monotonic high-resolution clock, so the measured
# interval cannot be distorted by system clock adjustments as with time.time().
start_time = time.perf_counter()
# random_state is pinned (same seed as the train/test split) so that re-running
# the cell reproduces the same ensemble and therefore the same scores.
clf = AdaBoostClassifier(n_estimators=100, learning_rate=0.5, random_state=6)
# This fit is part of the timed region. cross_val_score below works on clones
# of clf fitted per fold, so this fitted state does not influence the scores.
clf.fit(X_train, y_train)

scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print(scores)
print(scores.mean())

# Elapsed time converted from seconds to milliseconds.
print("%s ms" % ((time.perf_counter() - start_time)*1000))

[0.9 1.  0.9 1.  0.8 1.  1.  0.9 0.9 0.9]
0.93
18143.638134002686 ms


# Bayes

In [11]:
# Gaussian naive Bayes: time one fit on the training split plus a 10-fold
# cross-validation on the full dataset, and print per-fold and mean accuracy.

# perf_counter() is a monotonic high-resolution clock, so the measured
# interval cannot be distorted by system clock adjustments as with time.time().
start_time = time.perf_counter()
clf = GaussianNB()
# This fit is part of the timed region. cross_val_score below works on clones
# of clf fitted per fold, so this fitted state does not influence the scores.
clf.fit(X_train, y_train)

scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print(scores)
print(scores.mean())

# Elapsed time converted from seconds to milliseconds.
print("%s ms" % ((time.perf_counter() - start_time)*1000))

[1.  1.  1.  1.  0.9 0.9 1.  0.9 1.  1. ]
0.97
58.594703674316406 ms


# Quadratic Discriminant Analysis

In [12]:
# Quadratic discriminant analysis: time one fit on the training split plus a
# 10-fold cross-validation on the full dataset, and print per-fold and mean
# accuracy.

# perf_counter() is a monotonic high-resolution clock, so the measured
# interval cannot be distorted by system clock adjustments as with time.time().
start_time = time.perf_counter()
clf = QuadraticDiscriminantAnalysis()
# This fit is part of the timed region. cross_val_score below works on clones
# of clf fitted per fold, so this fitted state does not influence the scores.
clf.fit(X_train, y_train)

scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print(scores)
print(scores.mean())


# Elapsed time converted from seconds to milliseconds.
print("%s ms" % ((time.perf_counter() - start_time)*1000))

[0.7 0.5 0.5 0.4 0.5 0.3 0.7 0.4 0.3 0.3]
0.45999999999999996
143.3699131011963 ms




# SVM

In [13]:
# Support vector classifier (gamma=1, C=1): time one fit on the training split
# plus a 10-fold cross-validation on the full dataset, and print per-fold and
# mean accuracy.

# perf_counter() is a monotonic high-resolution clock, so the measured
# interval cannot be distorted by system clock adjustments as with time.time().
start_time = time.perf_counter()
clf = SVC(gamma=1, C=1)
# This fit is part of the timed region. cross_val_score below works on clones
# of clf fitted per fold, so this fitted state does not influence the scores.
clf.fit(X_train, y_train)

scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print(scores)
print(scores.mean())

# Elapsed time converted from seconds to milliseconds.
print("%s ms" % ((time.perf_counter() - start_time)*1000))

[0.3 0.4 0.3 0.8 0.3 0.4 0.2 0.2 0.3 0.2]
0.34
348.59681129455566 ms
