In [1]:
# Standard library
import time

# Third-party
import h5py
import numpy as np
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity
# cross_val_score is imported from model_selection: the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

  from ._conv import register_converters as _register_converters
  from numpy.core.umath_tests import inner1d


In [2]:

# Load the feature matrix (X) and the label vector (y) from their HDF5 files.
#
# * `with` closes each file even if the read raises.
# * `dataset[()]` reads the whole dataset into memory; it replaces the
#   deprecated `Dataset.value` accessor, which h5py 3.0 removed.
# * Indexing the file (instead of `.get`) raises KeyError if 'dataset_1' is
#   missing, rather than failing later on a None.
with h5py.File('./YTF_hdf5/data_ytf_100_rn18.hdf5', 'r') as f1:
    X = f1['dataset_1'][()]

with h5py.File('./YTF_hdf5/labels_ytf_100_rn18.hdf5', 'r') as f2:
    y = f2['dataset_1'][()]

print(X.shape, y.shape)

(100, 512) (100,)


In [16]:
# Hold out 33% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=6,
)

# Sanity check: shapes of the four splits, then of the full data set.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print(X.shape, y.shape)

(67, 512) (67,) (33, 512) (33,)
(100, 512) (100,)


In [4]:
# Baseline: 5-nearest-neighbour classifier trained on the training split
# and scored on the held-out test split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Last expression of the cell, so the accuracy is shown as the cell output.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

1.0

In [5]:
# 10-fold cross-validated accuracy of the `knn` estimator on the full data set:
# per-fold scores first, then their mean.
scores = cross_val_score(estimator=knn, X=X, y=y, scoring='accuracy', cv=10)
print(scores, scores.mean(), sep='\n')

[1.  1.  1.  1.  1.  0.9 1.  1.  1.  1. ]
0.99


In [6]:
# Hyper-parameter sweep: how does the neighbourhood size k affect KNN accuracy?

k_range = range(1, 19)   # candidate values of k: 1 through 18
k_scores = []            # mean CV accuracy for each k, in k_range order

for k in k_range:
    # Fresh estimator for this k, scored with 10-fold cross-validation.
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(estimator=knn, X=X, y=y, scoring='accuracy', cv=10)
    # Keep only the mean over the ten folds.
    k_scores.append(scores.mean())


print(k_scores)

[0.99, 0.99, 0.99, 0.9800000000000001, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.9800000000000001, 0.99, 0.97, 0.9400000000000001, 0.9400000000000001, 0.9200000000000002, 0.9099999999999999, 0.9100000000000001]


In [7]:
# 3-nearest-neighbour model trained on every available sample.
# `fit` is the last expression, so the fitted estimator is shown as the cell output.
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X=X, y=y)

# Disabled inference sketch, kept for reference. `newPics` and `n` are not
# defined in the part of the notebook visible here (TODO: confirm where they come from).
#testVec = [x/n for x in [sum(x) for x in newPics]]
#print(neigh.predict([testVec]))
#print(neigh.predict_proba([testVec]))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [8]:
# Benchmark: 1-nearest-neighbour with p=1 (Minkowski metric with p=1, i.e. Manhattan distance).
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
clf = KNeighborsClassifier(n_neighbors=1, p=1)
clf.fit(X_train, y_train)  # NOTE: this fit is part of the elapsed time reported below

# 10-fold cross-validated accuracy on the full data set.
scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print('KNeighborsClassifier:')
print(scores)
print('Mean:', scores.mean())

print("%s ms" % ((time.perf_counter() - start_time)*1000),'\n')

KNeighborsClassifier:
[1.  1.  1.  1.  1.  0.9 1.  1.  1.  1. ]
Mean: 0.99
28.392314910888672 ms 



In [9]:
# Benchmark: decision tree.
# random_state pins the estimator's internal randomness so repeated runs give
# the same scores; 6 matches the seed used for the train/test split.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
clf = DecisionTreeClassifier(max_depth=250, min_samples_split=2, random_state=6)
clf.fit(X_train, y_train)  # NOTE: this fit is part of the elapsed time reported below

# 10-fold cross-validated accuracy on the full data set.
scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print('DecisionTreeClassifier:')
print(scores)
print('Mean:', scores.mean())

print("%s ms" % ((time.perf_counter() - start_time)*1000),'\n')

DecisionTreeClassifier:
[0.8 0.9 0.9 1.  0.9 0.7 1.  0.8 0.8 0.9]
Mean: 0.8699999999999999
185.20784378051758 ms 



In [10]:
# Benchmark: random forest (200 trees, 5 candidate features per split).
# random_state pins the estimator's internal randomness so repeated runs give
# the same scores; 6 matches the seed used for the train/test split.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
clf = RandomForestClassifier(max_depth=250, n_estimators=200, max_features=5,
                             random_state=6)
clf.fit(X_train, y_train)  # NOTE: this fit is part of the elapsed time reported below

# 10-fold cross-validated accuracy on the full data set.
scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print('RandomForestClassifier:')
print(scores)
print('Mean:', scores.mean())

print("%s ms" % ((time.perf_counter() - start_time)*1000),'\n')

RandomForestClassifier:
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Mean: 1.0
2591.107130050659 ms 



In [11]:
# Benchmark: multi-layer perceptron with L2 penalty alpha=3.1.
# random_state pins the estimator's internal randomness so repeated runs give
# the same scores; 6 matches the seed used for the train/test split.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
clf = MLPClassifier(alpha=3.1, random_state=6)
clf.fit(X_train, y_train)  # NOTE: this fit is part of the elapsed time reported below

# 10-fold cross-validated accuracy on the full data set.
scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print('MLPClassifier:')

print(scores)
print('Mean:', scores.mean())

print("%s ms" % ((time.perf_counter() - start_time)*1000),'\n')




MLPClassifier:
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Mean: 1.0
6597.861289978027 ms 





# AdaBoostClassifier

In [12]:
# Benchmark: AdaBoost (100 estimators, learning rate 0.5).
# random_state pins the estimator's internal randomness so repeated runs give
# the same scores; 6 matches the seed used for the train/test split.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
clf = AdaBoostClassifier(n_estimators=100, learning_rate=0.5, random_state=6)
clf.fit(X_train, y_train)  # NOTE: this fit is part of the elapsed time reported below

# 10-fold cross-validated accuracy on the full data set.
scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print('AdaBoostClassifier:')

print(scores)
print('Mean:', scores.mean())

print("%s ms" % ((time.perf_counter() - start_time)*1000))

AdaBoostClassifier:
[0.9 0.7 0.9 0.8 1.  0.9 0.6 0.7 0.7 0.7]
Mean: 0.79
5778.168439865112 ms


# Gaussian Naive Bayes

In [13]:
# Benchmark: Gaussian naive Bayes with default settings.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
clf = GaussianNB()
clf.fit(X_train, y_train)  # NOTE: this fit is part of the elapsed time reported below

# 10-fold cross-validated accuracy on the full data set.
scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print('GaussianNB:')

print(scores)
print('Mean:', scores.mean())

print("%s ms" % ((time.perf_counter() - start_time)*1000))

GaussianNB:
[0.9 0.9 1.  0.9 0.9 0.9 1.  0.9 0.8 1. ]
Mean: 0.9200000000000002
41.01109504699707 ms


In [14]:
# Benchmark: quadratic discriminant analysis with default settings.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
clf = QuadraticDiscriminantAnalysis()
clf.fit(X_train, y_train)  # NOTE: this fit is part of the elapsed time reported below

# 10-fold cross-validated accuracy on the full data set.
scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print('QuadraticDiscriminantAnalysis:')

print(scores)
print('Mean:', scores.mean())


print("%s ms" % ((time.perf_counter() - start_time)*1000))

QuadraticDiscriminantAnalysis:
[0.6 0.6 0.8 0.6 0.3 0.6 0.2 0.3 0.5 0.4]
Mean: 0.49000000000000005
59.229373931884766 ms




In [15]:
# Benchmark: support vector classifier with gamma=1 and C=1.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
clf = SVC(gamma=1, C=1)
clf.fit(X_train, y_train)  # NOTE: this fit is part of the elapsed time reported below

# 10-fold cross-validated accuracy on the full data set.
scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
print('SVC:')

print(scores)
print('Mean:', scores.mean())

print("%s ms" % ((time.perf_counter() - start_time)*1000))

SVC:
[0.4 0.4 0.3 0.6 0.3 0.4 0.3 0.2 0.3 0.3]
Mean: 0.35
103.25026512145996 ms
