In [23]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from tqdm import tqdm,tqdm_notebook

from sklearn.model_selection import RepeatedKFold
from sklearn.neural_network import MLPClassifier

## Read CSV's

In [24]:
df3 = pd.read_csv('comp-data/4a-training-dataset-creation/train.csv')

In [25]:
df3.head(3)

Unnamed: 0,age,gender,zip code,ocupation,target,unknown,Action,Adventure,Animation,Children's,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release-year
0,2,0,85711,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
1,4,0,29206,7,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
2,3,1,37212,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994


## Normalize the Dataset so that we eliminate outliers
  * Final Features: {age, gender, zip code, occupation, target, movie_genres (one-hot representation), release year}

In [12]:
# Normalize the Dataset
X = np.array(df3.drop(['target', 'zip code'], axis=1))
X = preprocessing.scale(X, axis=1)
# Separate the correct answer y from the Dataset
y = np.array(df3['target'])



## Initiate the Repeated K-fold Validation Learning Technique
  * for K = 5 (a.k.a 5 shards of the learning dataset)
  * and for 10 repeats (so as to have concrete proof of the machine's accuracy

## Create the Multilayered Perceptron

In [13]:
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=100)
clf = MLPClassifier(hidden_layer_sizes=(4,4), activation='logistic', solver='sgd', alpha=1e-5, learning_rate='adaptive', random_state=100, verbose=False)

## Training The (Deep) Neural Network...

In [14]:
accuracy = np.array([])

for train_index, test_index in tqdm_notebook(rkf.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train, y_train)
    accuracy = np.append(accuracy, [clf.score(X_test,y_test)])

A Jupyter Widget




## Print The Best and Worst Case Accuracy (we see ~82.98% at best and ~81.94% at worst)

In [15]:
print(np.amax(accuracy))
print(np.amin(accuracy))

0.829841492075
0.819481948195


## Print The Coefficients for each neuron connection for every layer of the MLP

In [39]:
layers = len(clf.coefs_)

for i in range(layers):
    weight_matrix_i = clf.coefs_[i]
    print ('>>> Weight Matrix Layer ', i+1, '\n')
    tmp = pd.DataFrame(weight_matrix_i, columns=range(weight_matrix_i.shape[1]))
    print (tmp, '\n\n')

>>> Weight Matrix Layer  1 

           0         1         2         3
0   0.023781 -0.120564 -0.043144  0.187900
1  -0.269434 -0.205923  0.090773  0.177617
2  -0.197601  0.040936  0.210999 -0.158062
3  -0.171123 -0.213103 -0.154751  0.260775
4   0.169821 -0.178505  0.169958 -0.122733
5  -0.037013  0.239590  0.170734 -0.088964
6  -0.176521 -0.069152 -0.271241 -0.134517
7   0.161101 -0.263794  0.051626  0.056750
8  -0.214767 -0.064196 -0.254493  0.212760
9   0.261942 -0.239467  0.210417  0.042104
10  0.132152  0.070932  0.042376 -0.260794
11 -0.157681  0.024398  0.144352 -0.135466
12 -0.116381  0.191888  0.256385  0.209733
13 -0.076312  0.053881 -0.081210 -0.086745
14 -0.175067 -0.142713 -0.249925  0.003203
15 -0.067197  0.050586  0.068559 -0.194299
16  0.236316  0.243048  0.053513 -0.060848
17 -0.074309 -0.160863 -0.123674 -0.137726
18 -0.177502  0.254058  0.246594  0.053575
19  0.126066 -0.086815 -0.224226 -0.019625
20  0.004897 -0.223943  0.013098  0.268139
21 -0.056972 -0.089421  0

In [20]:
clf.coefs_

[array([[ 0.0237808 , -0.12056436, -0.04314429,  0.1878995 ],
        [-0.26943395, -0.20592331,  0.0907731 ,  0.17761659],
        [-0.19760118,  0.04093634,  0.21099947, -0.15806237],
        [-0.17112256, -0.21310336, -0.1547511 ,  0.26077452],
        [ 0.16982135, -0.1785048 ,  0.16995752, -0.12273296],
        [-0.03701279,  0.23959011,  0.17073402, -0.08896419],
        [-0.17652124, -0.06915222, -0.27124075, -0.13451708],
        [ 0.16110114, -0.26379353,  0.05162614,  0.05674993],
        [-0.21476666, -0.06419618, -0.25449253,  0.21276043],
        [ 0.26194227, -0.23946739,  0.21041706,  0.04210404],
        [ 0.13215185,  0.07093207,  0.0423756 , -0.26079449],
        [-0.15768131,  0.02439768,  0.14435211, -0.13546618],
        [-0.11638053,  0.19188795,  0.25638509,  0.20973273],
        [-0.07631155,  0.05388147, -0.08120973, -0.08674489],
        [-0.17506706, -0.14271342, -0.2499246 ,  0.00320285],
        [-0.06719682,  0.05058579,  0.06855889, -0.1942986 ],
        