In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from tqdm import tqdm,tqdm_notebook

from sklearn.model_selection import RepeatedKFold
from sklearn.neural_network import MLPClassifier

## Read CSV's

In [2]:
df3 = pd.read_csv('comp-data/4a-training-dataset-creation/train.csv')

In [3]:
df3.head(3)

Unnamed: 0,age,gender,zip code,ocupation,target,unknown,Action,Adventure,Animation,Children's,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release-year
0,2,0,85711,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
1,4,0,29206,7,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
2,3,1,37212,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994


In [4]:
zpc_enc = pd.factorize(df3['zip code'], sort=False, order=None, na_sentinel=-1, size_hint=None)
df3['zip code'] = zpc_enc[0]

## Normalize the Dataset so that we eliminate outliers
  * Final Features: {age, gender, zip code, occupation, target, movie_genres (one-hot representation), release year}

In [5]:
# Normalize the Dataset
X = np.array(df3.drop(['target'], axis=1))
X = preprocessing.scale(X, axis=1)
# Separate the correct answer y from the Dataset
y = np.array(df3['target'])



## Initiate the Repeated K-fold Validation Learning Technique
  * for K = 5 (a.k.a 5 shards of the learning dataset)
  * and for 10 repeats (so as to have concrete proof of the machine's accuracy

## Create the Multilayered Perceptron

In [6]:
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=100)
clf = MLPClassifier(hidden_layer_sizes=(4,4), activation='logistic', solver='sgd', alpha=1e-5, learning_rate='adaptive', random_state=100, verbose=False)

## Training The (Deep) Neural Network...

In [7]:
accuracy = np.array([])

for train_index, test_index in tqdm_notebook(rkf.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train, y_train)
    accuracy = np.append(accuracy, [clf.score(X_test,y_test)])

A Jupyter Widget




## Print The Best and Worst Case Accuracy (we see ~82.98% at best and ~81.94% at worst)

In [8]:
print(np.amax(accuracy))
print(np.amin(accuracy))

0.829841492075
0.819481948195


## Print The Coefficients for each neuron connection for every layer of the MLP

In [12]:
clf.coefs_

[array([[ 0.02019929, -0.11911074, -0.04062271,  0.18058035],
        [-0.26787537, -0.20298129,  0.09099156,  0.17030353],
        [-0.18816625,  0.04213448,  0.20948576, -0.14784703],
        [-0.17112062, -0.20999262, -0.15008327,  0.25223312],
        [ 0.16345933, -0.17605564,  0.16875026, -0.1246412 ],
        [-0.03964504,  0.23450175,  0.16951322, -0.09147641],
        [-0.17663939, -0.0686754 , -0.26449829, -0.13620853],
        [ 0.15489819, -0.25980664,  0.05255519,  0.05160966],
        [-0.21420201, -0.06380841, -0.24804144,  0.20480171],
        [ 0.25391444, -0.23593202,  0.2084794 ,  0.03722062],
        [ 0.12647423,  0.06888816,  0.04346755, -0.26020769],
        [-0.15813871,  0.02318421,  0.14356918, -0.13713704],
        [-0.11754342,  0.18768745,  0.25362018,  0.20188276],
        [-0.07823948,  0.05213995, -0.07789335, -0.08930195],
        [-0.17521121, -0.14090667, -0.24355999, -0.00097146],
        [-0.06929285,  0.04890047,  0.06917904, -0.19492207],
        