In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from tqdm import tqdm,tqdm_notebook

from sklearn.model_selection import RepeatedKFold
from sklearn.neural_network import MLPClassifier

## Read CSV's

In [2]:
movies = pd.read_csv('comp-data/4a-training-dataset-creation/movies.csv')
users = pd.read_csv('comp-data/4a-training-dataset-creation/users.csv')
ratings3 = pd.read_csv('comp-data/4a-training-dataset-creation/ratings3.csv')
ratings4 = pd.read_csv('comp-data/4a-training-dataset-creation/ratings4.csv')

## Create the two Aprroaches of the Dataset
  * The first one implies that people who rated a movie with **2 or less** stars are **most likely not going to see that movie in the first place**
  * The second one implies that people who rated a movie with **3 or less** stars are **most likely not going to see that movie in the first place**

In [3]:
df3 = ((users.merge(ratings3, how='inner', on='user id'))\
.merge(movies, how='inner', left_on='item id', right_on='movie id'))\
.drop(['item id', 'movie id', 'user id'], axis=1)

df3.rename(columns={'rating': 'target'}, inplace=True)

In [5]:
df3.head(10)

Unnamed: 0,age,gender,zip code,ocupation,target,unknown,Action,Adventure,Animation,Children's,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release-year
0,2,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
1,4,0,1,7,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
2,3,1,2,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
3,2,0,3,10,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
4,4,0,4,7,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
5,5,0,5,17,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
6,2,0,6,5,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
7,2,0,7,5,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
8,5,0,8,10,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
9,2,0,9,5,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994


In [4]:
zpc_enc = pd.factorize(df3['zip code'], sort=False, order=None, na_sentinel=-1, size_hint=None)
df3['zip code'] = zpc_enc[0]

## Normalize the Dataset so that we eliminate outliers
  * Final Features: {age, gender, zip code, occupation, target, movie_genres (one-hot representation), release year}

In [6]:
# Normalize the Dataset
X = np.array(df3.drop(['target'], axis=1))
X = preprocessing.scale(X, axis=1)
# Separate the correct answer y from the Dataset
y = np.array(df3['target'])



## Initiate the Repeated K-fold Validation Learning Technique
  * for K = 5 (a.k.a 5 shards of the learning dataset)
  * and for 10 repeats (so as to have concrete proof of the machine's accuracy

## Create the Multilayered Perceptron

In [7]:
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=100)
clf = MLPClassifier(hidden_layer_sizes=(4,4), activation='logistic', solver='sgd', alpha=1e-5, learning_rate='adaptive', random_state=100, verbose=False)

## Training The (Deep) Neural Network...

In [8]:
accuracy = np.array([])

for train_index, test_index in tqdm_notebook(rkf.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train, y_train)
    accuracy = np.append(accuracy, [clf.score(X_test,y_test)])

A Jupyter Widget




## Print The Best and Worst Case Accuracy (we see ~82.98% at best and ~81.94% at worst)

In [12]:
print(np.amax(accuracy))
print(np.amin(accuracy))

0.829841492075
0.819481948195
