In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from tqdm import tqdm,tqdm_notebook

from sklearn import preprocessing
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.linear_model import LinearRegression

## Read CSV's

In [2]:
movies = pd.read_csv('comp-data/4a-training-dataset-creation/movies.csv')
users = pd.read_csv('comp-data/4a-training-dataset-creation/users.csv')
ratings3 = pd.read_csv('comp-data/4a-training-dataset-creation/ratings3.csv')

## Create Class Labels of our Dataset
   * The Label 0 implies that people who rated a movie with **2 or less** stars are **most likely were not going to see that movie in the first place**
   * The Lavel 1 implies that people who rated a movie with **3 or more** stars are **most likely were going to see that movie in the first place**

In [3]:
df3 = ((users.merge(ratings3, how='inner', on='user id'))\
.merge(movies, how='inner', left_on='item id', right_on='movie id'))\
.drop(['item id', 'movie id', 'user id'], axis=1)

In [4]:
df3.head(10)

Unnamed: 0,age,gender,zip code,ocupation,target,unknown,Action,Adventure,Animation,Children's,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release-year
0,2,0,85711,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
1,4,0,29206,7,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
2,3,1,37212,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
3,2,0,52246,10,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
4,4,0,8403,7,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
5,5,0,6472,17,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
6,2,0,2215,5,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
7,2,0,71457,5,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
8,5,0,20910,10,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
9,2,0,53703,5,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994


## Normalize the Dataset so that we eliminate outliers
  * Final Features: {age, gender, zip code, occupation, target, movie_genres (one-hot representation), release year}

In [5]:
# Normalize the Dataset
X = np.array(df3.drop(['target', 'zip code'], axis=1))
X = preprocessing.scale(X, axis=1)
# Separate the correct answer y from the Dataset
y = np.array(df3['target'])



## Initiate the Repeated K-fold Validation Learning Technique
  * for K = 5 (a.k.a 5 shards of the learning dataset)
  * and for 10 repeats (so as to have concrete proof of the machine's accuracy

In [6]:
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=100)

## Initiate the Least Squares Algorithm

In [12]:
clf = LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=-1)
accuracy = np.array([])

for train_index, test_index in tqdm_notebook(rkf.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train, y_train)
    accuracy = np.append(accuracy, [clf.score(X_test,y_test)])

A Jupyter Widget




## Print The Best and Worst Case Accuracy (we see ~2.2% at best and ~1.3% at worst)

In [11]:
print(np.amax(accuracy))
print(np.amin(accuracy))

0.0220490424448
0.0145960649968


# ~ END OF CHAPTER 4 - LEAST SQUARES (Linear Regression) CLASSIFIER ~