In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from tqdm import tqdm,tqdm_notebook

from sklearn import preprocessing
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.linear_model import LinearRegression

## Read Train CSV

In [2]:
df3 = pd.read_csv('comp-data/4a-training-dataset-creation/train.csv')

In [3]:
df3.head(3)

Unnamed: 0,age,gender,zip code,ocupation,target,unknown,Action,Adventure,Animation,Children's,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release-year
0,2,0,85711,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
1,4,0,29206,7,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
2,3,1,37212,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994


## Normalize the Dataset so that we eliminate outliers
  * Final Features: {age, gender, zip code, occupation, target, movie_genres (one-hot representation), release year}

In [4]:
# Normalize the Dataset
X = np.array(df3.drop(['target', 'zip code'], axis=1))
X = preprocessing.scale(X, axis=1)
# Separate the correct answer y from the Dataset
y = np.array(df3['target'])



## Initiate the Repeated K-fold Validation Learning Technique
  * for K = 5 (a.k.a 5 shards of the learning dataset)
  * and for 10 repeats (so as to have concrete proof of the machine's accuracy

In [5]:
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=100)

## Initiate the Least Squares Algorithm

In [6]:
clf = LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=-1)
accuracy = np.array([])

for train_index, test_index in tqdm_notebook(rkf.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train, y_train)
    accuracy = np.append(accuracy, [clf.score(X_test,y_test)])

A Jupyter Widget




## Print The Best and Worst Case Accuracy (we see ~2.2% at best and ~1.3% at worst)

In [7]:
print(np.amax(accuracy))
print(np.amin(accuracy))

0.0220490424448
0.0145960649968


## Print The Coefficients of the LS Line for Each Feature

In [23]:
for feature, coef in zip(df3.columns, clf.coef_):
    print ("Feature: ",feature,"\t\tCoefficient in LS Line: ",coef)

Feature:  age 		Coefficient in LS Line:  -59275403.8278
Feature:  gender 		Coefficient in LS Line:  -59275415.2115
Feature:  zip code 		Coefficient in LS Line:  -59275408.7048
Feature:  ocupation 		Coefficient in LS Line:  -59275581.0935
Feature:  target 		Coefficient in LS Line:  -59275417.748
Feature:  unknown 		Coefficient in LS Line:  -59275401.9718
Feature:  Action 		Coefficient in LS Line:  -59275372.8292
Feature:  Adventure 		Coefficient in LS Line:  -59275433.129
Feature:  Animation 		Coefficient in LS Line:  -59275414.7403
Feature:  Children's 		Coefficient in LS Line:  -59275405.395
Feature:  Comedy 		Coefficient in LS Line:  -59275394.1112
Feature:  Crime 		Coefficient in LS Line:  -59275385.2372
Feature:  Documentary 		Coefficient in LS Line:  -59275428.2778
Feature:  Drama 		Coefficient in LS Line:  -59275373.1127
Feature:  Fantasy 		Coefficient in LS Line:  -59275427.8511
Feature:  Film-Noir 		Coefficient in LS Line:  -59275404.7933
Feature:  Horror 		Coefficient in LS Li

# ~ END OF CHAPTER 4 - LEAST SQUARES (Linear Regression) CLASSIFIER ~