In [None]:
# import packages
import pickle as pk

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor


In [None]:
# Read the two CSV tables from the ml-25m dataset folder:
# the per-user ratings and the movie table they are later joined with.
ratings = pd.read_csv("../datasets/ml-25m/ratings.csv")
movies = pd.read_csv("../datasets/ml-25m/movies.csv")

In [None]:
# One-hot encode the pipe-separated `genres` string: one 0/1 column per genre label.
genres_encoded = movies["genres"].str.get_dummies(sep="|")

# Attach the indicator columns to the movie table, then discard the raw `genres`
# text and the "(no genres listed)" placeholder indicator.
df_encoded = pd.concat([movies, genres_encoded], axis=1).drop(
    columns=["genres", "(no genres listed)"]
)

In [None]:
# Derive a numeric `year` column from the first parenthesised four-digit group
# in the title. Titles without such a group get the sentinel value 0.
# NOTE(review): `year` stays in the predictor list, so the 0 sentinel is fed to the
# models as if it were a real year — confirm this is intended.
df_encoded["year"] = (
    df_encoded["title"]
    .str.extract(r'\((\d{4})\)', expand=False)
    .fillna(0)
    .astype("int64")
)
df_encoded.head()

In [None]:
# Inner-join every rating with the encoded attributes of the movie it refers to,
# matching rows on the shared `movieId` key.
merged = ratings.merge(df_encoded, how="inner", on="movieId")
merged.head()

In [None]:
# Remove the rating timestamp so it does not end up among the predictors.
merged = merged.drop(columns=["timestamp"])

In [None]:
# Count the missing values in each column of the merged table.
merged.isna().sum(axis=0)

In [None]:
# Take a random sample of 100k records from the merged dataset.
# The sample is seeded (same seed as the train/test split and the CV splitter) so
# that the subset, and every score computed from it, is identical on each
# Restart & Run All.
merged_sample = merged.sample(n=100000, random_state=104)

In [None]:
# The rating is a float in half-star steps; the classifiers need discrete labels,
# so each rating 0.5, 1.0, ..., 5.0 is mapped to an integer class 1, 2, ..., 10.
encoder = {half_stars / 2: half_stars for half_stars in range(1, 11)}

# Encode from the untouched ratings in `merged` (looked up by row label) instead of
# from the sample's own column. Re-running this cell therefore always yields the
# same labels, rather than re-mapping already encoded values (1 -> 2, 2 -> 4, ...)
# or turning them into NaN.
rating_classes = merged.loc[merged_sample.index, "rating"].map(encoder)

# A rating that is not a key of `encoder` maps to NaN; fail loudly instead of
# passing NaN labels on to the models.
if rating_classes.isna().any():
    raise ValueError("found ratings that are not multiples of 0.5 in [0.5, 5]")

merged_sample["rating"] = rating_classes.astype("int64")

In [None]:
# Predictors are all columns except the target ("rating") and the free-text "title".
# Identifier columns such as movieId therefore remain in the list.
columns_X = [name for name in merged_sample.columns if name not in ("rating", "title")]
columns_X

In [None]:
# Feature matrix X (the predictor columns) and target vector y (the rating column).
X, y = merged_sample[columns_X], merged_sample["rating"]

In [None]:
# Candidate algorithms to compare, as (short name, unfitted estimator) pairs.
# The target is a discrete rating class and the comparison is scored with accuracy,
# so the decision tree must be the classifier variant: a regressor predicts
# continuous values, which the accuracy metric cannot score.
# The tree-based models are seeded so their results are repeatable between runs.
models = [
    ('LDA', LinearDiscriminantAnalysis()),
    ('DT', DecisionTreeClassifier(random_state=104)),
    ('RF', RandomForestClassifier(random_state=104)),
    ('NB', GaussianNB()),
]

In [None]:
# Hold out 30% of the rows as a test set and keep the rest for training.
# Rows are shuffled with a fixed seed, so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=104,
)


In [None]:
# Standardise the features. The scaler is fitted on the training split only, and
# that same fitted transform is then applied to both the training and testing sets.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Compare the candidate models with stratified 5-fold cross-validation on the
# training split, scoring each by accuracy, to see which one performs best.

# The splitter does not depend on the model, so it is built once and shared;
# with a fixed seed every model is evaluated on the same folds.
kfold = StratifiedKFold(n_splits=5, random_state=104, shuffle=True)

results = []  # per-model arrays of fold accuracies
names = []    # matching short model names, in the same order
for name, model in models:
    # Evaluate on the standardised features produced by the scaling step. This
    # matters for GaussianNB, whose variance smoothing is a fraction of the largest
    # feature variance: a raw large-valued column (e.g. movieId) would otherwise
    # swamp the 0/1 genre indicators.
    # NOTE(review): the scaler was fitted on the whole training split, so each
    # validation fold contributed to the scaling statistics; wrap scaler + model in
    # a Pipeline if strict per-fold isolation is required.
    cv_results = cross_val_score(model, X_train_scaled, y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

In [None]:
# Box plot of the cross-validation scores: one box per algorithm, labelled by name.
pyplot.title('Algorithm Comparison')
pyplot.boxplot(results, labels=names)
pyplot.show()

In [None]:
# LDA was chosen from the algorithm comparison; now try to optimise it with a
# grid search.
#
# Only hyper-parameters that can change the fitted decision rule are searched:
#   * `tol`       - rank-estimation threshold of the 'svd' solver
#   * `shrinkage` - covariance regularisation, available with the 'lsqr' solver
# `n_components` (affects only `transform`) and `store_covariance` (only stores an
# extra attribute) are left at their defaults: varying them would refit the same
# classifier many times without changing any score.
parameters = [
    {
        'solver': ['svd'],
        'tol': [1e-4, 1e-3, 1e-2],
    },
    {
        'solver': ['lsqr'],
        'shrinkage': [None, 'auto', 0.01, 0.1, 0.5],
    },
]

# Initialize LDA
lda = LinearDiscriminantAnalysis()

# Perform grid search: 3-fold CV on the training split, scored by accuracy,
# using all available cores.
grid_search = GridSearchCV(lda, parameters, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
# Create the final model with the parameters that achieved the best grid-search
# score, fit it on the training split and predict the held-out test split.
model = LinearDiscriminantAnalysis(**grid_search.best_params_)
model.fit(X_train, y_train)
y_pred_test = model.predict(X_test)

# Mean squared error between the true and predicted rating labels, i.e. in the
# units of whatever encoding the `rating` column carries at this point.
mse = mean_squared_error(y_test, y_pred_test)

# Show the metric as the cell output so the result is visible in the notebook.
mse

At this point, we have chosen and optimised one of the models available in the sklearn package.  
In the next part, we will compare it to an algorithm from the Surprise package, which is inspired by the SVD algorithm. 