In [196]:
# Importing some functions to use in the implementation of the ML method
import numpy as np #import numpy to work with arrays
import pandas as pd #import pandas to manipulate the dataset
from matplotlib import pyplot as plt #import the module matplotlib.pyplot to do visulization
from sklearn.preprocessing import PolynomialFeatures    # function to generate polynomial and interaction features
from sklearn.linear_model import LinearRegression, LogisticRegression, HuberRegressor
from sklearn.metrics import mean_squared_error, accuracy_score    # function to calculate mean squared error 

# Bunch of data preprocessing here
ratings = pd.read_csv('ratingsvol3.csv') # read in the personal ratings
ratings['Movie'] = ratings['Movie'].str.lower() # make all strings lowercase

val_ratings = pd.read_csv('ratings_validation.csv') # read in the personal ratings for validation set
val_ratings['Movie'] = val_ratings['Movie'].str.lower()

imdb_data = pd.read_csv('imdb_basics.tsv', sep = '\t') # read in tsv
imdb_ratings = pd.read_csv('imdb_ratings.tsv', sep = '\t')
imdb_data = imdb_data.merge(imdb_ratings, on = 'tconst') # merge with ratings
imdb_data['Movie'] = imdb_data['Movie'].str.lower()

imdb_data = imdb_data.loc[(imdb_data['titleType'] == 'movie')] # pick only movies, no tv-shows or shorts for example
imdb_data = imdb_data.loc[(imdb_data['numVotes'] > 1000)] # filter out very obscure movies with similiar names
imdb_data['genres'] = imdb_data['genres'].str.split(',').apply(lambda x: x[0]) # pick only the first genre out of the three in the dataset
imdb_data.genres = pd.Categorical(imdb_data.genres) # make genres into integers instead of strings so that they can be used in the ML model
imdb_data['genreInt'] = imdb_data.genres.cat.codes # add a column with the genres as integers

  imdb_data = pd.read_csv('imdb_basics.tsv', sep = '\t')


In [201]:
ratings['Release_year'] = ratings['Release_year'].apply(lambda x: float(x)) # make release year into floats to be compatible with the imdb dataset
imdb_data['Release_year'] = pd.to_numeric(imdb_data['Release_year'], errors = 'coerce') # make make into floats as well, they are strings for some reason

newDf = imdb_data.merge(ratings, how = 'inner', on = ['Movie']) # combine ratings and imdb data using the name of the movie
newDf = newDf.loc[newDf.Release_year_x == newDf.Release_year_y] # filter out movies with the wrong release year (there are three movies named the little mermaid for example)

valDf = imdb_data.merge(val_ratings, how = 'inner', on = ['Movie'])
valDf = valDf.loc[valDf.Release_year_x == valDf.Release_year_y]

newDf = newDf.drop(['genres','tconst', 'titleType', 'originalTitle', 'isAdult', 'endYear', 'numVotes', 'Release_year_y', 'Onniscore', 'Eeroscore', 'Mikkoscore', 'Samppascore'], axis = 1)
valDf = valDf.drop(['genres','tconst', 'titleType', 'originalTitle', 'isAdult', 'endYear', 'numVotes', 'Release_year_y'], axis = 1) # drop irrelevant data

In [209]:
# Standard huber regression using SciKit

X_train = newDf.drop(['Gussescore', 'Movie'], axis = 1).to_numpy().reshape(-1, 4)
X_train = pd.DataFrame(X_train).reset_index(drop=False) # Index was incorrect due to the filtering done before so we have to reset it

X_val = valDf.drop(['Gussescore', 'Movie'], axis = 1).to_numpy().reshape(-1, 4)
X_val = pd.DataFrame(X_val).reset_index(drop=False)

y = newDf['Gussescore'].to_numpy()

huber = HuberRegressor()
clf1 = huber.fit(X_train, y)
y_pred = pd.DataFrame(clf1.predict(X_train).reshape(-1,))
y_pred_val = pd.DataFrame(clf1.predict(X_val).reshape(-1,))

predDf = newDf.reset_index(drop=False).join(y_pred)
validDf = valDf.reset_index(drop=False).join(y_pred_val)

# Let's calculate the mean squared error for the training set and the validation set
print(f"Training error: {mean_squared_error(y,y_pred)}\nValidation error: {mean_squared_error(valDf['Gussescore'],y_pred_val)}")

validDf

Training error: 2.8663458316136485
Validation error: 1.5657365480516292




Unnamed: 0,index,Movie,Release_year_x,runtimeMinutes,averageRating,genreInt,Gussescore,0
0,2,godzilla,2014.0,123,6.4,0,3.5,4.48613
1,4,dunkirk,2017.0,106,7.8,0,7.75,7.617825
2,5,the shining,1980.0,146,8.4,8,9.0,9.054685
3,8,venom,2018.0,112,6.7,0,5.0,5.193276
4,9,ghostbusters,1984.0,105,7.8,0,8.0,7.765903
5,11,aliens,1986.0,137,8.4,0,8.5,8.843375
6,12,die hard,1988.0,132,8.3,0,9.25,8.652202
7,13,the nightmare before christmas,1993.0,76,8.0,3,9.0,8.448063
8,14,pulp fiction,1994.0,154,8.9,6,7.25,9.965231
9,16,hamilton,2020.0,160,8.4,4,8.0,8.671172
