# Project A14 KAGGLE - A MOVIE SCORE PREDICTION
## Predicting IMDb movie ratings from given parameters
Marten Mark, Karl Soosalu, Mark Alexander Helme


# Imports

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error

from keras.layers import Dense, BatchNormalization
from keras.models import Sequential
from keras.optimizers import SGD

# Preprocessing

In [None]:
# Load the raw movie table from the CSV file (path is relative to the
# notebook's working directory).
movies = pd.read_csv(filepath_or_buffer="IMDb movies.csv")

In [None]:
# Converting date_published into year, month and day

# Row 83917 is treated as an anomaly and overwritten with a bare year.
# The write goes through a single .iloc call on the frame itself: the chained
# form (movies["col"].iloc[i] = ...) may modify a temporary copy and silently
# leave `movies` unchanged (it is always a no-op under pandas Copy-on-Write).
movies.iloc[83917, movies.columns.get_loc("date_published")] = "2019"  # Replacing anomaly

# Vectorised split on "-": column 0 = year, 1 = month, 2 = day.
# Rows with fewer parts (e.g. the bare year written above) get 0 for the
# missing parts. The parts themselves are kept as strings, unchanged from the
# CSV text; use pd.to_numeric on them if a numeric dtype is required.
newCols = movies["date_published"].str.split("-", expand=True)
newCols = newCols.fillna(0)

movies["year_new"] = newCols[0]
movies["month"] = newCols[1]
movies["day"] = newCols[2]

In [None]:
# Discard identifier, free-text and other columns that are not used as features
# (the original date column is no longer needed once it has been split).
movies = movies.drop(
    columns=[
        "imdb_title_id",
        "title",
        "original_title",
        "year",
        "description",
        "budget",
        "usa_gross_income",
        "worlwide_gross_income",
        "metascore",
        "votes",
        "reviews_from_users",
        "reviews_from_critics",
        "date_published",
    ]
)

In [None]:
# Treat the literal string "None" in the language column as a missing value
movies["language"] = movies["language"].replace({"None": np.nan})

In [None]:
# One-hot encode the comma-separated multi-value columns: each distinct value
# becomes its own 0/1 indicator column and the original text column is removed.
# `axis` / `columns` are passed by keyword because pandas 2.0 no longer accepts
# them positionally in DataFrame.drop and pd.concat.
for column in ("language", "genre", "country"):
    indicators = movies[column].str.get_dummies(sep=", ")
    movies = pd.concat([movies.drop(columns=column), indicators], axis=1)

In [None]:
# Remove the remaining free-text columns, then display the resulting frame
movies = movies.drop(columns=["production_company", "director", "actors", "writer"])
movies

# Machine learning models

In [None]:
# Target is the average vote; every other column is used as a feature
y = movies["avg_vote"]
X = movies.drop(columns=["avg_vote"])

# Hold out 30% of the rows for testing; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y.values,
    test_size=0.3,
    random_state=0,
)

#### Linear regression

In [None]:
# Fit ordinary least squares on the training split
linear = LinearRegression()
linear.fit(X_train, y_train)

# Clamp predictions into [0, 10] (presumably the valid rating range)
pred_linear = np.clip(linear.predict(X_test), 0, 10)

mae_linear = mean_absolute_error(y_test, pred_linear)
print ("Linear regression mean absolute error (MAE):", round(mae_linear,4))

#### Lasso

In [None]:
# Fit a Lasso model with default settings on the training split
lasso = Lasso()
lasso.fit(X_train, y_train)

# Clamp predictions into [0, 10] (presumably the valid rating range)
pred_lasso = np.clip(lasso.predict(X_test), 0, 10)

mae_lasso = mean_absolute_error(y_test, pred_lasso)
print ("Lasso mean absolute error (MAE):", round(mae_lasso,4))

#### K-nearest neighbors regression

In [None]:
# Fit a k-nearest-neighbours regressor with default settings
neighbors = KNeighborsRegressor()
neighbors.fit(X_train, y_train)

# Clamp predictions into [0, 10] (presumably the valid rating range)
pred_neighbors = np.clip(neighbors.predict(X_test), 0, 10)

mae_neighbors = mean_absolute_error(y_test, pred_neighbors)
print ("Nearest neighbors mean absolute error (MAE):", round(mae_neighbors,4))

#### Ridge

In [None]:
# Fit a Ridge model with default settings on the training split
ridge = Ridge()
ridge.fit(X_train, y_train)

# Clamp predictions into [0, 10] (presumably the valid rating range)
pred_ridge = np.clip(ridge.predict(X_test), 0, 10)

mae_ridge = mean_absolute_error(y_test, pred_ridge)
print ("Ridge mean absolute error (MAE):", round(mae_ridge,4))

# Experimenting with neural networks

In [None]:
X = movies.drop("avg_vote", axis=1)
y = movies[["avg_vote"]]

# Split BEFORE scaling so that the scalers are fitted on training rows only;
# fitting them on the full data would leak test-set statistics into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Work on explicit copies so the column assignments below do not write into
# slices of X.
X_train = X_train.copy()
X_test = X_test.copy()

# Normalization of the non-indicator columns (min/max learned from the training split)
scaled_columns = ['duration', 'year_new', 'month', 'day']
scalerX = preprocessing.MinMaxScaler()
X_train[scaled_columns] = scalerX.fit_transform(X_train[scaled_columns])
X_test[scaled_columns] = scalerX.transform(X_test[scaled_columns])

# The target is scaled the same way; scalery is kept so that predictions can
# be mapped back to the original rating scale with inverse_transform.
scalery = preprocessing.MinMaxScaler()
y_train = scalery.fit_transform(y_train)
y_test = scalery.transform(y_test)

# Fully connected regression network: three hidden layers, each followed by
# batch normalisation, then two linear layers ending in a single output unit.
model = Sequential()
model.add(Dense(64, activation="tanh", input_shape=(X_train.shape[1],)))
model.add(BatchNormalization())
model.add(Dense(32, activation="tanh"))
model.add(BatchNormalization())
model.add(Dense(16, activation="selu"))
model.add(BatchNormalization())
model.add(Dense(7, activation="linear"))
model.add(Dense(1, activation="linear"))
model.compile(loss="mean_squared_error", optimizer="sgd")
model.summary()

In [None]:
# Train on the scaled training split; the scaled test split is used as
# validation data to monitor the loss after each epoch.
model.fit(X_train, y_train, epochs=4, validation_data=(X_test, y_test), batch_size=256, verbose=True)

# Map predictions and targets back to the original rating scale before
# computing the error. The un-scaled targets get their own name so that the
# scaled y_test stays intact: overwriting it would make a second run of this
# cell validate against, and inverse-transform, already un-scaled values.
y_pred = scalery.inverse_transform(model.predict(X_test))
y_test_unscaled = scalery.inverse_transform(y_test)

mae_neural = mean_absolute_error(y_test_unscaled, y_pred)
print ("Neural networks mean absolute error (MAE):", round(mae_neural,4))

#### Summary

In [None]:
#Graph
import matplotlib.pyplot as plt
import matplotlib

# Rounded MAE of every model evaluated above
dframe_linear=round(mae_linear,4)
dframe_lasso=round(mae_lasso,4)
dframe_neighbors=round(mae_neighbors,4)
dframe_ridge=round(mae_ridge,4)
dframe_neural=round(mae_neural,4)

# The neural network result is included so the chart compares all five
# models; it was previously computed but left out of the plotted lists.
method_results=[dframe_ridge,dframe_linear,dframe_lasso,dframe_neighbors,dframe_neural]
method_names=["Ridge","Linear Regression","Lasso","K nearest neighbor","Neural network"]

plt.rcParams['figure.figsize'] = (10,6)
matplotlib.style.use('ggplot')
dframe=pd.DataFrame({'Method':method_names, 'Mean absolute error':method_results})

# Bars are ordered from lowest to highest error
ax = dframe.sort_values('Mean absolute error',ascending=True).plot.bar(x='Method',y='Mean absolute error',rot=0)
ax.set_ylabel('Mean absolute error')
ax.set_title('Mean absolute error by method');


In [None]:
import seaborn as sns

# Sorted copy of the data, ordered by release year (sort_values returns a new frame)
movies_copy = movies.sort_values(by='year_new')

# 2-D histogram of average vote against release year on a tall figure
fig, ax = plt.subplots(figsize=(10,20))
sns.histplot(data=movies_copy, x='avg_vote', y='year_new', ax=ax)
plt.show()

In [None]:
# Re-order the copied frame by running time
movies_copy = movies_copy.sort_values(by='duration')

# 2-D histogram of average vote against duration, restricted to the
# 40-200 range on the duration axis
fig, ax = plt.subplots(figsize=(10,20))
sns.histplot(data=movies_copy, x='avg_vote', y='duration', color='#e377c2', ax=ax)
ax.set_ylim(40, 200)
plt.show()