In [1]:
# Initial imports.
import numpy as np
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
# Loading data
file_path = Path("Resources/Movie_Data.csv")
movies_df = pd.read_csv(file_path)
movies_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
3,1,665,5.0,1147878820,Little Buddha (1993),Drama
4,1,899,3.5,1147868510,Singin' in the Rain (1952),Comedy|Musical|Romance


In [3]:
movies_df.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
title         object
genres        object
dtype: object

In [4]:
# Preprocess the data

# Drop the null columns where all values are null
movies_df = movies_df.dropna(axis='columns', how='all')

# Drop the null rows
movies_df = movies_df.dropna()

# Drop title column
movies_df = movies_df.drop(['title'], axis=1)



movies_df


Unnamed: 0,userId,movieId,rating,timestamp,genres
0,1,296,5.0,1147880044,Comedy|Crime|Drama|Thriller
1,1,306,3.5,1147868817,Drama
2,1,307,5.0,1147868828,Drama
3,1,665,5.0,1147878820,Drama
4,1,899,3.5,1147868510,Comedy|Musical|Romance
5,1,1088,4.0,1147868495,Drama|Musical|Romance


In [5]:
movies_df.select_dtypes(include=['object']).columns

Index(['genres'], dtype='object')

In [6]:
movies_df = pd.get_dummies(movies_df, columns = ['genres'])
movies_df.head()

# instead of using get_dummies, i need to bucket the genres differently. 

Unnamed: 0,userId,movieId,rating,timestamp,genres_Comedy|Crime|Drama|Thriller,genres_Comedy|Musical|Romance,genres_Drama,genres_Drama|Musical|Romance
0,1,296,5.0,1147880044,1,0,0,0
1,1,306,3.5,1147868817,0,0,1,0
2,1,307,5.0,1147868828,0,0,1,0
3,1,665,5.0,1147878820,0,0,1,0
4,1,899,3.5,1147868510,0,1,0,0


In [7]:
X = movies_df


y = movies_df[['rating']]

In [8]:
X.describe()

Unnamed: 0,userId,movieId,rating,timestamp,genres_Comedy|Crime|Drama|Thriller,genres_Comedy|Musical|Romance,genres_Drama,genres_Drama|Musical|Romance
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,1.0,593.5,4.333333,1147872000.0,0.166667,0.166667,0.5,0.166667
std,0.0,345.316521,0.752773,5576.639,0.408248,0.408248,0.547723,0.408248
min,1.0,296.0,3.5,1147868000.0,0.0,0.0,0.0,0.0
25%,1.0,306.25,3.625,1147869000.0,0.0,0.0,0.0,0.0
50%,1.0,486.0,4.5,1147869000.0,0.0,0.0,0.5,0.0
75%,1.0,840.5,5.0,1147876000.0,0.0,0.0,1.0,0.0
max,1.0,1088.0,5.0,1147880000.0,1.0,1.0,1.0,1.0


In [9]:
y['rating'].value_counts()

5.0    3
3.5    2
4.0    1
Name: rating, dtype: int64

In [10]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [11]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
# Create a random forest Regressor
k = 5
rf_model = RandomForestRegressor(n_estimators=128, random_state=78, max_depth = k) 

In [13]:
# Fitting the model
# y_train.shape
# rf_model = rf_model.fit(X_train_scaled, y_train)
rf_model = rf_model.fit(X_train_scaled, y_train.values.ravel())
# np.ravel(y_train)
# y_train.values.ravel().shape

In [14]:
# Making predictions using the testing data.

predictions = rf_model.predict(X_test_scaled)

In [18]:
print('MAE: ', mean_absolute_error(y_test, predictions))
print('MSE: ', mean_squared_error(y_test, predictions)) 

MAE:  0.33203125
MSE:  0.123077392578125


In [23]:
# train_scores = []
# random_forest_models = []

# train_scores.append(rf_model.score(X_train_scaled, y_train))
rf_model.score(X_train_scaled, y_train)

# random_forest_models.append(rf_model)

0.855712890625

In [20]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.        , 0.16144646, 0.35317342, 0.34194007, 0.07232299,
       0.        , 0.01437209, 0.05674497])

In [21]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.3531734232001852, 'rating'),
 (0.341940066739353, 'timestamp'),
 (0.1614464598407506, 'movieId'),
 (0.07232299203753267, 'genres_Comedy|Crime|Drama|Thriller'),
 (0.056744969769055395, 'genres_Drama|Musical|Romance'),
 (0.014372088413123183, 'genres_Drama'),
 (0.0, 'userId'),
 (0.0, 'genres_Comedy|Musical|Romance')]

In [None]:
#  note:  handle genres differently or is it ok? The above indicates that movie that falls in multiple genres have a higher importance.
