In [1]:
# Importing Potentially Used Basic Dependencies
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter

# Import Data Preparation Dependencies for Algorithms
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Import Alternate Algorithm Dependencies
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn import gaussian_process as gp
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor

# Import Tree Regressors
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Relative path of the CSV holding the modelling dataset.
df_path = "ML_Static_Dataframe_full_alt.csv"
# Read the whole file into memory and preview the first rows.
data_df = pd.read_csv(df_path)
data_df.head()

Unnamed: 0,user_id,rating,name,year,runtime,genre_id,cast_id,movie_id
0,4,4.0,Toy Story,1995,81.0,1,31,1
1,4,4.0,Toy Story,1995,81.0,3,31,1
2,4,4.0,Toy Story,1995,81.0,4,31,1
3,4,4.0,Toy Story,1995,81.0,5,31,1
4,4,4.0,Toy Story,1995,81.0,9,31,1


In [3]:
# Run Only If Index Previously Included In CSV Being Read From
# data_df = data_df.drop(columns=["Unnamed: 0"])
# data_df.head()

In [4]:
# data_df["year"] = data_df["year"].astype(int)
# data_df["mg_id"] = data_df["mg_id"].astype(int)
# data_df["genre_id"] = data_df["genre_id"].astype(int)
# data_df["cast_id"] = data_df["cast_id"].astype(int)

# data_df.head()

In [5]:
# Non-null count per column; compare against len(data_df) to spot missing values.
data_df.count()

user_id     38412582
rating      38412582
name        38412582
year        38412582
runtime     38412582
genre_id    38412582
cast_id     38412582
movie_id    38412582
dtype: int64

# Machine Learning Model Creation and Testing

In [6]:
# Feature columns are listed explicitly instead of being derived with
# data_df.drop(...): a later cell adds a "Predicted Rating" column to data_df,
# and a drop-based selection would silently pick that column up as a feature
# (target leakage) if this cell were re-run afterwards.
feature_columns = ["year", "runtime", "genre_id", "cast_id", "movie_id"]
X = data_df[feature_columns]
# Target: the rating recorded in each row.
y = data_df["rating"]

In [7]:
# Preview the feature matrix to confirm which predictor columns it contains.
X.head()

Unnamed: 0,year,runtime,genre_id,cast_id,movie_id
0,1995,81.0,1,31,1
1,1995,81.0,3,31,1
2,1995,81.0,4,31,1
3,1995,81.0,5,31,1
4,1995,81.0,9,31,1


In [8]:
# Hold out a test split (train_test_split defaults to 25% test when no size is given).
# random_state is pinned so the split -- and therefore every score reported
# below -- is reproducible across kernel restarts; the value matches the
# random_state=0 already used for the estimators in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Scaling is left disabled: the models below are fit on the raw feature values.
# scaler = StandardScaler()
# X_scaler = scaler.fit(X_train)
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

# Regressor 1: Gaussian Process Regressor

In [9]:
# # Regressor 1: Gaussian Regressor
# model = GaussianProcessRegressor(random_state=0)
# model.fit(X_train_scaled, y_train)

# The Gaussian Process Regressor is impractical on this dataset: exact GP fitting builds an n × n kernel matrix, which is not feasible for ~38.4 million rows.

In [10]:
# model.score(X_test_scaled, y_test)

# Regressor 2a: Bagged Extra Tree Regressor

In [11]:
# Regressor 2a: bagging ensemble whose base estimator is an ExtraTreeRegressor.
# extra_tree itself is never fit directly; it is only handed to BaggingRegressor.
extra_tree = ExtraTreeRegressor(random_state=0)
model = BaggingRegressor(extra_tree, random_state=0)

In [12]:
# Fit the bagged extra-tree ensemble on the training split.
model.fit(X_train, y_train)

BaggingRegressor(base_estimator=ExtraTreeRegressor(random_state=0),
                 random_state=0)

In [13]:
# Score on the held-out split (a regressor's score() is the R^2 coefficient of determination).
model.score(X_test, y_test)

0.1898226470586416

In [14]:
# Predict for every row of X -- training and test rows alike, not only the held-out split.
extree_preds = model.predict(X)

# Regressor 2b: Decision Tree Regressor

In [15]:
# dcsn_tree is not fit here; a later cell passes it to BaggingRegressor as the base estimator.
dcsn_tree = DecisionTreeRegressor(random_state=0)
# Separate, identically configured tree that is fit and scored on its own in the next cells.
model = DecisionTreeRegressor(random_state=0)

In [16]:
# Fit the standalone decision tree on the training split.
model.fit(X_train, y_train)

DecisionTreeRegressor(random_state=0)

In [17]:
# Score on the held-out split (a regressor's score() is the R^2 coefficient of determination).
model.score(X_test, y_test)

0.18985347898845306

In [18]:
# Predict for every row of X -- training and test rows alike, not only the held-out split.
dcsntree_preds = model.predict(X)

In [19]:
# Attach the standalone decision tree's predictions to the source frame.
# NOTE: this mutates data_df in place, and the predictions cover all rows,
# including the rows the tree was trained on.
data_df["Predicted Rating"] = dcsntree_preds
data_df.head()

Unnamed: 0,user_id,rating,name,year,runtime,genre_id,cast_id,movie_id,Predicted Rating
0,4,4.0,Toy Story,1995,81.0,1,31,1,3.886837
1,4,4.0,Toy Story,1995,81.0,3,31,1,3.887614
2,4,4.0,Toy Story,1995,81.0,4,31,1,3.885183
3,4,4.0,Toy Story,1995,81.0,5,31,1,3.887428
4,4,4.0,Toy Story,1995,81.0,9,31,1,3.88386


In [20]:
# Write the frame (now including "Predicted Rating") to disk; index=False keeps the row index out of the file.
data_df.to_csv("full_df_prediction_dataframe.csv", index=False)

# Regressor 2c: Bagged Decision Tree Regressor

In [21]:
# Bagging ensemble whose base estimator is the dcsn_tree DecisionTreeRegressor created earlier.
model = BaggingRegressor(dcsn_tree, random_state=0)

In [22]:
# Fit the bagged decision-tree ensemble on the training split.
model.fit(X_train, y_train)

BaggingRegressor(base_estimator=DecisionTreeRegressor(random_state=0),
                 random_state=0)

In [23]:
# Score on the held-out split (a regressor's score() is the R^2 coefficient of determination).
model.score(X_test, y_test)

0.18982264126597248

In [24]:
# Predictions from the bagged decision-tree ensemble for every row of X (training and test rows alike).
ext_decis_preds = model.predict(X)

# Regressor 3: K-Nearest Neighbors

In [25]:
# # Regressor 3: K-Nearest Neighbors
# model = KNeighborsRegressor(n_neighbors=5,leaf_size=40, n_jobs=4)

In [26]:
# model.fit(X_train, y_train)

In [27]:
# model.score(X_test, y_test)

In [28]:
# knn_preds = model.predict(X)

# Regressor 4: Random Forest Regressor

In [29]:
# Random forest of 100 trees. n_jobs=-1 builds the trees in parallel on all
# available cores; random_state still fixes how each tree is constructed, so
# only the wall-clock time of fit/predict is affected.
model = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1)

In [None]:
# Fit the random forest on the training split.
model.fit(X_train, y_train)

In [None]:
# Score on the held-out split (a regressor's score() is the R^2 coefficient of determination).
model.score(X_test, y_test)

In [None]:
# Predict for every row of X -- training and test rows alike, not only the held-out split.
rf_preds = model.predict(X)

# Regressor 5: 