In [1]:
# Importing Potentially Used Basic Dependencies
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter

# Import Data Preparation Dependencies for Algorithms
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Import Alternate Algorithm Dependencies
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn import gaussian_process as gp
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor

# Import Tree Regressors
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import BaggingRegressor

In [2]:
# Read the static ratings dataset from the working directory and preview it.
df_path = "ML_Static_Dataframe.csv"
data_df = pd.read_csv(filepath_or_buffer=df_path)
data_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,user_id,rating,name,year,mg_id,genre_id,movie_id
0,0,4,4.0,Toy Story,1995.0,0.0,1.0,1
1,1,4,4.0,Toy Story,1995.0,1.0,3.0,1
2,2,4,4.0,Toy Story,1995.0,2.0,4.0,1
3,3,4,4.0,Toy Story,1995.0,3.0,5.0,1
4,4,4,4.0,Toy Story,1995.0,4.0,9.0,1


In [3]:
# Remove the leftover "Unnamed: 0" column, which is present only when the CSV
# being read was saved with its index included.
# errors="ignore" makes the drop a no-op when the column is absent, so this
# cell is safe to run on either kind of CSV and safe to re-run.
data_df = data_df.drop(columns=["Unnamed: 0"], errors="ignore")
data_df.head()

Unnamed: 0,user_id,rating,name,year,mg_id,genre_id,movie_id
0,4,4.0,Toy Story,1995.0,0.0,1.0,1
1,4,4.0,Toy Story,1995.0,1.0,3.0,1
2,4,4.0,Toy Story,1995.0,2.0,4.0,1
3,4,4.0,Toy Story,1995.0,3.0,5.0,1
4,4,4.0,Toy Story,1995.0,4.0,9.0,1


In [4]:
# These columns are read from the CSV as floats (e.g. 1995.0); cast each one
# to int, one column at a time.
for column in ("year", "mg_id", "genre_id"):
    data_df[column] = data_df[column].astype(int)

data_df.head()

Unnamed: 0,user_id,rating,name,year,mg_id,genre_id,movie_id
0,4,4.0,Toy Story,1995,0,1,1
1,4,4.0,Toy Story,1995,1,3,1
2,4,4.0,Toy Story,1995,2,4,1
3,4,4.0,Toy Story,1995,3,5,1
4,4,4.0,Toy Story,1995,4,9,1


In [5]:
# Non-null entries per column; equal counts across columns mean no column has gaps.
data_df.count(axis="index")

user_id     93749288
rating      93749288
name        93749288
year        93749288
mg_id       93749288
genre_id    93749288
movie_id    93749288
dtype: int64

# Machine Learning Model Creation and Testing

In [6]:
# Select the model inputs by name instead of dropping the non-feature columns.
# A later cell adds a "Predicted Rating" column to `data_df`; with a drop-based
# selection, re-running this cell afterwards would feed those predictions back
# in as a feature.  The column order matches what the original drop produced.
feature_columns = ["year", "mg_id", "genre_id", "movie_id"]
X = data_df[feature_columns]

# Regression target.
y = data_df["rating"]

In [7]:
# Split into train and test partitions (scikit-learn's default holds out 25%).
# The split is seeded so the scores reported below are reproducible after a
# kernel restart, matching the random_state=0 used by the regressors.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Feature scaling is currently disabled, so the models below are fit on the raw
# columns.  Uncomment to build scaled copies (the scaler is fit on the training
# partition only, then applied to both partitions).
# scaler = StandardScaler()
# X_scaler = scaler.fit(X_train)
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

# Regressor 1: Gaussian Process Regressor

In [8]:
# # Regressor 1: Gaussian Regressor
# model = GaussianProcessRegressor(random_state=0)
# model.fit(X_train_scaled, y_train)

# Gaussian Process Regressor Is Infeasible On This Dataset: exact GP fitting needs O(n²) memory and O(n³) time, and the data has ~93.7 million rows

In [None]:
# model.score(X_test_scaled, y_test)

# Regressor 2a: Extra Tree Regressor

In [None]:
# Regressor 2a: bagging ensemble whose base learner is a single extra tree.
# Both the base learner and the ensemble are given a fixed seed.
extra_tree = ExtraTreeRegressor(random_state=0)
model = BaggingRegressor(
    extra_tree,
    random_state=0,
)

In [None]:
# Train the bagged extra-tree ensemble on the training partition.
model.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted ensemble on the held-out partition (R² for scikit-learn regressors).
model.score(X=X_test, y=y_test)

In [None]:
# Predict a rating for every row of X (training and test rows alike).
extree_preds = model.predict(X=X)

# Regressor 2b: Decision Tree Regressor

In [9]:
# Regressor 2b: a single decision tree, fit directly under the name `model`.
model = DecisionTreeRegressor(random_state=0)

# A separate instance with the same seed; the 2c section passes it to
# BaggingRegressor as the base learner.
dcsn_tree = DecisionTreeRegressor(random_state=0)

In [10]:
# Train the decision tree on the training partition.
model.fit(X=X_train, y=y_train)

DecisionTreeRegressor(random_state=0)

In [11]:
# Score the fitted tree on the held-out partition (R² for scikit-learn regressors).
model.score(X=X_test, y=y_test)

0.1863399816012008

In [12]:
# Predict a rating for every row of X (training and test rows alike).
dcsntree_preds = model.predict(X=X)

In [13]:
# Attach the decision-tree predictions (one per row of X) to the frame as a
# new column, then preview the result.
prediction_column = "Predicted Rating"
data_df[prediction_column] = dcsntree_preds
data_df.head()

Unnamed: 0,user_id,rating,name,year,mg_id,genre_id,movie_id,Predicted Rating
0,4,4.0,Toy Story,1995,0,1,1,3.889785
1,4,4.0,Toy Story,1995,1,3,1,3.885614
2,4,4.0,Toy Story,1995,2,4,1,3.88414
3,4,4.0,Toy Story,1995,3,5,1,3.886414
4,4,4.0,Toy Story,1995,4,9,1,3.887228


In [15]:
# Save the frame with its prediction column; index=False keeps the row index
# out of the written file.
data_df.to_csv(path_or_buf="preliminary_prediction_dataframe.csv", index=False)

# Regressor 2c: Bagged Decision Tree Regressor

In [None]:
# Bagging ensemble that uses the previously created `dcsn_tree` as its base learner.
model = BaggingRegressor(
    dcsn_tree,
    random_state=0,
)

In [None]:
# Train the bagged decision-tree ensemble on the training partition.
model.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted ensemble on the held-out partition (R² for scikit-learn regressors).
model.score(X=X_test, y=y_test)

In [None]:
# Predict a rating for every row of X (training and test rows alike).
ext_decis_preds = model.predict(X=X)

# Regressor 3: K-Nearest Neighbors

In [None]:
# Regressor 3: K-Nearest Neighbors
# Averages the 5 nearest training rows; the search runs on 4 parallel jobs.
model = KNeighborsRegressor(
    n_neighbors=5,
    leaf_size=40,
    n_jobs=4,
)

In [None]:
# Fit the K-nearest-neighbors regressor on the training partition.
model.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted regressor on the held-out partition (R² for scikit-learn regressors).
model.score(X=X_test, y=y_test)

In [None]:
# Predict a rating for every row of X (training and test rows alike).
knn_preds = model.predict(X=X)