In [54]:
import numpy as np 
import pandas as pd 
import seaborn as sb 

from sklearn.model_selection import train_test_split, validation_curve  
from sklearn.linear_model import Ridge 
from sklearn import preprocessing 
from sklearn.metrics import r2_score 

import nbformat 
from IPython import get_ipython 


In [None]:
# Earlier approach, kept for reference:
# %run "../Data_Preprocessing/data_preprocess.ipynb"

# Read the preprocessing notebook and execute, in this kernel, only the code
# cells whose source mentions one of the two names below, so that
# `movies_test_data_cleaned` becomes available to the following cells.
with open("../Data_Preprocessing/data_preprocess_test.ipynb", "r", encoding="utf-8") as f:
    notebook1 = nbformat.read(f, as_version=4)

ipython = get_ipython()
if ipython is None:
    # get_ipython() returns None outside an IPython session; fail with a clear message.
    raise RuntimeError("This cell must be run inside an IPython/Jupyter kernel.")

for cell in notebook1.cells:
    if cell.cell_type != "code":
        continue
    if "movies_test_data_cleaned" in cell.source or "movie_colen_data" in cell.source:
        result = ipython.run_cell(cell.source, silent=True)
        # run_cell does not raise on failure; inspect the result so a broken
        # preprocessing cell is reported here instead of surfacing later as a
        # bare "Variable not found".
        if not result.success:
            print(f"Preprocessing cell failed:\n{cell.source}")

try:
    print("Movies Test Clean Data : ")
    print(movies_test_data_cleaned.head())      # type: ignore

except NameError as e:
    print(f"Variable not found: {e}")

In [None]:
# Display the cleaned test frame (expected to be defined by the notebook-loading cell above).
movies_test_data_cleaned      # type: ignore 

In [None]:
# Feature matrix: every column of the cleaned frame except the target 'Collection'.
X_multi = movies_test_data_cleaned.drop(columns=['Collection'])       # type: ignore
X_multi

In [None]:
# Target vector: the 'Collection' column on its own.
y_multi = movies_test_data_cleaned.loc[:, 'Collection']       # type: ignore
y_multi

Train - Test Split :

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_multi,
    y_multi,
    test_size=0.3,
    random_state=0,
)

print(f" X_train size : {X_train.shape} \n X_test size : {X_test.shape} \n y_train size : {y_train.shape} \n y_test size : {y_test.shape}")

Scaler fitting for X values (standardization) - 

In [73]:
# Fit the scaler on the training features only, then apply the same
# transformation to both the training and the test features.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)

X_train_s, X_test_s = (scaler.transform(features) for features in (X_train, X_test))

Ridge Regression - 

In [None]:
# Baseline Ridge model; alpha is the regularization strength (the lambda of the theory).
lin_model_r = Ridge(alpha=0.6).fit(X_train_s, y_train)

# Score the baseline on the held-out, scaled test features.
r2 = r2_score(
    y_test,
    lin_model_r.predict(X_test_s),
)
print(f"r2 score = {r2}")

In [None]:
# Candidate alphas: 100 logarithmically spaced values covering 10^-2 .. 10^8.
param_rg = np.logspace(start=-2, stop=8, num=100)
param_rg

In [None]:
# validation_curve refits Ridge for every alpha in param_rg and scores each fit
# with cross-validation behind the scenes (cv is left at the library default).
train_scores, test_scores = validation_curve(
    Ridge(),
    X_train_s,
    y_train,
    param_name="alpha",
    param_range=param_rg,
    scoring='r2',
)
print(f" Train score : {train_scores} \n Test score : {test_scores}")

In [None]:
# Average over axis 1 (the cross-validation folds): one mean score per candidate alpha.
train_mean = train_scores.mean(axis=1)
test_mean = test_scores.mean(axis=1)

print(f" Train mean : {train_mean} \n Test mean : {test_mean}")

In [None]:
# Highest mean cross-validation score across all candidate alphas.
max(test_mean) 

In [None]:
# Mean validation score against the natural log of alpha. The axes are labelled
# so the figure can be read on its own; set_axis_labels returns the same
# JointGrid, so the cell's displayed value is unchanged.
sb.jointplot(x=np.log(param_rg), y=test_mean).set_axis_labels(
    "ln(alpha)", "Mean cross-validated R^2"
)

In [None]:
# Locate the best alpha from the scores themselves instead of a hard-coded
# index, which goes stale as soon as the data or the split changes.
best_idx = int(np.argmax(test_mean))

print(np.where(test_mean == test_mean[best_idx]))
print(param_rg[best_idx])

Best case: refit Ridge with the alpha that gave the maximum mean validation score

In [None]:
# Refit Ridge with the alpha that has the highest mean validation score.
# The index is derived from test_mean rather than hard-coded, so this cell
# stays correct when the data, the split, or the alpha grid changes.
lin_model_r_best = Ridge(alpha=param_rg[int(np.argmax(test_mean))])
lin_model_r_best.fit(X_train_s, y_train)

# Report the fit on the training data and on the held-out test data.
r2 = r2_score(y_train, lin_model_r_best.predict(X_train_s))
print(f"r2 train score = {r2}")

r2 = r2_score(y_test, lin_model_r_best.predict(X_test_s))
print(f"r2 test score = {r2}")