In [1]:
import numpy as np  
import pandas as pd 
import seaborn as sb 
import matplotlib.pyplot as plt 

import statsmodels.api as sm 
import statsmodels.discrete.discrete_model as smdm 

from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_auc_score, classification_report  
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis   

import nbformat 
from IPython import get_ipython 


In [None]:
# Alternative: %run "../Data_Preprocessing/data_preprocess.ipynb"
# Instead of that, the preprocessing notebook is parsed with nbformat and only the
# code cells that mention one of the required variable names are replayed in this kernel.

with open("../Data_Preprocessing/data_preprocess.ipynb", "r", encoding="utf-8") as f:
    notebook1 = nbformat.read(f, as_version=4)

ipython = get_ipython()

# A code cell is executed only when its source mentions one of these names.
required_names = ("movies_data_cleaned", "movie_colen_data")

for cell in notebook1.cells:
    # Markdown / raw cells are neither printed nor executed.
    if cell.cell_type != "code":
        continue

    # Every code cell is echoed, whether or not it is executed below.
    print(cell.source)

    if any(name in cell.source for name in required_names):
        ipython.run_cell(cell.source, silent=True)

# Preview the replayed DataFrame; if no replayed cell defined it, report the
# missing name instead of raising.
try:
    print("Movies Clean Data : ")
    print(movies_data_cleaned.head())   # type: ignore

except NameError as e:
    print(f"Variable not found: {e}")

In [None]:
# Display the cleaned movies DataFrame (expected to be defined by the replayed preprocessing cells).
movies_data_cleaned   # type: ignore 

Logistic Regression with Multiple Predictors 

In [None]:
# Predictor matrix: every column of the cleaned data except the target 'Start_Tech_Oscar'.
X = movies_data_cleaned.loc[:, movies_data_cleaned.columns != 'Start_Tech_Oscar']        # type: ignore 
X 
# df.loc[<rows>, <columns>] : ':' selects all rows; the boolean mask keeps each column whose name is not 'Start_Tech_Oscar'

In [None]:
# Target vector: the 'Start_Tech_Oscar' column of the cleaned data.
y = movies_data_cleaned['Start_Tech_Oscar']      # type: ignore 
y 

In [None]:
# Fit a multi-predictor logistic regression classifier with scikit-learn.
cls_logi_reg = LogisticRegression()
cls_logi_reg.fit(X, y)         # X holds the independent variables (predictors); y is the dependent variable (target)

logm_intercept = cls_logi_reg.intercept_     # beta_0 (intercept term)
logm_coeff = cls_logi_reg.coef_              # beta_1 .. beta_n, one coefficient per column of X

# The model is a logistic (not logarithmic) regression, so it is labelled as such.
print("Logistic classification model intercept = ", logm_intercept, "\nLogistic classification model coefficient = ", logm_coeff)

Logistic Regression with statsmodels 

In [None]:
# add_constant adds a constant column to X so that the statsmodels model below
# estimates an intercept (beta_0); statsmodels does not add an intercept on its own.
X_cons = sm.add_constant(X)
X_cons  

In [None]:
# Fit the statsmodels logistic regression on the predictors plus constant column and
# show its summary table. sm.Logit(y, X_cons) is an equivalent spelling of the same model.
logit = smdm.Logit(y, X_cons).fit() 
logit.summary() 

In [None]:
# Joint kernel-density plot of Budget against the Start_Tech_Oscar target.
# Columns are referenced by name and resolved through `data`.
sb.jointplot(data=movies_data_cleaned, x='Budget', y='Start_Tech_Oscar', kind='kde')     # type: ignore

                        Confusion Matrix :- 

In [None]:
# Hard class predictions on the same data the model was fitted on.
# For a binary model, predict() uses the default decision boundary: probability of the positive class > 0.5.
y_pred = cls_logi_reg.predict(X)
y_pred 

In [None]:
# Confusion matrix for the default-threshold predictions: rows are true classes, columns are predicted classes.
conf_mtx = confusion_matrix(y, y_pred) 
conf_mtx 

In [None]:
# Predicted class probabilities, one column per class in cls_logi_reg.classes_ order.
# No threshold is applied here; thresholds are applied to these probabilities in later cells.
X_pred = cls_logi_reg.predict_proba(X)
X_pred 

In [None]:
# Threshold the predicted probability of the positive class at 0.3.
# predict_proba orders its columns like cls_logi_reg.classes_, so column 1 is the class
# with the greater label (the positive class, assuming Start_Tech_Oscar is coded 0/1 - confirm).
# The column is fixed rather than read with input(): entering 0 would threshold the
# negative-class probability and invert the predictions, and an interactive prompt
# makes the result non-reproducible and blocks "Run All".
col_3 = 1

y_pred_3 = X_pred[:, col_3] >= 0.3          # boundary condition : P(positive class) >= 0.3
y_pred_3

In [None]:
# Confusion matrix of the true labels against the 0.3-threshold predictions (y_pred_3).
conf_mtx_3 = confusion_matrix(y, y_pred_3)  
conf_mtx_3  

In [None]:
# Threshold the predicted probability of the positive class at 0.7.
# predict_proba orders its columns like cls_logi_reg.classes_, so column 1 is the class
# with the greater label (the positive class, assuming Start_Tech_Oscar is coded 0/1 - confirm).
# The column is fixed rather than read with input(): entering 0 would threshold the
# negative-class probability and invert the predictions, and an interactive prompt
# makes the result non-reproducible and blocks "Run All".
col_7 = 1

y_pred_7 = X_pred[:, col_7] >= 0.7          # boundary condition : P(positive class) >= 0.7
y_pred_7

In [None]:
# Confusion matrix of the true labels against the 0.7-threshold predictions (y_pred_7).
conf_mtx_7 = confusion_matrix(y, y_pred_7)  
conf_mtx_7  

                    Performance Metrics :- 

In [None]:
# Precision of the default-threshold logistic predictions (y_pred) against the true labels.
prcn_sc = precision_score(y, y_pred)    
prcn_sc 

# Precision score = [ true_positive / (true_positive + false_positive) ] 

In [None]:
# Recall of the default-threshold logistic predictions (y_pred) against the true labels.
rcl_sc = recall_score(y, y_pred) 
rcl_sc 

# Recall score OR Sensitivity = [ true_positive / (true_positive + false_negative) ]

In [None]:
# ROC AUC is a ranking metric, so it is computed from the continuous probability of
# the positive class (column 1 of predict_proba, i.e. classes_[1]). Passing the hard
# labels from predict() would collapse the ROC curve to a single threshold and
# misstate the area under it.
roc_sc = roc_auc_score(y, cls_logi_reg.predict_proba(X)[:, 1])
roc_sc

# Receiver Operating Characteristic curve :
# X-axis = false positive rate      ,     Y-axis = true positive rate
# The further the curve lies above the diagonal (AUC closer to 1), the better the model separates the classes.

                Linear Discriminant Analysis :- 

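Linear Discriminant Analysis (LDA) models the predictors of each class as normally distributed with a shared covariance matrix and uses Bayes' theorem to assign each observation to the class with the highest posterior probability. It works with two or more classes and with multiple predictor variables.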

In [None]:
# Fit the LDA classifier on the same predictors and target as the logistic model;
# fit() returns the fitted estimator, so construction and fitting are chained.
cls_lda = LinearDiscriminantAnalysis().fit(X, y)

# Fitted intercept and coefficients of the linear decision function.
intcp, coef = cls_lda.intercept_, cls_lda.coef_

print(f" Linear Discriminant Analysis model intercept = {intcp} \n Linear Discriminant Analysis model coefficient = {coef} ")

In [None]:
# Hard class predictions of the LDA model on the same data it was fitted on.
y_pred_lda = cls_lda.predict(X) 
y_pred_lda 

In [None]:
# Confusion matrix of the true labels against the LDA predictions.
conf_mtx_lda = confusion_matrix(y, y_pred_lda) 
conf_mtx_lda 

Comparison of the Logistic Regression and LDA models using their classification reports and confusion matrices:

In [None]:
# Print one classification report per set of predictions, in a fixed order:
# logistic regression at thresholds 0.5, 0.7 and 0.3, then LDA.
report_inputs = [
    ("Logistic model report (Boundary condition >= 0.5)", y_pred),
    ("Logistic model report (Boundary condition >= 0.7)", y_pred_7),
    ("Logistic model report (Boundary condition >= 0.3)", y_pred_3),
    ("Linear Discriminant Analysis model report", y_pred_lda),
]

for report_title, predicted_labels in report_inputs:
    print(f"{report_title} : \n {classification_report(y, predicted_labels)} \n")

In [None]:
# 2x2 grid of confusion-matrix heatmaps: logistic regression at three thresholds plus LDA.
fig, axes = plt.subplots(2, 2, figsize = (12, 8))

# (matrix, colour map, title) for each panel, in row-major order of the grid.
# The last panel uses conf_mtx_lda; plotting conf_mtx there would just repeat
# the 0.5-threshold logistic matrix under the LDA title.
heatmap_panels = [
    (conf_mtx, "Blues", " Confusion Matrix (Boundary condition >= 0.5) "),
    (conf_mtx_3, "Greens", " Confusion Matrix (Boundary condition >= 0.3) "),
    (conf_mtx_7, "Greys", " Confusion Matrix (Boundary condition >= 0.7) "),
    (conf_mtx_lda, "Oranges", " Confusion Matrix (LDA) "),
]

# axes.flat walks the grid as [0, 0], [0, 1], [1, 0], [1, 1].
for ax, (matrix, cmap, title) in zip(axes.flat, heatmap_panels):
    sb.heatmap(matrix, annot=True, fmt="d", cmap=cmap, ax=ax)
    ax.set_title(title)
    # confusion_matrix puts true classes on rows and predicted classes on columns.
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")

plt.tight_layout()
plt.show()