In [16]:
import numpy as np  
import pandas as pd 
import seaborn as sb 

import statsmodels.api as sm 
import statsmodels.discrete.discrete_model as smdm 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_auc_score   

import nbformat 
from IPython import get_ipython 


In [None]:
# Bring `hp_cleaned` into this kernel by executing selected cells of the
# preprocessing notebook. Unlike
#   %run "../Data_Preprocessing/data_preprocess.ipynb"
# only the code cells whose source mentions "hp_cleaned" or "hp_d" are run.

with open("../Data_Preprocessing/data_preprocess.ipynb", "r", encoding="utf-8") as f:
    notebook1 = nbformat.read(f, as_version=4)

ipython = get_ipython()

for cell in notebook1.cells:
    if cell.cell_type != "code":
        continue
    if "hp_cleaned" in cell.source or "hp_d" in cell.source:
        # run_cell() does not raise when the cell fails; it reports the outcome
        # on the returned ExecutionResult, so check it explicitly.
        result = ipython.run_cell(cell.source, silent=True)
        if not result.success:
            print("Preprocessing cell failed:\n" + cell.source)

# Confirm the expected variable now exists in this namespace.
try:
    print("\nHouse Price Cleaned Data : ")
    print(hp_cleaned.head())   # type: ignore

except NameError as e:
    print(f"Variable not found: {e}")

In [None]:
# Display the cleaned house-price frame (expected to have been defined by the
# preprocessing cells executed above).
hp_cleaned  # type: ignore 

Logistic Regression with Multiple Predictors 

In [None]:
# Feature matrix: ':' keeps every row, and the boolean column mask keeps every
# column except the target 'Sold'.
X = hp_cleaned.loc[:, hp_cleaned.columns != 'Sold']        # type: ignore 
X 
# General form: df.loc[<row selector>, <column selector>]  ----->>  a bare ':' means "all" on that axis

In [None]:
# Target vector: all rows of the 'Sold' column.
y = hp_cleaned.loc[:, 'Sold']       # type: ignore
y

In [None]:
# Fit scikit-learn's logistic regression on all rows of the data.
cls_logi_reg = LogisticRegression()
cls_logi_reg.fit(X, y)         # X = independent variables (predictors), y = dependent variable (target)

logm_intercept = cls_logi_reg.intercept_     # beta_0 (intercept)
logm_coeff = cls_logi_reg.coef_              # beta_1 .. beta_n, one coefficient per column of X
print("Logistic classification model intercept = ", logm_intercept, "\nLogistic classification model coefficient = ", logm_coeff)

Logistic Regression with statsmodels 

In [None]:
# statsmodels does not add an intercept on its own: add_constant() adds a column
# of ones to X so that the Logit fit below can estimate beta_0.
X_cons = sm.add_constant(X)
X_cons  

In [None]:
# Fit the statsmodels logit model on the constant-augmented predictors and show
# the fit summary. (The same class is also reachable as smdm.Logit.)
logit = sm.Logit(endog=y, exog=X_cons).fit()
logit.summary()

In [None]:
# Joint KDE of price against the Sold flag; columns are looked up by name in hp_cleaned.
sb.jointplot(data=hp_cleaned, x='price', y='Sold', kind='kde')     # type: ignore

                Confusion Matrix :- 

In [None]:
# predict() returns hard class labels for the rows the model was fitted on; for a
# binary model this corresponds to a 0.5 cut-off on the predicted probability.
y_pred = cls_logi_reg.predict(X)
y_pred 

In [None]:
# Confusion matrix at the default cut-off: rows are true classes, columns are predicted classes.
conf_mtx = confusion_matrix(y_true=y, y_pred=y_pred)
conf_mtx

In [None]:
# predict_proba() returns class probabilities, one column per class (ordered as in
# cls_logi_reg.classes_); no threshold is applied here. Despite its name, X_pred
# holds predicted probabilities, not predictors.
X_pred = cls_logi_reg.predict_proba(X)
X_pred 

In [None]:
# Column of X_pred to threshold. predict_proba() orders its columns like
# cls_logi_reg.classes_, so column 1 is the probability of the positive class
# (assumes 'Sold' is encoded 0/1 -- confirm). The column is fixed rather than read
# with input() so the notebook runs unattended and reproducibly; choosing
# column 0 would threshold P(negative class) and invert the predictions.
col_3 = 1

y_pred_3 = X_pred[:, col_3] >= 0.3          # custom decision threshold: P(positive) >= 0.3
y_pred_3

In [None]:
# Confusion matrix for the 0.3 cut-off (rebinds conf_mtx): rows are true classes, columns are predicted.
conf_mtx = confusion_matrix(y_true=y, y_pred=y_pred_3)
conf_mtx

In [None]:
# Column of X_pred to threshold. predict_proba() orders its columns like
# cls_logi_reg.classes_, so column 1 is the probability of the positive class
# (assumes 'Sold' is encoded 0/1 -- confirm). The column is fixed rather than read
# with input() so the notebook runs unattended and reproducibly; choosing
# column 0 would threshold P(negative class) and invert the predictions.
col_7 = 1

y_pred_7 = X_pred[:, col_7] >= 0.7          # custom decision threshold: P(positive) >= 0.7
y_pred_7

In [None]:
# Confusion matrix for the 0.7 cut-off (rebinds conf_mtx): rows are true classes, columns are predicted.
conf_mtx = confusion_matrix(y_true=y, y_pred=y_pred_7)
conf_mtx

                    Performance Metrics :- 

In [None]:
# Precision = true_positive / (true_positive + false_positive),
# computed for the default-threshold predictions.
prcn_sc = precision_score(y_true=y, y_pred=y_pred)
prcn_sc

In [None]:
# Recall (sensitivity) = true_positive / (true_positive + false_negative),
# computed for the default-threshold predictions.
rcl_sc = recall_score(y_true=y, y_pred=y_pred)
rcl_sc

In [None]:
# ROC AUC must be computed from a continuous score, not from hard class labels:
# with 0/1 predictions the "curve" collapses to a single threshold point.
# Use the predicted probability of the positive class (column 1 of predict_proba,
# i.e. cls_logi_reg.classes_[1]).
roc_sc = roc_auc_score(y, cls_logi_reg.predict_proba(X)[:, 1])
roc_sc

# Receiver Operating Characteristic (ROC) curve :
# X-axis = false positive rate      ,     Y-axis = true positive rate   (swept over all thresholds)
# The further the curve bows above the diagonal (AUC -> 1), the better the classifier;
# the diagonal itself (AUC = 0.5) is chance level.