### <u> **Libraries Used** </u>

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_validate
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, precision_score, precision_recall_curve, recall_score, f1_score, roc_curve, confusion_matrix, classification_report
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

# Disable pandas' row/column truncation so printed frames and Series are shown in full.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

### **<u>Reading Train and Test Dataset and Seeing missing proportion of each feature</u>**

In [43]:
# Load the application train/test tables and report, per training column,
# the percentage of rows whose value is missing.
train_csv_path = "/Applications/CISC251/home-credit-default-risk (1)/application_train.csv"
test_csv_path = "/Applications/CISC251/home-credit-default-risk (1)/application_test.csv"
df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

null_counts = df_train.isnull().sum()
missing_percent = null_counts / df_train.shape[0] * 100
print(missing_percent)

KeyboardInterrupt: 

### <u>**Imputation Techniques --> Preprocessing**</u>
- We have many missing values in the dataset.
- To create the most informative dataset, we want to retain as much data as possible.
- Since we don’t know which features are truly important and may perform feature engineering later, we decided to keep all features rather than dropping them.
- Experiments showed that dropping features with >60% missing values led to a significant drop in model performance compared to keeping them.
- This suggests that some features, even though they have many missing values, contain critical information for predicting Default vs Non-Default.

- **SimpleImputer**:
    - Works well when a feature has most of its values present (e.g., >95%).
    - Filling missing values with the mean (or median) gives relatively accurate results for such features.
###
- **IterativeImputer**:
    - Predicts missing values for each feature using other features, taking into account their relationships.
    - Provides more accurate imputation, especially when features are correlated.
    - Caution: It is computationally expensive, so consider runtime when using large datasets. 
#
- **Categorical Multiclass Features**:
    - Create a missing-value flag for each of these features, because the fact that a value is missing may itself be informative. The features themselves will be one-hot encoded later in the notebook.

**Recommendation**:
- Use SimpleImputer for features with few missing values.
- Use IterativeImputer for features where relationships between variables are important and computational cost is acceptable.

In [None]:
# Drop every feature whose share of missing values exceeds 60% and report what
# was dropped and what was kept.
# The original author's note says to ignore this step because it removes some
# important features.
# NOTE(review): the surrounding notes say all features are kept -- confirm
# whether this cell is meant to be run at all.
threshold = 60
exceeds_threshold = missing_percent > threshold
dropped_features = missing_percent.loc[exceeds_threshold].index

separator = "*****************************"
print(separator)
print("Dropped Features(60% missing values): ", dropped_features.tolist())
df_train = df_train.drop(dropped_features, axis=1)
print(separator)
print("Kept Features:", df_train.columns.tolist())
print(separator)


Dropped Features(60% missing values):  ['OWN_CAR_AGE', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'FLOORSMIN_AVG', 'LIVINGAPARTMENTS_AVG', 'NONLIVINGAPARTMENTS_AVG', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'FLOORSMIN_MODE', 'LIVINGAPARTMENTS_MODE', 'NONLIVINGAPARTMENTS_MODE', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'FLOORSMIN_MEDI', 'LIVINGAPARTMENTS_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'FONDKAPREMONT_MODE']

One of these dropped features is significantly important for predicting Default vs Non-Default. TODO: analyse the dropped features to determine which one is the pivotal feature.

In [None]:
### Univariate imputation for features with at most 5% (but more than 0%) missing values
from sklearn.impute import SimpleImputer

threshold = 5
has_few_missing = (missing_percent != 0) & (missing_percent <= threshold)
features_simple_impute = missing_percent[has_few_missing].index

# Median for numeric columns, most frequent value for object (categorical) columns.
simple_imputer = SimpleImputer(strategy="median")
simple_imputer_cat = SimpleImputer(strategy="most_frequent")

# NOTE(review): the imputers are fitted on the whole of df_train, i.e. before
# the train/validation/test split made later in the notebook.
for column in features_simple_impute:
    imputer = simple_imputer_cat if df_train[column].dtype == "object" else simple_imputer
    # fit_transform returns a 2-D (n_rows, 1) array; ravel flattens it to (n_rows,)
    df_train[column] = imputer.fit_transform(df_train[[column]]).ravel()

print("Features that have been Univariate Imputation with Median or Most Frequent: ", features_simple_impute.tolist())

In [None]:
# Multivariate imputation: every float64/int64 column that still has missing
# values is imputed with IterativeImputer.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

iterative_imputer = IterativeImputer(random_state = 0)

numeric_frame = df_train.select_dtypes(include=["float64", "int64"])
numeric_missing_columns = [name for name in numeric_frame.columns
                           if numeric_frame[name].isnull().sum() > 0]

# fit_transform returns a bare array, so re-attach the column names and the row index.
imputed_values = iterative_imputer.fit_transform(df_train[numeric_missing_columns])
df_train[numeric_missing_columns] = pd.DataFrame(imputed_values,
                                                 columns=numeric_missing_columns,
                                                 index=df_train.index)
print("Columns that have been Iteratively Imputed: ", numeric_missing_columns)

Columns that have been Iteratively Imputed:  ['OWN_CAR_AGE', 'EXT_SOURCE_1', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'TOTALAREA_MODE', 'AMT_REQ_CREDIT_BUREA

In [None]:
# Every column that still contains missing values gets a companion
# "<column>_missing_flag" indicator (1 = value was missing); object columns
# additionally have their missing entries replaced by the category "Missing".
# NOTE(review): the selection below is not restricted to categorical columns or
# to a 5% threshold -- it relies on the earlier imputation cells having already
# filled everything else.
null_counts = df_train.isnull().sum()
columns = df_train.columns[null_counts > 0]

for col in columns:
    # Record the flag before filling, otherwise the information is lost.
    df_train[col + "_missing_flag"] = df_train[col].isnull().astype(int)
    if df_train[col].dtype == "object":
        df_train[col] = df_train[col].fillna("Missing")

print("Categorical Columns that have been assigned a new class called Missing: ", columns)

### **<u>Check that all Columns in df_train have been imputed meaning no more missing values</u>**

In [None]:
# Sanity check: total number of missing cells left in df_train after imputation.
remaining_missing = df_train.isnull().sum().sum()
status_message = ("All Columns have no missing values" if remaining_missing == 0
                  else "Columns still have missing values")
print(status_message)

### **<u>Creating Train, Validation, Test Set for Base Model(KNN, LOGREG, FOREST MODELS)</u>**

In [None]:
# Separate features from the target, then build stratified train / validation /
# test splits: 30% test first, then 0.1/(1-0.3) of the remainder as validation
# (10% of the full data), leaving 60% for training.
X = df_train.drop(columns="TARGET")
y = df_train["TARGET"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_train, X_val, y_train, y_val= train_test_split(X_train, y_train, test_size = 0.1/(1-0.3), stratify=y_train, random_state=42)

# Work on explicit copies so the column assignments below never target a slice of X.
X_train, X_val, X_test = X_train.copy(), X_val.copy(), X_test.copy()

# Label-encode every object column. The encoder is fitted on the training split
# only; a category that appears only in validation/test rows would make
# LabelEncoder.transform raise, so such unseen categories are encoded as -1.
le = LabelEncoder()
for column in X_train.columns:
    if X_train[column].dtype == "object":
        le.fit(X_train[column])
        code_of = {label: code for code, label in enumerate(le.classes_)}
        X_train[column] = X_train[column].map(code_of)
        X_val[column] = X_val[column].map(code_of).fillna(-1).astype(int)
        X_test[column] = X_test[column].map(code_of).fillna(-1).astype(int)

#Run AutoGluon to find best models (optional, asked interactively)
# NOTE(review): df_test has not been through the imputation / missing-flag cells
# applied to df_train above -- confirm predict() accepts it as-is.
autoflag = input("Run Autogun:(Y/N): ")
if autoflag.strip().lower() == "y":
    predictor = TabularPredictor(label='TARGET').fit(train_data=df_train)
    predictions = predictor.predict(df_test)
    print(predictions)



### **<u>Initiation for SMOTE(Oversampling Method ) and StratifiedKFold Cross Validation</u>**

In [None]:
from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score, cross_validate
%pip install imblearn
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings("ignore")


# Shared 10-fold stratified splitter (shuffled, fixed seed) used by the cross_validate cells.
kfold = StratifiedKFold(shuffle=True, random_state=42, n_splits=10)
# SMOTE oversampler with a fixed seed.
sm = SMOTE(random_state=100)

### **<u>BASE KNN Model</u>**

In [None]:
#Standardise all feature columns (scaler fitted on the training split only) ---> KNN is distance based and needs this
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# From here on X_train / X_val / X_test are scaled NumPy arrays, not DataFrames.
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

knn_Classifier = KNeighborsClassifier()
print(X_train.shape[0], y_train.shape[0])
knn_Classifier.fit(X_train, y_train)  #Use default hyperparameters
y_pred_val = knn_Classifier.predict(X_val)
y_pred_train = knn_Classifier.predict(X_train)
y_pred_test = knn_Classifier.predict(X_test)

print("KNN Accuracy Train Score:", accuracy_score(y_train, y_pred_train))
print("KNN Precision Train Score: ", precision_score(y_train, y_pred_train))
print("KNN Recall Train Score: ", recall_score(y_train, y_pred_train))
print("KNN F1 Train Score: ", f1_score(y_train, y_pred_train))

print("KNN Accuracy Validation Score:", accuracy_score(y_val, y_pred_val))
print("KNN Precision Validation Score: ", precision_score(y_val, y_pred_val))
print("KNN Recall Validation Score: ", recall_score(y_val, y_pred_val))
print("KNN F1 Validation Score: ", f1_score(y_val, y_pred_val))


print("KNN Accuracy Test Score: ", accuracy_score(y_test, y_pred_test))
print("KNN Precision Test Score: ", precision_score(y_test, y_pred_test))
print("KNN Recall Test Score: ", recall_score(y_test, y_pred_test)) #A very low value means most actual defaults are missed
print("KNN F1 Test Score: ", f1_score(y_test, y_pred_test))

#ROC curves [Train and Test].
#roc_curve needs a continuous score: feeding it hard 0/1 predictions yields a
#single operating point, so the predicted probability of class 1 is used instead.
for split_name, y_true, X_split in (("Train", y_train, X_train), ("Test", y_test, X_test)):
    y_score = knn_Classifier.predict_proba(X_split)[:, 1]
    fpr, tpr, threshold = roc_curve(y_true, y_score)
    plt.figure(figsize = (10,8))
    plt.plot(fpr, tpr, label = "KNN Classifier")
    plt.plot([0,1],[0,1],linestyle="--",label="Random Model")
    plt.title(f"AUC Curve for {split_name} of KNN")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
# Cross-validate the KNN baseline on the training split with the shared kfold
# splitter and print mean / standard deviation of each metric across folds.
reported_metrics = ("train_accuracy", "train_precision", "train_recall",
                    "test_accuracy", "test_precision", "test_recall")
scores = cross_validate(knn_Classifier, X_train, y_train,
                        scoring=["accuracy", "precision", "recall"],
                        cv=kfold, return_train_score=True)
for metric in reported_metrics:
    fold_scores = scores[metric]
    mean_score = fold_scores.mean()
    print(f"Mean {metric}: ", mean_score)
    print("STD: ", fold_scores.std())

### **<u>Random Forest Tree Base Model</u>**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, auc


# Baseline random forest with default hyperparameters; the seed is fixed so
# that repeated runs give the same forest.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)


y_pred_val = rf_model.predict(X_val)
y_pred_train = rf_model.predict(X_train)
y_pred_test = rf_model.predict(X_test)

print("RF Accuracy Train Score:", accuracy_score(y_train, y_pred_train))
print("RF Precision Train Score: ", precision_score(y_train, y_pred_train))
print("RF Recall Train Score: ", recall_score(y_train, y_pred_train))
print("RF F1 Train Score: ", f1_score(y_train, y_pred_train))

print("RF Accuracy Validation Score:", accuracy_score(y_val, y_pred_val))
print("RF Precision Validation Score: ", precision_score(y_val, y_pred_val))
print("RF Recall Validation Score: ", recall_score(y_val, y_pred_val))
print("RF F1 Validation Score: ", f1_score(y_val, y_pred_val))


print("RF Accuracy Test Score: ", accuracy_score(y_test, y_pred_test))
print("RF Precision Test Score: ", precision_score(y_test, y_pred_test))
print("RF Recall Test Score: ", recall_score(y_test, y_pred_test)) #A very low value means most actual defaults are missed
print("RF F1 Test Score: ", f1_score(y_test, y_pred_test))

#ROC curves [Train, Validation and Test].
#roc_curve needs a continuous score: hard 0/1 predictions give a single
#operating point, so the predicted probability of class 1 is used instead.
y_score_train = rf_model.predict_proba(X_train)[:, 1]
y_score_val = rf_model.predict_proba(X_val)[:, 1]
y_score_test = rf_model.predict_proba(X_test)[:, 1]

plt.figure(figsize = (10,8))
fpr, tpr, threshold = roc_curve(y_train, y_score_train)
plt.plot(fpr, tpr, label = "RF Classifier")
plt.plot([0,1],[0,1],linestyle="--",label="Random Model")
plt.title("AUC Curve for Train of RF")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


fpr, tpr, threshold = roc_curve(y_val, y_score_val)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(10,8))
plt.plot(fpr, tpr, color="red", label = f"ROC CURVE for RF Validation with {roc_auc} score")
plt.plot([0,1], [0,1], linestyle="--", label=f"ROC CURVE Random model")
plt.legend(loc="lower right")
plt.title("RF ROC CURVE for Validation vs Random Model")
plt.xlabel('FPR')
plt.ylabel("TPR")
plt.grid(True)
plt.tight_layout()
plt.show()


plt.figure(figsize = (10,8))
fpr, tpr, threshold = roc_curve(y_test, y_score_test)
plt.plot(fpr, tpr, label = "RF Classifier")
plt.plot([0,1],[0,1],linestyle="--",label="Random Model")
plt.title("AUC Curve for Test of RF")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


# X still holds the original column names (X_train is a bare array by now).
# The previous '\:' was an invalid escape sequence and printed a stray backslash.
print('Feature Importance:\n', pd.Series(rf_model.feature_importances_, index = X.columns))


In [None]:
# Cross-validate the random forest baseline on the training split with the
# shared kfold splitter and print mean / standard deviation of each metric.
reported_metrics = ("train_accuracy", "train_precision", "train_recall",
                    "test_accuracy", "test_precision", "test_recall")
scores = cross_validate(rf_model, X_train, y_train,
                        scoring=["accuracy", "precision", "recall"],
                        cv=kfold, return_train_score=True)
for metric in reported_metrics:
    fold_scores = scores[metric]
    mean_score = fold_scores.mean()
    print(f"Mean {metric}: ", mean_score)
    print("STD: ", fold_scores.std())

### **<u>XGBoost Classifier Base Model</u>**

In [None]:
#XGBoost Classifier
%pip install xgboost
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")
# Baseline XGBoost classifier with default hyperparameters.
# ('ignore_coerce' is not an XGBClassifier parameter, so it has been removed.)
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)


y_pred_val = xgb_model.predict(X_val)
y_pred_train = xgb_model.predict(X_train)
y_pred_test = xgb_model.predict(X_test)

print("XGBModel Accuracy Train Score:", accuracy_score(y_train, y_pred_train))
print("XGBModel Precision Train Score: ", precision_score(y_train, y_pred_train))
print("XGBModel Recall Train Score: ", recall_score(y_train, y_pred_train))
print("XGBModel F1 Train Score: ", f1_score(y_train, y_pred_train))

print("XGB Accuracy Validation Score:", accuracy_score(y_val, y_pred_val))
print("XGB Precision Validation Score: ", precision_score(y_val, y_pred_val))
print("XGB  Recall Validation Score: ", recall_score(y_val, y_pred_val))
print("XGB F1 Validation Score: ", f1_score(y_val, y_pred_val))


print("XGB Accuracy Test Score: ", accuracy_score(y_test, y_pred_test))
print("XGB Precision Test Score: ", precision_score(y_test, y_pred_test))
print("XGB Recall Test Score: ", recall_score(y_test, y_pred_test)) 
print("XGB F1 Test Score: ", f1_score(y_test, y_pred_test))

#ROC curves [Train and Test].
#roc_curve needs a continuous score: hard 0/1 predictions give a single
#operating point, so the predicted probability of class 1 is used instead.
for split_name, y_true, X_split in (("Train", y_train, X_train), ("Test", y_test, X_test)):
    y_score = xgb_model.predict_proba(X_split)[:, 1]
    fpr, tpr, threshold = roc_curve(y_true, y_score)
    plt.figure(figsize = (10,8))
    plt.plot(fpr, tpr, label = "XGB Classifier")
    plt.plot([0,1],[0,1],linestyle="--",label="Random Model")
    plt.title(f"AUC Curve for {split_name} of XGB")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# Cross-validate the XGBoost baseline on the training split with the shared
# kfold splitter and print mean / standard deviation of each metric.
reported_metrics = ("train_accuracy", "train_precision", "train_recall",
                    "test_accuracy", "test_precision", "test_recall")
scores = cross_validate(xgb_model, X_train, y_train,
                        scoring=["accuracy", "precision", "recall"],
                        cv=kfold, return_train_score=True)
for metric in reported_metrics:
    fold_scores = scores[metric]
    mean_score = fold_scores.mean()
    print(f"Mean {metric}: ", mean_score)
    print("STD: ", fold_scores.std())

### **<u>Base Model Logistic Regression</u>**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, roc_auc_score, precision_score, accuracy_score, auc, f1_score, recall_score



# Baseline logistic regression. class_weight must be the *string* "balanced";
# the bare name `balanced` is undefined and raised a NameError.
log_model = LogisticRegression(random_state = 42, class_weight="balanced")
log_model.fit(X_train, y_train)


y_pred_val = log_model.predict(X_val)
y_pred_train = log_model.predict(X_train)
y_pred_test = log_model.predict(X_test)

print("Log Reg Accuracy Train Score:", accuracy_score(y_train, y_pred_train))
print("Log Reg Precision Train Score: ", precision_score(y_train, y_pred_train))
print("Log Reg Recall Train Score: ", recall_score(y_train, y_pred_train))
print("Log Reg F1 Train Score: ", f1_score(y_train, y_pred_train))

print("Log Reg Accuracy Validation Score:", accuracy_score(y_val, y_pred_val))
print("Log Reg Precision Validation Score: ", precision_score(y_val, y_pred_val))
print("Log Reg  Recall Validation Score: ", recall_score(y_val, y_pred_val))
print("Log Reg F1 Validation Score: ", f1_score(y_val, y_pred_val))


print("Log Reg Accuracy Test Score: ", accuracy_score(y_test, y_pred_test))
print("Log Reg Precision Test Score: ", precision_score(y_test, y_pred_test))
print("Log Reg Recall Test Score: ", recall_score(y_test, y_pred_test))
print("Log Reg F1 Test Score: ", f1_score(y_test, y_pred_test))

#ROC curves [Train, Validation and Test].
#roc_curve / roc_auc_score need a continuous score: hard 0/1 predictions give a
#single operating point, so the predicted probability of class 1 is used instead.
for split_name, y_true, X_split in (("Train", y_train, X_train), ("Validation", y_val, X_val)):
    y_score = log_model.predict_proba(X_split)[:, 1]
    plt.figure(figsize = (10,8))
    fpr, tpr, threshold = roc_curve(y_true, y_score)
    roc_score = roc_auc_score(y_true, y_score)
    plt.plot(fpr, tpr, label = f"Log Reg with ROC Score:{roc_score}")
    plt.plot([0,1],[0,1],linestyle="--",label="Random Model")
    plt.title(f"AUC Curve for {split_name} of LogReg")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

y_score = log_model.predict_proba(X_test)[:, 1]
plt.figure(figsize = (10,8))
fpr, tpr, threshold = roc_curve(y_test, y_score)
roc_score = roc_auc_score(y_test, y_score)
plt.plot(fpr, tpr, label = f"Log Reg with AUC score: {roc_score}")
plt.plot([0,1],[0,1],linestyle="--",label="Random Model")
# The title previously said "XGB" (copy-paste slip); this plot is the LogReg test curve.
plt.title("AUC Curve for Test of LogReg")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

#Original note: the ROC score was below 0.5 (worse than a random model) and
#inverting the predictions (1 - pred) did not lift it above 0.5, so Log Reg was
#not to be used. NOTE(review): that score was computed from hard labels --
#re-check the conclusion against the probability-based score above.

In [None]:
# Cross-validate the logistic regression baseline on the training split with
# the shared kfold splitter and print mean / standard deviation of each metric.
reported_metrics = ("train_accuracy", "train_precision", "train_recall",
                    "test_accuracy", "test_precision", "test_recall")
scores = cross_validate(log_model, X_train, y_train,
                        scoring=["accuracy", "precision", "recall"],
                        cv=kfold, return_train_score=True)
for metric in reported_metrics:
    fold_scores = scores[metric]
    mean_score = fold_scores.mean()
    print(f"Mean {metric}:", mean_score)
    print("STD:", fold_scores.std())

### **<u>Base Model LightGBM Classifier</u>**

In [None]:
%pip install --quiet lightgbm
import lightgbm 
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_curve, roc_auc_score, precision_score, accuracy_score, auc, f1_score, recall_score

#Keep this model --> tree based, can handle the data without SMOTE
lgb_model = LGBMClassifier(verbosity=-1) #Suppress the log messages
lgb_model.fit(X_train,y_train)
print("Parameters of Baseline Model:", lgb_model.get_params())


y_pred_train = lgb_model.predict(X_train)
y_pred_val = lgb_model.predict(X_val)
y_pred_test = lgb_model.predict(X_test)

print("Train Accuracy: ", accuracy_score(y_train, y_pred_train))
print("Train Precision: ", precision_score(y_train, y_pred_train))
print("Train Recall:", recall_score(y_train, y_pred_train))
print("F1 Score:", f1_score(y_train, y_pred_train))


print("Val Accuracy: ", accuracy_score(y_val, y_pred_val))
print("Val Precision: ", precision_score(y_val, y_pred_val))
print("Val Recall:", recall_score(y_val, y_pred_val))
print("Val F1 Score:", f1_score(y_val, y_pred_val))


print("Accuracy Test Score: ", accuracy_score(y_test, y_pred_test))
print("Precision Test Score: ", precision_score(y_test, y_pred_test))
print("Recall Test Score: ", recall_score(y_test, y_pred_test))
print("F1 Test Score: ", f1_score(y_test, y_pred_test))

#ROC curves [Train, Validation and Test].
#roc_curve / roc_auc_score need a continuous score: hard 0/1 predictions give a
#single operating point, so the predicted probability of class 1 is used instead.
for split_name, y_true, X_split in (("Train", y_train, X_train), ("Validation", y_val, X_val)):
    y_score = lgb_model.predict_proba(X_split)[:, 1]
    plt.figure(figsize = (10,8))
    fpr, tpr, threshold = roc_curve(y_true, y_score)
    roc_score = roc_auc_score(y_true, y_score)
    plt.plot(fpr, tpr, label = f"LGB with ROC Score:{roc_score}")
    plt.plot([0,1],[0,1],linestyle="--",label="Random Model")
    plt.title(f"AUC Curve for {split_name} of LGBClassifier")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

y_score = lgb_model.predict_proba(X_test)[:, 1]
plt.figure(figsize = (10,8))
fpr, tpr, threshold = roc_curve(y_test, y_score)
roc_score = roc_auc_score(y_test, y_score)
# The legend previously said "Log Reg" (copy-paste slip); this curve is the LightGBM model.
plt.plot(fpr, tpr, label = f"LGB with AUC score: {roc_score}")
plt.plot([0,1],[0,1],linestyle="--",label="Random Model")
plt.title("AUC Curve for Test of LGBClassifier")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()



In [None]:
# Cross-validate the LightGBM baseline on the training split with the shared
# kfold splitter and print mean / standard deviation of each metric.
reported_metrics = ("train_accuracy", "train_precision", "train_recall",
                    "test_accuracy", "test_precision", "test_recall")
scores = cross_validate(lgb_model, X_train, y_train,
                        scoring=["accuracy", "precision", "recall"],
                        cv=kfold, return_train_score=True)
for metric in reported_metrics:
    fold_scores = scores[metric]
    mean_score = fold_scores.mean()
    print(f"Mean {metric}: ", mean_score)
    print("STD: ", fold_scores.std())

### **<u>Improvement to Base Model ---> Label Encoding and One Hot Encoding to Encode Categorical Data</u>**


**Label Encoding**:
- Converts categorical features into ordinal integers.
- Useful for features with two categories (binary).
- Yes → 1, No → 0.

**One-Hot Encoding**:
- Creates a separate column for each unique category in a feature.
- Typically used when the feature is non-binary.

**Issues with One-Hot Encoding**:
- When a feature has multiple unique categories, the number of dimensions (features) can become extremely large like 100,000 which is R^(100,000) vector plane
- Each data point becomes a high-dimensional sparse vector, mostly zeros with a single one.

**Problems with High Dimension**
- 1. Memory usage and computation cost increase dramatically.
- 2. High-dimensional sparse data makes the model prone to overfitting. The model may memorize training data rather than learn meaningful patterns.
- 3. Because the vectors are almost unique, the model can separate the training points easily but struggles to generalize to unseen data such as our test set. In other words, the model memorizes the shape of the training set instead of learning meaningful patterns, so when it is applied to the test set it tries to fit patterns that are specific to the training data, and the two sets can differ substantially.

**Summary**:
- We use Label Encoding for Binary Features and One Hot Encoding for Multiclass features. The reason why at start at notebook we did Label Encoding for all was because we did not care much for the importance of features as a base line model is one that uses RAW data with no transformations or anything. This is to give us an indication on how well a Random Model would do with the data provided. We do not care much for data preprocessing as this is done usually when we want to improve a model

In [None]:
# Note: tree models don't need this step, as they can handle ordinal data.

# LabelEncoder for binary categorical features.
le = LabelEncoder()

# OneHotEncoder for categorical features with more than two classes.
# handle_unknown="ignore" encodes categories unseen during fit as all zeros.
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# df_train.columns is evaluated once, so rebinding df_train inside the loop
# does not disturb the iteration.
for column in df_train.columns:

    if df_train[column].dtype != "object":
        continue

    # Number of distinct categories in the column. The previous check used
    # value_counts().nunique(), which counts distinct *frequencies* and so
    # skipped multi-class columns whose categories share the same counts.
    n_categories = df_train[column].nunique()

    if n_categories == 2:
        le.fit(df_train[column]) #Fit on the train data only to prevent data leakage
        df_train[column] = le.transform(df_train[column])
        df_test[column] = le.transform(df_test[column])

    elif n_categories > 2:
        ohe.fit(df_train[[column]]) #Expects 2D [n_samples, n_features]; df_train[column] is a 1D Series
        #transform returns arrays that need converting to DataFrames
        train_encoded = ohe.transform(df_train[[column]])
        test_encoded = ohe.transform(df_test[[column]])
        encoded_cols = ohe.get_feature_names_out([column])
        #New DataFrames holding the encoded columns, aligned on the original row index
        df_train_encoded = pd.DataFrame(train_encoded, columns=encoded_cols, index = df_train.index)
        df_test_encoded = pd.DataFrame(test_encoded, columns = encoded_cols, index = df_test.index)

        #Replace the original column with its encoded columns
        df_train = pd.concat([df_train.drop(columns=column), df_train_encoded], axis = 1)
        df_test = pd.concat([df_test.drop(columns=column), df_test_encoded], axis = 1)


### <u> **Base Model Improvement 2 ----> SMOTE**</u>

**Issue**:
 - The model is overfitting due to class imbalance. The majority class (Non-default) makes up ~90% of the data, while the minority class (Default) is only ~10%. This causes the model to memorize patterns from the majority class and perform poorly on the minority class.

**Why it matters**:
- Accuracy is misleading for imbalanced datasets because the model can appear highly accurate by only predicting the majority class. With so few minority samples, the model cannot learn meaningful patterns for the rare class.

**Solution – SMOTE (Synthetic Minority Oversampling Technique):**
- SMOTE generates synthetic samples for the minority class by interpolating between existing minority class instances.
- This increases the representation of the minority class without losing any majority class data (unlike undersampling).
- By balancing the classes, the model can learn patterns from both classes, improving its performance on the minority class.


In [None]:
#SMOTE RETURNS Numpy Array 
%pip install imblearn
from imblearn.over_sampling import SMOTE
# Oversample the training split only; validation and test data are left untouched.
# Original note on the shape change: X_shape (121,150) to (229,150).
smote = SMOTE(random_state=100)
resampled_split = smote.fit_resample(X_train, y_train)
X_train_smote, y_train_smote = resampled_split

### <u> **Base Model Improvement 3 ----> PCA[Linear Models like Logistic Regression]**</u>

- Use PCA (dimensionality reduction technique) to help with issues caused by one-hot encoding. One-hot encoding often creates many sparse and correlated features, which can make modeling harder. PCA finds linear overlap among features and groups similar features together, while preserving as much of the original information as possible (variance).

How PCA works:
- Some dimensions carry more meaningful variation between points, representing strong patterns in the data, while others carry less meaningful variation, often representing noise rather than signal. PCA identifies directions (axes) in the feature space that have the most variation. These directions become the new principal components. It rotates the original coordinate system to align with these directions of maximum variance, so the new components capture the most important differences between points. Each principal component is a linear combination of the original features, which means that even after reducing dimensionality, we are still keeping as much information as possible from the original dataset. This also suits Logistic Regression: because the components are linear combinations of the original features, the transformed inputs remain linear rather than nonlinear.

- Why this is useful:
    - Reduces dimensionality and simplifies the model without losing key information(Captures most meaningful patterns while being robust)
    - Groups correlated features together, which prevents redundancy from one-hot encoding(Removes Dimensionality which makes it less Computationally Expensive)
    - Helps the model focus on the most informative patterns, improving stability and interpretability.

In [None]:
# Optional PCA step (asked interactively): standardise the SMOTE-resampled
# training data, fit PCA on it, and project train / validation / test.
# Original note: this code was used to find the number of PCs explaining 95% variance -> 68
# NOTE(review): the component count quoted in the notes varies (68 / 75 / 74 / 69);
# trust the value printed below. If the answer is not Y/y, none of the
# *_scaled / *_pca variables are created.
scaler = StandardScaler()
flag_pca = input("Do you want to run PCA to improve Model(Y/N): ")
if(flag_pca == "Y" or flag_pca == "y"):
    # Set flag_1 to True to also draw the cumulative explained-variance curve.
    flag_1 = False
    # Scaler is fitted on the resampled training data only, then reused for val/test.
    X_train_scaled = scaler.fit_transform(X_train_smote)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    if flag_1:
        # Full PCA (all components), only used for the diagnostic plot.
        pca = PCA().fit(X_train_scaled)
        cumulative_variance = pca.explained_variance_ratio_.cumsum() 
        plt.plot(range(1, len(cumulative_variance)+1), cumulative_variance, marker='o')
        plt.xlabel('Number of components')
        plt.ylabel('Cumulative explained variance')
        plt.grid(True)
        plt.show()

    pca = PCA(n_components = 0.95) # keep enough components to explain 0.95 of the variance (original note: or n_components = 75)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_val_pca = pca.transform(X_val_scaled)
    X_test_pca = pca.transform(X_test_scaled)


    # print("Cumulative Variance:", pca.explained_variance_ratio_.cumsum())
    # pc_variance = pca.explained_variance_ratio_
    # print("Each Individual PC Variance Explained: ", pc_variance)

    # Number of PCs to keep: the first index at which cumulative variance reaches 95%
    pc_variance = pca.explained_variance_ratio_
    cumulative_variance = np.cumsum(pc_variance)
    n_components = np.argmax(cumulative_variance >= 0.95) + 1 # argmax returns the index of the first True; +1 turns that index into a count that includes that PC
    print("Number of Components to keep for 95% variance: ", n_components)
    X_pca = pca.transform(X_train_scaled)[:,:n_components] # original note: Shape (229, n_components)

    df_pca = pd.DataFrame(X_pca, columns=[f"PC{i+1}" for i in range(n_components)])
    print("Transformed PCA DF Shape:", df_pca.shape)
        
    # pca.components_ --> (n_components_total, n_features): each row is a PC, each column a feature


#### <u> **Feature Contribution for each PC** </u>

- We have 74 principal components (PCs) that together explain 95% of the variance in our dataset. Each PC is significantly important, as all of them contribute to building this 95% variance. PC1 explains the most variance, while PC74 explains the least. However, since each PC is calculated based on all features, removing a single feature can potentially change the composition of all PCs.

- Loadings are used to describe how much each original feature contributes to a given PC. We have created a dataframe to help users visualize which features contribute the most to each PC, showing which features describe each PC the most effectively.

In [None]:

from tabulate import tabulate

# Loadings table: one row per original feature, one column per retained PC.
# pca.components_ has one row per PC, hence the transpose.
loading = pd.DataFrame(pca.components_[:n_components,:].T,
                    columns=[f"PC{i+1}" for i in range(n_components)],
                    index = X.columns
                    )

print(tabulate(loading, headers="keys", tablefmt="grid"))

# A loading's sign only gives the direction of the contribution; its magnitude
# gives the strength. Rank by absolute value so that a large negative loading
# is not overlooked in favour of a small positive one.
most_import_feature = [(pc_name, loading[pc_name].abs().idxmax()) for pc_name in loading.columns]
print("Feature that contribute most to each PC: ", most_import_feature)
df_importance_pc = pd.DataFrame(most_import_feature, columns=["PC", "Feature"])
print(tabulate(df_importance_pc, headers="keys", tablefmt="grid"))

### <u> **Decision Boundary for PC1 and PC2 of Logistic Regression Model** </u>

In [None]:
from mlxtend.plotting import plot_decision_regions
from sklearn.base import clone

# plot_decision_regions evaluates the classifier on a 2-D mesh and expects NumPy
# inputs, so the classifier it receives must be trained on exactly the two plotted
# features. log_model is fitted on every retained PC elsewhere in this notebook, so
# fit an unfitted copy on PC1/PC2 purely for this visualisation (log_model itself is
# left untouched).
# Assumes the first two columns of X_train_pca are PC1 and PC2 -- TODO confirm.
pc12_train = np.asarray(X_train_pca)[:, :2]
y_train_labels = np.asarray(y_train_smote).ravel()
boundary_model = clone(log_model).fit(pc12_train, y_train_labels)

plt.figure(figsize=(10,8))
plot_decision_regions(pc12_train, y_train_labels, clf=boundary_model)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.legend()
plt.tight_layout()
plt.show()


###  **<u> Improvement on Base Model Logistic Regression with SMOTE and PCA</u>**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve

# Sanity check: the PCA-transformed training matrix and the SMOTE-balanced labels
# must have the same number of rows before fitting. The comparison uses the same
# matrix whose shape is printed (X_train_pca).
print("*************************************************")
print("Same Shape: ",(X_train_pca.shape, y_train_smote.shape, X_train_pca.shape[0] == y_train_smote.shape[0]) )
print("*************************************************")
log_model = log_model.fit(X_train_pca, y_train_smote)

# Hard class labels feed the threshold-dependent metrics below
# (accuracy / precision / recall / F1).
y_pred_train = log_model.predict(X_train_pca)
y_pred_val = log_model.predict(X_val_pca)
y_pred_test = log_model.predict(X_test_pca)

print("Train Accuracy: ", accuracy_score(y_train_smote, y_pred_train))
print("Train Precision: ", precision_score(y_train_smote, y_pred_train))
print("Train Recall:", recall_score(y_train_smote, y_pred_train))
print("F1 Score:", f1_score(y_train_smote, y_pred_train))
print("*************************************************")

print("Val Accuracy: ", accuracy_score(y_val, y_pred_val))
print("Val Precision: ", precision_score(y_val, y_pred_val))
print("Val Recall:", recall_score(y_val, y_pred_val))
print("Val F1 Score:", f1_score(y_val, y_pred_val))
print("*************************************************")

print("Accuracy Test Score: ", accuracy_score(y_test, y_pred_test))
print("Precision Test Score: ", precision_score(y_test, y_pred_test))
print("Recall Test Score: ", recall_score(y_test, y_pred_test))
print("F1 Test Score: ", f1_score(y_test, y_pred_test))
print("*************************************************")


# ROC curves for train, validation and test.
# A ROC curve sweeps the decision threshold, so it must be built from the predicted
# probability of the positive class. Feeding it hard 0/1 predictions collapses the
# curve to a single operating point and under-reports the AUC.
for split_name, roc_label, y_true, X_split in [
    ("Train", "Log Reg with SMOTE ROC Score:", y_train_smote, X_train_pca),
    ("Validation", "Log Reg with SMOTE ROC Score:", y_val, X_val_pca),
    ("Test", "Log Reg with SMOTE AUC score: ", y_test, X_test_pca),
]:
    y_score = log_model.predict_proba(X_split)[:, 1]
    fpr, tpr, threshold = roc_curve(y_true, y_score)
    roc_score = roc_auc_score(y_true, y_score)

    plt.figure(figsize = (10,8))
    plt.plot(fpr, tpr, label = f"{roc_label}{roc_score}")
    plt.plot([0,1],[0,1],linestyle="--",label="Random Model")
    plt.title(f"AUC Curve for {split_name} of Log Reg Classifier")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

#### **<u>LOG REG WITH SMOTE NO PCA </u>**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve

# Sanity check on the SMOTE-balanced training data (no PCA in this variant).
print("*************************************************")
print("Same Shape: ",(X_train_smote.shape, y_train_smote.shape, X_train_smote.shape[0] == y_train_smote.shape[0]) )
print(type(y_train_smote))
print(y_train_smote.value_counts())
print("*************************************************")

# NOTE(review): this refits the shared log_model object in place on the non-PCA
# features. Any later cell that calls log_model on PCA-transformed inputs needs the
# PCA cell to be re-run first.
log_model.fit(X_train_smote, y_train_smote)

# Hard class labels on the original (non-PCA) feature space.
y_pred_train = log_model.predict(X_train_smote)
y_pred_val = log_model.predict(X_val)
y_pred_test = log_model.predict(X_test)

#Average = "macro" calculates the evaluation metric independently for each class and takes the unweighted average --> very good for balanced classes(each class contributed equally)
print("Train Accuracy: ", accuracy_score(y_train_smote, y_pred_train))
print("Train Precision: ", precision_score(y_train_smote, y_pred_train, average="macro"))
print("Train Recall:", recall_score(y_train_smote, y_pred_train, average="macro"))
print("F1 Score:", f1_score(y_train_smote, y_pred_train, average="macro"))
print("*************************************************")

print("Val Accuracy: ", accuracy_score(y_val, y_pred_val))
print("Val Precision: ", precision_score(y_val, y_pred_val, average="macro"))
print("Val Recall:", recall_score(y_val, y_pred_val, average="macro"))
print("Val F1 Score:", f1_score(y_val, y_pred_val, average="macro"))
print("*************************************************")

print("Accuracy Test Score: ", accuracy_score(y_test, y_pred_test))
print("Precision Test Score: ", precision_score(y_test, y_pred_test, average="macro"))
print("Recall Test Score: ", recall_score(y_test, y_pred_test, average="macro"))
print("F1 Test Score: ", f1_score(y_test, y_pred_test, average="macro"))
print("*************************************************")


# ROC curves for train, validation and test.
# Built from the predicted probability of the positive class: a ROC curve sweeps the
# decision threshold, so hard 0/1 predictions would reduce it to a single point and
# under-report the AUC.
for split_name, roc_label, y_true, X_split in [
    ("Train", "Log Reg with SMOTE ROC Score:", y_train_smote, X_train_smote),
    ("Validation", "Log Reg with SMOTE ROC Score:", y_val, X_val),
    ("Test", "Log Reg with SMOTE AUC score: ", y_test, X_test),
]:
    y_score = log_model.predict_proba(X_split)[:, 1]
    fpr, tpr, threshold = roc_curve(y_true, y_score)
    roc_score = roc_auc_score(y_true, y_score)

    plt.figure(figsize = (10,8))
    plt.plot(fpr, tpr, label = f"{roc_label}{roc_score}")
    plt.plot([0,1],[0,1],linestyle="--",label="Random Model")
    plt.title(f"AUC Curve for {split_name} of Log Reg Classifier")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()




#### <u>**Stratified KFold Cross Validation**</u>

- We use cross-validation (CV) to evaluate the model’s general performance, rather than relying on a single train-test split that could be unusually good or bad. For each validation split, we calculate the performance metrics, then take the average across all splits to get a robust estimate. We also compare training and testing metrics to detect overfitting. Additionally, we compute the standard deviation of the metrics across splits to assess variability, ensuring that performance is consistent and stable.

In [None]:
from sklearn.model_selection import cross_val_predict, cross_validate

# Ask which feature space to cross-validate on. The answer is normalised so that
# "y", "Y" and " y " are all accepted; anything else selects the non-PCA features.
pca_allowed = input("Does your Log Reg Model have PCA(Y/N): ")
cv_features = X_train_pca if pca_allowed.strip().lower() == "y" else X_train_smote

# NOTE: the training data was SMOTE-resampled before these folds are created (see
# the leakage remark in the markdown further down), so the held-out fold scores are
# optimistic.
scores = cross_validate(estimator=log_model, X = cv_features, y = y_train_smote, cv = kfold, scoring=["accuracy", "precision", "recall", "roc_auc"], return_train_score=True)

# Report mean and spread across folds: all train_* metrics first, then all test_*.
for split in ("train", "test"):
    for scorer in ("accuracy", "precision", "recall", "roc_auc"):
        metric = f"{split}_{scorer}"
        mean_score = np.mean(scores[metric])
        print(f"Mean {metric}: ", mean_score)
        print("STD: ", np.std(scores[metric]))




### <u> **Testing it on the Final Model**</u> 
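- **Data leakage warning:** SMOTE was applied before the validation folds were created, so synthetic samples derived from held-out rows leak into training. The cross-validation scores above are therefore optimistic and behave much like overfitting.
- We must still tune the SMOTE parameters, the LogisticRegression hyperparameters and the PCA parameters, and revisit feature engineering and feature selection.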

In [44]:

# Evaluate the fitted log_model on each (name, features, labels) triple listed here;
# only the PCA-transformed test split is currently included.
for name, X_set, y_set in [("Test", X_test_pca, y_test)]:
    y_pred = log_model.predict(X_set)
    # Column 1 of predict_proba is the score for the positive class, used for ROC AUC.
    y_pred_proba = log_model.predict_proba(X_set)[:, 1]

    print(f"\n{name} Set Results:")
    # Label-based metrics take the hard predictions; ROC AUC takes the probabilities.
    for metric_label, metric_fn, y_hat in (
        ("Accuracy", accuracy_score, y_pred),
        ("Precision", precision_score, y_pred),
        ("Recall", recall_score, y_pred),
        ("ROC AUC", roc_auc_score, y_pred_proba),
    ):
        print(f"{metric_label}: {metric_fn(y_set, y_hat):.4f}")


Test Set Results:
Accuracy: 0.7886
Precision: 0.1580
Recall: 0.3739
ROC AUC: 0.6541


#### <u>**Classification Reports**</u>

- By default, the Classification Report returns a string.

- Our evaluation metrics typically treat the true positive class as the only positive class. However, the classification report can evaluate both classes as the positive class in separate scenarios.

- The classification report allows us to see both macro and weighted metrics:
  - Macro assumes all classes are equally important (useful for balanced datasets).
  - Weighted accounts for class imbalance by weighting metrics according to the number of samples in each class.
####
- Note: Even if our true labels are balanced using SMOTE, this does not guarantee that predictions will be balanced. This is why macro and weighted metrics can differ.

- Using cross-validation, specifically Stratified K-Fold CV, allows us to observe how the classification report metrics vary (or remain consistent) across folds. This helps determine if the model is genuinely performing well or if high performance on a particular fold is just luck, potentially indicating overfitting.

- Choosing the optimal k in K-Fold CV is important and can be done via GridSearchCV. For example, using k = 40 produced similar results in my experiments.
  - A low k introduces high bias but is faster to compute.
  - An extremely large k approaches leave-one-out CV (LOOCV), reducing bias but increasing variance between folds and computation time.

In [None]:
from sklearn.base import clone
from sklearn.metrics import ConfusionMatrixDisplay, roc_auc_score, roc_curve
from sklearn.model_selection import cross_validate
from tabulate import tabulate

# kfold.split yields positional row indices. Convert both inputs to NumPy arrays so
# that indexing is always positional (a pandas Series indexed with [] is label-based).
X_folds = np.asarray(X_train_pca)
y_folds = np.asarray(y_train_smote).ravel()

for fold, (train_idx, test_idx) in enumerate(kfold.split(X_folds, y_folds),1):

    # Train an unfitted copy for this fold so the shared log_model is not left
    # refitted on only part of the training data once the loop finishes.
    fold_model = clone(log_model).fit(X_folds[train_idx], y_folds[train_idx])
    y_fold_true = y_folds[test_idx]
    y_prediction = fold_model.predict(X_folds[test_idx])
    # Positive-class probability; needed for a proper threshold-swept ROC curve.
    y_fold_score = fold_model.predict_proba(X_folds[test_idx])[:, 1]

    print(f"Fold {fold} Classification report ")
    print(tabulate(pd.DataFrame(classification_report(y_fold_true, y_prediction, output_dict=True)).transpose(), headers="keys", tablefmt="fancy_grid"))

    print("*******************************************************")
    ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_fold_true, y_prediction), display_labels=["Non-Default", "Default"]).plot(cmap="Blues", values_format=".2f")

    plt.figure(figsize = (10,8))
    fpr, tpr, threshold = roc_curve(y_fold_true, y_fold_score)
    roc_score = roc_auc_score(y_fold_true, y_fold_score)
    plt.plot(fpr, tpr, label = f"Log Reg with SMOTE ROC Score:{roc_score}")
    plt.plot([0,1],[0,1],linestyle="--",label="Random Model")
    plt.title(f"AUC Curve for Fold {fold} Test Log Reg Classifier")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


#### <u>**Storing Validation Splits**</u>
- This code segment stores the validation folds so that we can regenerate the ROC curve. The current train_test_split appears to have been poorly chosen, which likely caused the low metric scores. In a business context, this approach is used to replicate a validation split in order to present a realistic performance representation to stakeholders. It is important to show this process in code to demonstrate that we are not cherry-picking the most favorable split, but that the model performs well in general.

In [None]:
# Record, for every CV fold, the row indices of its train/held-out parts together
# with the ROC AUC of a fresh logistic regression trained on that fold, so a
# specific split can be reproduced later.
fold_results = []
for fold_index, (train_idx, test_idx) in enumerate(kfold.split(X_train_pca, y_train_smote), 1):
    # Fold-local names: the notebook-level X_train / X_test / y_train / y_test are
    # still used by later cells and must not be overwritten with fold subsets.
    X_fold_train, X_fold_test = X_train_pca[train_idx], X_train_pca[test_idx]
    y_fold_train, y_fold_test = y_train_smote.iloc[train_idx], y_train_smote.iloc[test_idx]
    model = LogisticRegression(random_state = 32)
    model.fit(X_fold_train, y_fold_train)
    y_prob = model.predict_proba(X_fold_test)[:,1] #Predict Probability of Class 1
    score = roc_auc_score(y_fold_test, y_prob)
    fold_results.append({"Fold": fold_index, "Train_idx": list(train_idx), "Test_idx": list(test_idx), "ROC Score": score})

# One row per fold; a freshly built frame already has a clean 0..k-1 index.
cv_fold_df = pd.DataFrame(fold_results)
print(cv_fold_df.head(10))

    


#### TODO: Find the optimal fold and plot its ROC curve — the best-scoring fold has been identified, but it is not yet clear how to apply it

### <u> **Model Selection Variables - RFE and Backward Selection** </u>

- Do not simply remove features—consider their contextual importance.

- Model selection must be approached with care. Typically, feature selection is performed before PCA, but we include it here to demonstrate that it is also a viable approach to improving model performance. However, since PCA has already been applied, removing a feature can distort the transformation. For example, if PCA is set with n_components=0.95 (to retain 95% of variance), removing even a single component can reduce the total explained variance in the model. Therefore, feature selection after PCA must be applied cautiously.

- There are two common model selection techniques:
    1. **Sequential Feature Selection (SFS)**:<br>
     - This is a business-standard method that can be performed as forward or backward selection. <br>
     
        A) **Forward selection**: starts with zero features and iteratively adds the feature that improves the model most. <br>
        
        B) **Backward selection**: starts with all features and removes the feature with the weakest predictive power at each step. <br>

        Since this is a classification problem using Logistic Regression, the default scoring metric is accuracy, which can be customized with scoring="evaluation_metric".

    2. **Recursive Feature Elimination (RFE)**:
        - RFE begins with all features and removes them iteratively based on the estimator’s feature importance. <br>
        
        - For **Logistic Regression**, feature importance comes from **coef_** (the weight of each feature). RFE ranks features by the absolute value of the coefficients (|coef|) and removes the least significant feature in each iteration. <br>

        - For **tree-based models**, RFE uses **feature_importances_**, which by default relies on Gini impurity reduction. The feature contributing the least to reducing Gini impurity is removed first. <br>

- **Important note**: Even if a feature is deemed “least significant” by RFE or SFS, it does not necessarily mean the feature is useless. It may have important interactions with other features or provide critical contextual insights:
    - For example, in predicting credit default, removing has_house without considering its context could be misleading, as owning a house may indicate a lower likelihood of default.

- When features interact or are transformed to capture more information (e.g., combining age and income into income_per_age), this is called feature engineering. Such engineered features can be more predictive than individual raw features. This is why we do not remove a feature immediately; we first have to think critically about its context.

- **PCA and interactions**: Since PCA transforms features into components, direct removal of features can affect the principal components and reduce explained variance. Interactions between features can also be lost if original features are removed prior to transformation. Careful consideration of feature importance in context is therefore crucial.

In [None]:
from sklearn.feature_selection import RFE, SequentialFeatureSelector

# Backward sequential feature selection wrapped around log_model, configured to
# keep 80 features.
sfs = SequentialFeatureSelector(
    estimator=log_model,
    n_features_to_select=80,
    direction="backward",
)
print("Parameters of Backward Feature Selection Algorithm: ", sfs.get_params())
sfs.fit(X_train, y_train)

# get_support() returns a boolean mask over the input columns (True = kept);
# inverting it yields the columns that were eliminated.
selected_features = sfs.get_support()
not_selected = X.columns[~selected_features]
print("Features Removed: ", not_selected)

In [None]:
from sklearn.feature_selection import RFE

# Recursive feature elimination: drop one feature per iteration (step=1) until 104
# remain. With a logistic-regression estimator (as the notebook text describes
# log_model) the ranking comes from coefficient magnitude, not from Gini impurity.
rfe = RFE(estimator=log_model, n_features_to_select=104, step=1)
rfe.fit(X_train_smote, y_train_smote)

# Boolean mask over the input columns: True = feature kept.
support_mask = rfe.get_support()
features_selected = X.columns[support_mask]
print("Parameters of Recursive Feature Elimination: ", rfe.get_params())
# Print the feature names rather than the raw boolean mask.
print("Features Selected: " , features_selected)
print("Features not Selected: " , X.columns[~support_mask])


### **<u>Improving Baseline Model RF</u>**

- Random Forest is an ensemble of decision trees, so it is more robust to highly correlated features. However, the model's `feature_importances_` show that it is unable to find any feature that is clearly important for predicting the TARGET feature. As a result, we should not use this model in the final model selection and optimization.

In [None]:
# Random Forest on the SMOTE-balanced, non-PCA features. PCA may discard useful
# information, so the original feature space is used here.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, auc, precision_score, accuracy_score, recall_score, f1_score


rf_model = RandomForestClassifier(random_state=42, class_weight="balanced") #balanced_subsample
rf_model.fit(X_train_smote, y_train_smote)


# Hard class labels for the threshold-dependent metrics.
y_pred_val = rf_model.predict(X_val)
y_pred_train = rf_model.predict(X_train_smote)
y_pred_test = rf_model.predict(X_test)

# Positive-class probabilities for the ROC curves. A ROC curve sweeps the decision
# threshold, so it cannot be built from hard 0/1 predictions.
y_score_train = rf_model.predict_proba(X_train_smote)[:, 1]
y_score_val = rf_model.predict_proba(X_val)[:, 1]
y_score_test = rf_model.predict_proba(X_test)[:, 1]

print("RF Accuracy Train Score:", accuracy_score(y_train_smote, y_pred_train))
print("RF Precision Train Score: ", precision_score(y_train_smote, y_pred_train))
print("RF Recall Train Score: ", recall_score(y_train_smote, y_pred_train))
print("RF F1 Train Score: ", f1_score(y_train_smote, y_pred_train))

print("RF Accuracy Validation Score:", accuracy_score(y_val, y_pred_val))
print("RF Precision Validation Score: ", precision_score(y_val, y_pred_val))
print("RF Recall Validation Score: ", recall_score(y_val, y_pred_val))
print("RF F1 Validation Score: ", f1_score(y_val, y_pred_val))


print("RF Accuracy Test Score: ", accuracy_score(y_test, y_pred_test))
print("RF Precision Test Score: ", precision_score(y_test, y_pred_test))
# A test recall far below the train recall is a sign of overfitting.
print("RF Recall Test Score: ", recall_score(y_test, y_pred_test))
print("RF F1 Test Score: ", f1_score(y_test, y_pred_test))

# ROC curve: train
plt.figure(figsize = (10,8))
fpr, tpr, threshold = roc_curve(y_train_smote, y_score_train)
roc_score = roc_auc_score(y_train_smote, y_score_train)
plt.plot(fpr, tpr, label = f"RF Classifier Train with ROC: {roc_score}")
plt.plot([0,1],[0,1],linestyle="--",label="Random Model")
plt.title("AUC Curve for Train of RF")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


# ROC curve: validation. The score in the legend is computed on the validation
# split itself, not on the test split.
fpr, tpr, threshold = roc_curve(y_val, y_score_val)
roc_auc = auc(fpr, tpr)
roc_score = roc_auc_score(y_val, y_score_val)
plt.figure(figsize=(10,8))
plt.plot(fpr, tpr, color="red", label = f"ROC CURVE for RF Validation with {roc_score}")
plt.plot([0,1], [0,1], linestyle="--", label=f"ROC CURVE Random model")
plt.legend(loc="lower right")
plt.title("RF ROC CURVE for Validation vs Random Model")
plt.xlabel('FPR')
plt.ylabel("TPR")
plt.grid(True)
plt.tight_layout()
plt.show()


# ROC curve: test
plt.figure(figsize = (10,8))
fpr, tpr, threshold = roc_curve(y_test, y_score_test)
roc_score = roc_auc_score(y_test, y_score_test)
plt.plot(fpr, tpr, label = f"RF Classifier Test with ROC Score: {roc_score}")
plt.plot([0,1],[0,1],linestyle="--",label="Random Model")
plt.title("AUC Curve for Test of RF")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


# Per-feature importance of the fitted forest. In the author's runs no feature
# stood out, so RF was dropped from final model selection; feature selection
# could be revisited.
print(pd.Series(rf_model.feature_importances_, index = X_train.columns))

### Improving XGBoost Base Model

In [None]:
# XGBoost on the SMOTE-balanced, non-PCA features. PCA may discard useful
# information, so the original feature space is used here.
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, roc_auc_score, auc, precision_score, accuracy_score, recall_score, f1_score


# NOTE: xgb_model1 is a second name for the same estimator object, not a copy;
# fitting xgb_model below also changes what xgb_model1 refers to.
xgb_model1 = xgb_model
xgb_model.fit(X_train_smote, y_train_smote)


# Hard class labels for the threshold-dependent metrics.
y_pred_val = xgb_model.predict(X_val)
y_pred_train = xgb_model.predict(X_train_smote)
y_pred_test = xgb_model.predict(X_test)

# Positive-class probabilities for the ROC curves. A ROC curve sweeps the decision
# threshold, so it cannot be built from hard 0/1 predictions.
y_score_train = xgb_model.predict_proba(X_train_smote)[:, 1]
y_score_val = xgb_model.predict_proba(X_val)[:, 1]
y_score_test = xgb_model.predict_proba(X_test)[:, 1]

print("XGB Accuracy Train Score:", accuracy_score(y_train_smote, y_pred_train))
print("XGB Precision Train Score: ", precision_score(y_train_smote, y_pred_train))
print("XGB Recall Train Score: ", recall_score(y_train_smote, y_pred_train))
print("XGB F1 Train Score: ", f1_score(y_train_smote, y_pred_train))

print("XGB Accuracy Validation Score:", accuracy_score(y_val, y_pred_val))
print("XGB Precision Validation Score: ", precision_score(y_val, y_pred_val))
print("XGB Recall Validation Score: ", recall_score(y_val, y_pred_val))
print("XGB F1 Validation Score: ", f1_score(y_val, y_pred_val))


print("XGB Accuracy Test Score: ", accuracy_score(y_test, y_pred_test))
print("XGB Precision Test Score: ", precision_score(y_test, y_pred_test))
# A test recall far below the train recall is a sign of overfitting.
print("XGB Recall Test Score: ", recall_score(y_test, y_pred_test))
print("XGB F1 Test Score: ", f1_score(y_test, y_pred_test))

# ROC curve: train
plt.figure(figsize = (10,8))
fpr, tpr, threshold = roc_curve(y_train_smote, y_score_train)
roc_score = roc_auc_score(y_train_smote, y_score_train)
plt.plot(fpr, tpr, label = f"XGB Classifier Train with ROC: {roc_score}")
plt.plot([0,1],[0,1],linestyle="--",label="Random Model")
plt.title("AUC Curve for Train of XGB")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


# ROC curve: validation. The score in the legend is computed on the validation
# split itself, not on the test split.
fpr, tpr, threshold = roc_curve(y_val, y_score_val)
roc_auc = auc(fpr, tpr)
roc_score = roc_auc_score(y_val, y_score_val)
plt.figure(figsize=(10,8))
plt.plot(fpr, tpr, color="red", label = f"ROC CURVE for XGB Validation with {roc_score}")
plt.plot([0,1], [0,1], linestyle="--", label=f"ROC CURVE Random model")
plt.legend(loc="lower right")
plt.title("XGB ROC CURVE for Validation vs Random Model")
plt.xlabel('FPR')
plt.ylabel("TPR")
plt.grid(True)
plt.tight_layout()
plt.show()


# ROC curve: test
plt.figure(figsize = (10,8))
fpr, tpr, threshold = roc_curve(y_test, y_score_test)
roc_score = roc_auc_score(y_test, y_score_test)
plt.plot(fpr, tpr, label = f"XGB Classifier Test with ROC Score: {roc_score}")
plt.plot([0,1],[0,1],linestyle="--",label="Random Model")
plt.title("AUC Curve for Test of XGB")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


# Per-feature importance reported by the fitted XGBoost model.
print(pd.Series(xgb_model.feature_importances_, index = X_train.columns))