DECISION TREE

In [None]:
# Install dependencies (uncomment if needed)
# !pip install scikit-learn joblib

# Imports
from google.colab import files
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib

from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    GridSearchCV
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    roc_curve,
    auc
)

# 1) Upload the CSV into the Colab runtime, then read it from the working directory
uploaded = files.upload()            # pick Loan.csv in the file chooser
df = pd.read_csv('Loan.csv')

# 2) Target and predictors
#    ApplicationDate and Age are left out of the feature matrix.
y = df['LoanApproved']
X = df.drop(columns=['LoanApproved','ApplicationDate','Age'])

# 3) 80/20 hold-out split, stratified on the target so both parts keep
#    the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# 4) Preprocessing + model pipeline
# Column groups by dtype: object columns are one-hot encoded, everything
# else is forwarded untouched by remainder='passthrough'.
cat_feats = list(X.select_dtypes(include='object').columns)
num_feats = list(X.select_dtypes(include=['int64','float64']).columns)

preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': categories unseen at fit time encode as all zeros
        ('ohe', OneHotEncoder(handle_unknown='ignore'), cat_feats),
    ],
    remainder='passthrough',
)

pipe = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', DecisionTreeClassifier(random_state=42)),
])

# 5) Hyperparameter grid & CV search
# Keys follow the "<step>__<param>" convention so they reach the 'clf' step.
param_grid = {
    'clf__max_depth':       [3, 5, 8, None],
    'clf__min_samples_leaf':[5, 10, 20],
    'clf__criterion':       ['gini','entropy']
}

# Shuffled, stratified 5-fold CV; every grid candidate is scored by ROC-AUC.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
search = GridSearchCV(
    pipe,
    param_grid,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)
search.fit(X_train, y_train)
best_model = search.best_estimator_

# Report the selected candidate's own fold mean and fold-to-fold spread.
# Averaging 'mean_test_score' over every row of cv_results_ would describe
# the whole grid (good and bad candidates alike), not the chosen model.
best_idx = search.best_index_
print("Best params:", search.best_params_)
print("CV ROC-AUC: %.4f ± %.4f" % (
    search.cv_results_['mean_test_score'][best_idx],
    search.cv_results_['std_test_score'][best_idx]
))

# 6) Score the tuned pipeline on the held-out 20%
# Hard labels feed the threshold metrics; the positive-class probability
# (column 1 of predict_proba) feeds ROC AUC.
y_pred  = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))
print("Accuracy  : {:.4f}".format(accuracy_score(y_test, y_pred)))
print("Precision : {:.4f}".format(precision_score(y_test, y_pred)))
print("Recall    : {:.4f}".format(recall_score(y_test, y_pred)))
print("F1 Score  : {:.4f}".format(f1_score(y_test, y_pred)))
print("ROC AUC   : {:.4f}".format(roc_auc_score(y_test, y_proba)))

# 7) Confusion matrix drawn as an annotated image
cm = confusion_matrix(y_test, y_pred)
labels = ['Not Approved', 'Approved']
fig, ax = plt.subplots(figsize=(5, 5))
ax.imshow(cm, interpolation='nearest')
# Write each count in the middle of its cell: x is the predicted column,
# y is the true row.
for (i, j), count in np.ndenumerate(cm):
    ax.text(j, i, count, ha='center', va='center')
tick_positions = range(len(labels))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
ax.set(xlabel='Predicted label', ylabel='True label', title='Confusion Matrix')
plt.tight_layout()
plt.show()

# 8) ROC curve, with the chance diagonal for reference
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc_val = auc(fpr, tpr)
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label='ROC curve (AUC = {:.2f})'.format(roc_auc_val))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()

# 9) Feature importances
# Take the output column names from the fitted ColumnTransformer itself.
# Rebuilding them as "one-hot names + num_feats" only works when every
# passthrough column is int64/float64; any other non-object dtype (bool,
# int32, ...) is passed through too and would make the name list shorter
# than the importance vector.
fitted_pre = best_model.named_steps['pre']
# Names come back as "<transformer>__<column>"; drop the transformer prefix.
feat_names = [name.split('__', 1)[-1] for name in fitted_pre.get_feature_names_out()]
importances = best_model.named_steps['clf'].feature_importances_
imp_df = pd.Series(importances, index=feat_names).sort_values(ascending=False).head(20)

print("\nTop 20 Feature Importances:")
print(imp_df)
# Re-sort ascending so the largest bar ends up at the top of the horizontal chart
imp_df.sort_values().plot.barh(figsize=(6, 8))
plt.title("Top 20 Feature Importances")
plt.show()

# 10) Save the fitted pipeline and export predictions
joblib.dump(best_model, 'dt_pipeline.joblib')
files.download('dt_pipeline.joblib')

# NOTE(review): X is the full dataset, so most of these predictions are on
# rows the model was trained on.
full_preds = best_model.predict(X)
# ApplicationDate and Age were excluded from the features; they are written
# next to each prediction in the export.
out_df = df[['ApplicationDate', 'Age']].assign(LoanApprovedFlag=full_preds)
out_df.to_csv('Loan_Predictions.csv', index=False)
files.download('Loan_Predictions.csv')






LOGISTIC REGRESSION

In [None]:
# Step 1: Importing required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve

# Step 2: Load the dataset
# NOTE(review): absolute, machine-specific path; the other cells in this
# notebook read "Loan.csv" from the working directory.
file_path = r"C:\Users\umar3\Downloads\Loan.csv"
df = pd.read_csv(file_path)

# Step 3: Initial inspection (shape, first rows, per-column null counts)
print("Initial shape of data:", df.shape)
print(df.head())
print("\nMissing values:\n", df.isnull().sum())

# Step 4: Drop ApplicationDate when the column exists
df = df.drop(columns=['ApplicationDate'], errors='ignore')

# Step 5: Encode target variable
# We assume LoanApproved is binary (Yes/No or 1/0)
if df['LoanApproved'].dtype == 'object':
    df['LoanApproved'] = df['LoanApproved'].map({'Yes': 1, 'No': 0})

# A label that is missing, or that the mapping above did not recognise, is
# NaN at this point. Such rows are dropped: the target must never be imputed,
# since that would invent a class for them.
missing_target = df['LoanApproved'].isna()
if missing_target.any():
    print("Dropping rows with missing/unrecognised LoanApproved:", int(missing_target.sum()))
df = df.loc[~missing_target].copy()
df['LoanApproved'] = df['LoanApproved'].astype(int)

# Step 6: Handle missing values
# Numeric: Impute with median | Categorical: Impute with most frequent
# The target is deliberately kept out of the numeric column list.
# NOTE(review): both imputers are fitted on the full frame, i.e. before the
# train/test split further down; fit them on the training rows only if the
# data ever contains a meaningful number of nulls.
num_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col != 'LoanApproved'
]
cat_cols = df.select_dtypes(include=['object']).columns.tolist()

num_imputer = SimpleImputer(strategy='median')
if num_cols:  # SimpleImputer rejects an empty column selection
    df[num_cols] = num_imputer.fit_transform(df[num_cols])

cat_imputer = SimpleImputer(strategy='most_frequent')
if cat_cols:
    df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# Step 7: One-hot encode the categorical columns
# drop_first=True leaves out one level per feature.
df = pd.get_dummies(df, drop_first=True)

# Step 8: Target and features
y = df['LoanApproved']
X = df.drop(columns=['LoanApproved', 'RiskScore'])

# Optional: RiskScore can be explored for further models

# Step 9: Stratified 80/20 train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Step 10: Feature scaling - statistics are learned from the training part only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 11: Logistic Regression model
logreg = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Step 12: Predicted labels and positive-class probabilities
y_pred = logreg.predict(X_test_scaled)
y_proba = logreg.predict_proba(X_test_scaled)[:, 1]

# Step 13: Threshold metrics on the predicted labels, ROC AUC on the probabilities
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))  # Important for false negatives!
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))

# Step 14: Confusion matrix as an annotated heatmap
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Step 15: Per-class precision / recall / F1
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

# Step 16: ROC curve with the chance diagonal
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc_score(y_test, y_proba):.2f})")
plt.plot([0,1], [0,1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()


LIGHT GBM

In [None]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    roc_auc_score, accuracy_score,
    mean_squared_error, confusion_matrix
)
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Load data
df = pd.read_csv("Loan.csv")  # Replace with actual path

# 2. Derive calendar features from the application date, then discard the raw column
app_date = pd.to_datetime(df["ApplicationDate"])
df["AppYear"] = app_date.dt.year
df["AppMonth"] = app_date.dt.month
df["AppDayOfWeek"] = app_date.dt.dayofweek
df = df.drop(columns=["ApplicationDate"])

# 3. Integer-encode the categorical columns (one fresh encoder per column)
categorical_cols = [
    "EmploymentStatus", "EducationLevel", "MaritalStatus",
    "HomeOwnershipStatus", "LoanPurpose"
]
for col in categorical_cols:
    encoder = LabelEncoder()
    # Cast to str so every value in the column is a string before encoding
    df[col] = encoder.fit_transform(df[col].astype(str))

# 4. Shared feature matrix; one target per task
y_class = df["LoanApproved"]
y_reg = df["RiskScore"]
X = df.drop(columns=["LoanApproved", "RiskScore"])

# 5. Train-test split (80/20); only the classification split is stratified
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X, y_class, stratify=y_class, test_size=0.2, random_state=42
)
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X, y_reg, test_size=0.2, random_state=42
)

# 6. LightGBM Classifier
# n_estimators=10000 at learning_rate=0.01 is meant as an upper bound, so
# training has to be stopped once the validation metric stops improving.
# The validation rows are carved out of the TRAINING part: using the test
# set to pick the number of rounds would leak it into model selection.
X_fit_c, X_val_c, y_fit_c, y_val_c = train_test_split(
    X_train_c, y_train_c, test_size=0.2, random_state=42, stratify=y_train_c
)
clf = lgb.LGBMClassifier(
    objective="binary",
    n_estimators=10000,
    learning_rate=0.01,
    num_leaves=31,
    random_state=42
)
clf.fit(
    X_fit_c, y_fit_c,
    eval_set=[(X_val_c, y_val_c)],
    eval_metric="auc",
    callbacks=[
        # Stop after 200 rounds without improvement of the first eval metric
        lgb.early_stopping(stopping_rounds=200, first_metric_only=True),
        # Log every 200th round instead of every round
        lgb.log_evaluation(period=200),
    ],
)

# 7. Classifier Evaluation
# Hard labels for accuracy / confusion matrix, positive-class probability for AUC
y_pred_class = clf.predict(X_test_c)
y_pred_proba = clf.predict_proba(X_test_c)[:, 1]

print("🔹 Classification Metrics:")
print("  - AUC Score:", roc_auc_score(y_test_c, y_pred_proba))
print("  - Accuracy:", accuracy_score(y_test_c, y_pred_class))

# Confusion Matrix
conf_mat = confusion_matrix(y_test_c, y_pred_class)
sns.heatmap(conf_mat, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# 8. LightGBM Regressor
# As with any 10,000-round budget at learning_rate=0.01, training is stopped
# early on a validation split taken from the training rows, so the test set
# stays untouched until the final evaluation.
X_fit_r, X_val_r, y_fit_r, y_val_r = train_test_split(
    X_train_r, y_train_r, test_size=0.2, random_state=42
)
reg = lgb.LGBMRegressor(
    objective="regression",
    n_estimators=10000,
    learning_rate=0.01,
    num_leaves=31,
    random_state=42
)
reg.fit(
    X_fit_r, y_fit_r,
    eval_set=[(X_val_r, y_val_r)],
    eval_metric="rmse",
    callbacks=[
        # Stop after 200 rounds without improvement of the first eval metric
        lgb.early_stopping(stopping_rounds=200, first_metric_only=True),
        # Log every 200th round instead of every round
        lgb.log_evaluation(period=200),
    ],
)

# 9. Regression Evaluation
from sklearn.metrics import mean_squared_error
import numpy as np

y_pred_reg = reg.predict(X_test_r)
mse = mean_squared_error(y_test_r, y_pred_reg)
rmse = np.sqrt(mse)  # back in RiskScore units
print("🔹 Regression Metrics:")
print("  - RMSE:", rmse)

# 10. Optional: Feature Importance Plot (top 15 by gain, one chart per model)
for model, plot_title in (
    (clf, "Top Features - Classifier"),
    (reg, "Top Features - Regressor"),
):
    lgb.plot_importance(model, max_num_features=15, importance_type="gain", title=plot_title)
    plt.show()

# 7. Classifier Evaluation (repeated, now with the per-class report and ROC plot)
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc

y_pred_class = clf.predict(X_test_c)
y_pred_proba = clf.predict_proba(X_test_c)[:, 1]

print("🔹 Classification Metrics:")
print("  - AUC Score:", roc_auc_score(y_test_c, y_pred_proba))
print("  - Accuracy:", accuracy_score(y_test_c, y_pred_class))

# ➕ Classification Report
print("\n📊 Classification Report:")
print(classification_report(y_test_c, y_pred_class))

# ROC points and the area under them
fpr, tpr, thresholds = roc_curve(y_test_c, y_pred_proba)
roc_auc = auc(fpr, tpr)

# ROC curve against the random-guess diagonal
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label="ROC curve (AUC = {:.2f})".format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label="Random Guess")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])  # a little headroom above TPR = 1
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("AUC - ROC Curve")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()


CATBOOST - STACKING MODEL

In [None]:
!pip install catboost

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)
import matplotlib.pyplot as plt

# Load and preprocess data
df = pd.read_csv('Loan.csv')
target = 'RiskScore'
# Integer-encode every object column.
cat_cols = df.select_dtypes(include='object').columns
for col in cat_cols:
    # Cast to str first, as the other cells in this notebook do: encoding
    # happens before any missing-value handling, and LabelEncoder cannot sort
    # a column that mixes strings with float NaN. After the cast a missing
    # value simply becomes its own 'nan' category.
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))
# Remaining gaps are numeric: fill each column with its median
df = df.fillna(df.median(numeric_only=True))

# Feature Engineering
# Income left after debt service, plus a flag for applicants who are under water
df['DisposableIncome'] = df['MonthlyIncome'] - df['MonthlyDebtPayments']
df['NegativeDisposableIncome'] = (df['DisposableIncome'] < 0).astype(int)
df['CreditStress'] = df['CreditCardUtilizationRate'] * df['MonthlyDebtPayments']
# The "+ 1" in each denominator avoids division by zero
df['PaymentBurden'] = df['MonthlyLoanPayment'] / (df['MonthlyIncome'] + 1)
df['AssetsToLiabilities'] = df['TotalAssets'] / (df['TotalLiabilities'] + 1)
df['NetAssets'] = df['TotalAssets'] - df['TotalLiabilities']
df['NetWorthToAssets'] = df['NetWorth'] / (df['TotalAssets'] + 1)
# Quartile bin of Experience as an integer code. duplicates='drop' keeps this
# from raising when two quantile edges coincide (e.g. many applicants sharing
# the same value); with distinct edges the result is unchanged.
df['ExperienceBin'] = pd.qcut(df['Experience'], q=4, labels=False, duplicates='drop')

# Prepare features and targets
X = df.drop(['RiskScore', 'LoanApproved', 'ApplicationDate'], axis=1)
# Two binary targets:
#  - RiskScore is continuous, so it is binarised at its median (1 = above median).
#  - LoanApproved is already a 0/1 label and is used as is. Thresholding it by
#    its own median only works while fewer than half of the rows are approved;
#    otherwise the median is 1 and every label would collapse to 0.
y = pd.DataFrame({
    'RiskScore': (df['RiskScore'] > df['RiskScore'].median()).astype(int),
    'LoanApproved': df['LoanApproved'].astype(int),
})

# Split and scale data
# y has two columns here, so the split is stratified on both targets together
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Scaling statistics come from the training part only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base learners for the stack, and the meta-model that combines them
catboost_learner = CatBoostClassifier(verbose=0, iterations=500, depth=6, learning_rate=0.1)
svm_learner = SVC(probability=True, kernel='rbf', C=1.0)
base_learners = [
    ('catboost', catboost_learner),
    ('svm', svm_learner),
]
meta_model = LogisticRegression()

# Evaluation function
def print_metrics(y_true, y_pred_class, title="Model Metrics"):
    """Print accuracy, recall, precision and F1 for one set of predictions.

    Args:
        y_true: True class labels.
        y_pred_class: Predicted class labels, aligned with ``y_true``.
        title: Heading printed above the four scores.
    """
    # All four scores are computed before anything is printed
    scores = {
        "Accuracy": accuracy_score(y_true, y_pred_class),
        "Recall": recall_score(y_true, y_pred_class),
        "Precision": precision_score(y_true, y_pred_class),
        "F1 Score": f1_score(y_true, y_pred_class),
    }

    print(f"\n📊 {title}")
    for metric_name, score in scores.items():
        print(f"{metric_name}: {score:.4f}")

# Fit one stacking classifier per target column; keep the fitted model and
# its test-set predictions under the target's name
stack_models, predictions = {}, {}

for target in y_train.columns:
    print(f"\n🎯 Training for target: {target}")
    stack = StackingClassifier(
        estimators=base_learners,
        final_estimator=meta_model,
        cv=5,
    )
    stack.fit(X_train_scaled, y_train[target])
    preds = stack.predict(X_test_scaled)
    stack_models[target], predictions[target] = stack, preds

# Print metrics for each target
for target in y_test.columns:
    print_metrics(y_test[target], predictions[target], title=f"{target} Model")

# Plot confusion matrices
def plot_conf_matrix(ax, y_true, y_pred, title):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        y_true: True class labels.
        y_pred: Predicted class labels.
        title: Title set on ``ax``.
    """
    display = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_true, y_pred))
    display.plot(cmap=plt.cm.Blues, ax=ax, colorbar=False)
    ax.set_title(title)

# One panel per target, side by side
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for axis, column in zip(axes, ['RiskScore', 'LoanApproved']):
    plot_conf_matrix(axis, y_test[column], predictions[column], column)
plt.tight_layout()
plt.show()

# AUC-ROC Scores
for target in ['RiskScore', 'LoanApproved']:
    if hasattr(stack_models[target], "predict_proba"):
        # Column 1 holds the probability of the positive class
        probs = stack_models[target].predict_proba(X_test_scaled)[:, 1]
        # Named auc_score, not auc: other cells of this notebook import
        # sklearn.metrics.auc, and rebinding that name to a float in the
        # shared kernel would break them when they are re-run.
        auc_score = roc_auc_score(y_test[target], probs)
        print(f"🔸 AUC-ROC Score for {target}: {auc_score:.4f}")
    else:
        print(f"⚠️ Model for {target} does not support probability prediction.")

# Classification reports
for target in ['RiskScore', 'LoanApproved']:
    print(f"\n📋 Classification Report for {target}:\n")
    print(classification_report(y_test[target], predictions[target]))


XGB MODEL


*M. ALI MITHANI 25900*

I used an XGBoost model, with one additional step that takes a newer direction. Instead of the conventional train/test split, I used generative AI to create a separate test data file, Loan_test, with only 2,000 entries and without the two target columns (Loan Approval and Risk Score), to be used alongside the training data. I did not use the conventional technique because dropping the two columns from the training data and then testing on that same data did not seem like the right thing to do; it would also lead to data leakage.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from google.colab import files

# Upload and load your original training dataset
uploaded = files.upload()  # Upload "Loan.csv"
df = pd.read_csv("Loan.csv")

# Drop ID if added previously and ensure the correct columns are present
df = df.drop(columns=['ID'], errors='ignore')

# Drop target columns so the test file contains features only
features_df = df.drop(columns=['LoanApproved', 'RiskScore'])

# Draw 2000 rows with replacement.
# Row sampling does not depend on the cell values, so the categorical columns
# are sampled as they are. Label-encoding them first and decoding afterwards
# selects exactly the same rows, but its str() cast would turn a missing
# category into the literal string 'nan'.
# NOTE(review): every row produced here is a copy of a row in Loan.csv, so
# this file is a bootstrap resample of the training data, not independent
# test data - scores measured on it will be optimistic.
synthetic_data = features_df.sample(n=2000, replace=True, random_state=42).reset_index(drop=True)

# Save the test set (without target columns)
synthetic_data.to_csv('Loan_test.csv', index=False)
files.download('Loan_test.csv')

Here I loaded all the required libraries and uploaded the test and training data files.

In [None]:
# Install dependencies
!pip install xgboost imbalanced-learn --quiet

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error
from sklearn.feature_selection import mutual_info_classif
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier, XGBRegressor
from google.colab import files

# Upload training and test datasets
# Each files.upload() call opens Colab's file picker; the prompt printed just
# before it says which file is expected.
print("Upload Loan.csv (training set)")
train_file = files.upload()

print("Upload Loan_test.csv (test set)")
test_file = files.upload()




In [None]:
# Load datasets
# Both files are read from the working directory: Loan.csv is the labelled
# training data, Loan_test.csv the feature-only test file.
train_df = pd.read_csv("Loan.csv")
test_df = pd.read_csv("Loan_test.csv")


In [None]:
# Columns and first 5 rows of the training dataset
train_df.head()

In [None]:
# Statistical summary (describe()) of the training dataset
train_df.describe()

In [None]:
# Data types and non-null counts of the training columns
train_df.info()

In [None]:
# Looking for null values: number of nulls per column
# Author's observation for this dataset: there are no null values
train_df.isnull().sum()

In [None]:
test_df.head() # Columns & first 5 rows of the test dataset

Here, I first created an ID column to be used in the final submission file, then I dropped the ApplicationDate columns. Although I checked above that no null values existed, I added the dropna code chunk so that the model remains fully functional on any dataset where null values may be present.

Then I carried out feature engineering using the given columns, creating several new features, and visualised them in a bar plot to see which of them are most important. Then I prepared the training features and the two targets (Loan Approval and Risk Score), standardised the features using StandardScaler, and handled class imbalance using SMOTE.

Then I set the XGBoost parameters: classification is used for Loan Approval and regression for Risk Score. I tried tweaking the parameters by increasing and decreasing n_estimators and the learning rate, keeping them the same for both models; the best results came with n_estimators between 500 and 1000 and a learning rate between 0.05 and 0.5.
Then I predicted both the classification and the regression values, visualised a confusion matrix together with a classification report, and downloaded a submission file.

In [None]:


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, f1_score
from sklearn.feature_selection import mutual_info_classif
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier, XGBRegressor
from google.colab import files


# Assign IDs: 1-based row numbers for the submission file
test_df['ID'] = range(1, len(test_df) + 1)

# Drop ApplicationDate from both frames
for frame in (train_df, test_df):
    frame.drop(columns=['ApplicationDate'], inplace=True)

# Missing values
# First discard training rows with fewer than 80% of their columns present,
# then fill the remaining numeric gaps with the training medians.
min_non_null = int(0.8 * train_df.shape[1])
train_df = train_df.dropna(axis=0, thresh=min_non_null)
train_medians = train_df.median(numeric_only=True)
train_df.fillna(train_medians, inplace=True)
test_df.fillna(train_medians, inplace=True)  # Use training stats

# Encode categorical variables
# One encoder per column, fitted on the training values and reused for the
# test frame; the fitted encoders are kept in label_encoders by column name.
# NOTE(review): transform() raises on a test value the encoder never saw
# during fitting.
categoricals = train_df.select_dtypes(include='object').columns
label_encoders = {}

for col in categoricals:
    le = LabelEncoder().fit(train_df[col].astype(str))
    train_df[col] = le.transform(train_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))
    label_encoders[col] = le

# Feature engineering
def engineer_features(df):
    """Return a copy of ``df`` with five derived columns added.

    The input frame is left untouched: the new columns are written to a copy,
    so the caller's frame is not modified behind its back and no
    chained-assignment warning is triggered when ``df`` is itself a
    filtered slice.

    Args:
        df: DataFrame containing AnnualIncome, NumberOfDependents, LoanAmount,
            TotalLiabilities, TotalAssets, CreditCardUtilizationRate,
            NumberOfOpenCreditLines, MonthlyIncome, MonthlyDebtPayments and
            MonthlyLoanPayment.

    Returns:
        A new DataFrame with the original columns plus IncomePerDependent,
        LoanToIncome, DebtToAssets, CreditUtilization and MonthlyFreeIncome.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    df = df.copy()
    # The "+ 1" in each denominator avoids division by zero
    df['IncomePerDependent'] = df['AnnualIncome'] / (df['NumberOfDependents'] + 1)
    df['LoanToIncome'] = df['LoanAmount'] / (df['AnnualIncome'] + 1)
    df['DebtToAssets'] = df['TotalLiabilities'] / (df['TotalAssets'] + 1)
    df['CreditUtilization'] = df['CreditCardUtilizationRate'] * df['NumberOfOpenCreditLines']
    # Income left after both debt payments and the loan instalment
    df['MonthlyFreeIncome'] = df['MonthlyIncome'] - df['MonthlyDebtPayments'] - df['MonthlyLoanPayment']
    return df

# Add the engineered columns to both frames
train_df = engineer_features(train_df)
test_df = engineer_features(test_df)

# Visualize feature importance
# Mutual information between each feature and the approval label.
# random_state is fixed because the estimator is randomised; without it the
# bars change from run to run, unlike every other seeded step in this notebook.
X_vis = train_df.drop(columns=['LoanApproved', 'RiskScore'])
y_vis = train_df['LoanApproved']
mi = mutual_info_classif(X_vis, y_vis, random_state=42)
pd.Series(mi, index=X_vis.columns).sort_values(ascending=False).plot(
    kind='bar', figsize=(14, 5), title="Feature Importance (Mutual Info)"
)
plt.show()

# Prepare training features
X = train_df.drop(columns=['LoanApproved', 'RiskScore'])
y_class = train_df['LoanApproved']
y_reg = train_df['RiskScore']

# Split data
# One split serves both tasks, so the classifier and the regressor are
# validated on the same rows. It is stratified on the approval label so the
# class ratio is the same in the training and validation parts.
(X_train_raw, X_val_raw,
 y_train_cls, y_val_cls,
 y_train_reg, y_val_reg) = train_test_split(
    X, y_class, y_reg, test_size=0.2, random_state=42, stratify=y_class
)

# Standardize features
# The scaler is fitted on the training part only; fitting it before the split
# would let validation rows influence the scaling statistics.
scaler = StandardScaler()
X_train_cls = scaler.fit_transform(X_train_raw)
X_val_cls = scaler.transform(X_val_raw)
X_train_reg, X_val_reg = X_train_cls, X_val_cls
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(test_df.drop(columns=['ID']))

# Handle imbalance with SMOTE - inside cross-validation
# Oversampling before the grid search would put synthetic points, which are
# interpolated from training-fold neighbours, into the validation folds and
# inflate every CV score. An imblearn Pipeline resamples only the rows each
# model is fitted on and leaves validation folds and predict() untouched.
from imblearn.pipeline import Pipeline as ImbPipeline

smote = SMOTE(random_state=42)

# Model tuning - Classification ("clf__" routes each parameter to the XGBoost step)
clf_params = {
    'clf__max_depth': [3, 5],
    'clf__learning_rate': [0.05, 0.5],
    'clf__n_estimators': [500, 1000],
    'clf__subsample': [0.8, 1],
    'clf__colsample_bytree': [0.8, 1]
}

clf_base = ImbPipeline([
    ('smote', smote),
    ('clf', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
])
clf_grid = GridSearchCV(clf_base, clf_params, scoring='f1', cv=3, verbose=1, n_jobs=-1)
clf_grid.fit(X_train_cls, y_train_cls)
# The refitted best pipeline: SMOTE + XGBoost trained on the whole training part
clf = clf_grid.best_estimator_

# Model tuning - Regression
# Same grid as the classifier; candidates are ranked by negative MSE
# (higher is better) over 3 folds.
reg_params = dict(
    max_depth=[3, 5],
    learning_rate=[0.05, 0.5],
    n_estimators=[500, 1000],
    subsample=[0.8, 1],
    colsample_bytree=[0.8, 1],
)

reg_base = XGBRegressor(random_state=42)
reg_grid = GridSearchCV(
    reg_base,
    reg_params,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1,
)
reg_grid.fit(X_train_reg, y_train_reg)
reg = reg_grid.best_estimator_

# Evaluate classification on the validation rows
y_pred_cls = clf.predict(X_val_cls)
cls_report = classification_report(y_val_cls, y_pred_cls)
print("Classification Report:\n", cls_report)
val_cm = confusion_matrix(y_val_cls, y_pred_cls)
sns.heatmap(val_cm, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.show()

# Evaluate regression on the validation rows
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

y_pred_reg = reg.predict(X_val_reg)
rmse = np.sqrt(mean_squared_error(y_val_reg, y_pred_reg))
r2 = r2_score(y_val_reg, y_pred_reg)

# RMSE is reported under both labels
print(f"RMSE (RiskScore): {rmse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

# Predict both targets for the test rows and store them next to the IDs
test_df['LoanApproved'] = clf.predict(X_test_scaled)
test_df['RiskScore'] = reg.predict(X_test_scaled)

# Save submission: ID plus the two predicted columns
submission = test_df.loc[:, ['ID', 'LoanApproved', 'RiskScore']]
submission.to_csv('submission.csv', index=False)
files.download('submission.csv')
