# Importing Libraries


In [None]:
import numpy as np # For reading and manipulating structured data (like CSV files)
import pandas as pd # For numerical operations and array handling
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt



# Loading the Data

In [25]:
df = pd.read_csv('./Dataset/Loan.csv') #Using pandas to load the data

# Displaying the data

In [26]:
print(df)

            ID  year loan_limit             Gender approv_in_adv loan_purpose  \
0        24913  2019         cf             Female         nopre           p3   
1        24914  2019         cf               Male         nopre           p3   
2        24915  2019        NaN              Joint         nopre           p3   
3        24916  2019         cf              Joint         nopre           p1   
4        24917  2019         cf  Sex Not Available         nopre           p4   
...        ...   ...        ...                ...           ...          ...   
148642  173555  2019         cf  Sex Not Available         nopre           p3   
148643  173556  2019         cf               Male         nopre           p1   
148644  173557  2019         cf               Male         nopre           p4   
148645  173558  2019         cf             Female         nopre           p4   
148646  173559  2019         cf             Female         nopre           p3   

       Credit_Worthiness op

# Selecting Only Important data for Classification

In [27]:
# Identify and keep only non-numeric (categorical) columns
categorical_cols = df.select_dtypes(exclude=['number']).columns.tolist()

# Ensure the target column 'Status' is retained, even if numeric-coded
if 'Status' not in categorical_cols:
    categorical_cols.append('Status')

# Create a new DataFrame containing only categorical features and the target
cat_df = df[categorical_cols]

# Drop rows with any missing categorical values to prepare for modeling
cat_df = cat_df.dropna()

# Print the resulting shape: (number of rows, number of categorical columns + target)
print(cat_df)

       loan_limit             Gender approv_in_adv loan_purpose  \
0              cf             Female         nopre           p3   
1              cf               Male         nopre           p3   
3              cf              Joint         nopre           p1   
4              cf  Sex Not Available         nopre           p4   
5              cf             Female         nopre           p1   
...           ...                ...           ...          ...   
148642         cf  Sex Not Available         nopre           p3   
148643         cf               Male         nopre           p1   
148644         cf               Male         nopre           p4   
148645         cf             Female         nopre           p4   
148646         cf             Female         nopre           p3   

       Credit_Worthiness open_credit business_or_commercial Neg_ammortization  \
0                     l1        nopc                  nob/c           not_neg   
1                     l2        n

# Question 2 Decison Tress Algorithm Selection

In [28]:

# Select only non-numeric (categorical) columns and ensure 'Status' is included
categorical_cols = df.select_dtypes(exclude=['number']).columns.tolist()
if 'Status' not in categorical_cols:
    categorical_cols.append('Status')
cat_df = df[categorical_cols].dropna()  # drop rows with missing categorical values

# Prepare X and y
#    - X: one-hot encoded categorical features
#    - y: binary target (0 or 1)
X = pd.get_dummies(cat_df.drop('Status', axis=1), drop_first=True)
y = cat_df['Status'].astype(int)

# Split into train and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 5. Train the Decision Tree classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# 6. Make predictions and evaluate
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# 7. Extract and display the top 10 feature importances
feat_imp = pd.DataFrame({
    'feature': X.columns,
    'importance': clf.feature_importances_
}).sort_values('importance', ascending=False).head(10)

print("\nTop 10 Feature Importances:")
print(feat_imp.to_string(index=False))


Accuracy: 0.8607994998784427

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.97      0.91     21740
           1       0.87      0.51      0.64      7053

    accuracy                           0.86     28793
   macro avg       0.86      0.74      0.78     28793
weighted avg       0.86      0.86      0.85     28793


Top 10 Feature Importances:
                     feature  importance
            credit_type_EQUI    0.680293
   lump_sum_payment_not_lpsm    0.041412
        Credit_Worthiness_l2    0.029704
   Neg_ammortization_not_neg    0.025016
            credit_type_CRIF    0.017708
             credit_type_EXP    0.017106
                Region_south    0.014100
             loan_purpose_p3    0.013859
           approv_in_adv_pre    0.013459
business_or_commercial_nob/c    0.013370


# Gradient Boosting Classifiers

In [29]:
# Select only non-numeric (categorical) columns and ensure 'Status' is included
categorical_cols = df.select_dtypes(exclude=['number']).columns.tolist()
if 'Status' not in categorical_cols:
    categorical_cols.append('Status')
cat_df = df[categorical_cols].dropna()  # drop rows with missing categorical values

# Prepare X and y
#    - X: one-hot encoded categorical features
#    - y: binary target (0 or 1)
X = pd.get_dummies(cat_df.drop('Status', axis=1), drop_first=True)
y = cat_df['Status'].astype(int)

# 4. Split into train and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Train the Gradient Boosting classifier
gbc = GradientBoostingClassifier(
    n_estimators=100,      # number of trees
    learning_rate=0.1,     # step size shrinkage
    max_depth=3,           # depth of each tree
    random_state=42
)
gbc.fit(X_train, y_train)

#  Make predictions and evaluate
y_pred = gbc.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Extract and display the top 10 feature importances
feat_imp = pd.DataFrame({
    'feature': X.columns,
    'importance': gbc.feature_importances_
}).sort_values('importance', ascending=False).head(10)

print("\nTop 10 Feature Importances:")
print(feat_imp.to_string(index=False))

Accuracy: 0.8699683950960303

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.99      0.92     21740
           1       0.96      0.49      0.65      7053

    accuracy                           0.87     28793
   macro avg       0.91      0.74      0.78     28793
weighted avg       0.88      0.87      0.85     28793


Top 10 Feature Importances:
                     feature  importance
            credit_type_EQUI    0.842405
   lump_sum_payment_not_lpsm    0.051857
   Neg_ammortization_not_neg    0.033999
        Credit_Worthiness_l2    0.019381
business_or_commercial_nob/c    0.017201
                Gender_Joint    0.006880
              loan_limit_ncf    0.005369
co-applicant_credit_type_EXP    0.004261
           approv_in_adv_pre    0.003601
             loan_purpose_p2    0.001628


# Question 3 Comprehensive Model development Process

In [30]:
# Data cleaning
#   • Strip whitespace from object columns
#   • Standardize text to lowercase
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].str.strip().str.lower()

# Feature engineering (example)
#   • Create debt_to_income_ratio if not already present
if 'debt_to_income_ratio' not in df.columns and {'debt', 'income'}.issubset(df.columns):
    df['debt_to_income_ratio'] = df['debt'] / df['income']

#   • Bin age into broader groups (ordinal feature)
if 'age' in df.columns and df['age'].dtype == 'object':
    age_mapping = {
        '18-24': 1, '25-34': 2,
        '35-44': 3, '45-54': 4,
        '55-64': 5, '65-74': 6,
        '75+': 7
    }
    df['age_group'] = df['age'].map(age_mapping)

# Define feature sets
target = 'Status'
all_features = df.columns.drop(target)

# Identify column types
numeric_features   = df[all_features].select_dtypes(include=['int64','float64']).columns.tolist()
categorical_feats  = df[all_features].select_dtypes(include=['object','category']).columns.tolist()
ordinal_feats      = ['age_group'] if 'age_group' in df.columns else []

# Handling missing values & Encoding & Scaling
#    • Numeric: impute with median + standard scale
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler',   StandardScaler())
])

#    • Ordinal: impute with most frequent (or constant) — no scaling
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

#    • Categorical: impute with constant 'missing' + one‐hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot',  OneHotEncoder(handle_unknown='ignore'))
])

# Assemble into a ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('num',    numeric_transformer,   numeric_features),
    ('ord',    ordinal_transformer,   ordinal_feats),
    ('cat',    categorical_transformer, categorical_feats),
], remainder='drop')  # drop any other columns

# 8. (Optional) Wrap in a Pipeline with your model
from sklearn.ensemble import GradientBoostingClassifier
clf = Pipeline(steps=[
    ('preproc', preprocessor),
    ('model',   GradientBoostingClassifier(
                    n_estimators=100,
                    learning_rate=0.1,
                    max_depth=3,
                    random_state=42))
])

# 9. Train/test split and fit
from sklearn.model_selection import train_test_split
X = df[all_features]
y = df[target].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

clf.fit(X_train, y_train)

# 10. Evaluate
from sklearn.metrics import classification_report
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     22404
           1       1.00      1.00      1.00      7326

    accuracy                           1.00     29730
   macro avg       1.00      1.00      1.00     29730
weighted avg       1.00      1.00      1.00     29730



# Question 4 Performance Metrics


In [None]:
# %% [code]
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_curve, auc, precision_recall_curve,
    confusion_matrix, classification_report
)


# Fit best model or baseline
best_model = grid.best_estimator_  # or use fitted pipeline .fit(...) earlier
y_pred  = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:,1]

# Primary metrics
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))
print("F1-Score :", f1_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))



# Evaluations

# 1. Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# 2. Precision-Recall Curve
prec, rec, _ = precision_recall_curve(y_test, y_proba)
plt.figure(); plt.plot(rec, prec); plt.xlabel("Recall"); plt.ylabel("Precision")
plt.title("Precision-Recall Curve"); plt.show()

# 3. ROC Curve & AUC
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)
plt.figure(); plt.plot(fpr, tpr); plt.plot([0,1],[0,1],'--')
plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title(f"ROC Curve (AUC={roc_auc:.3f})")
plt.show()
