In [None]:
# Import libraries
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_imb_pipeline
import joblib

In [None]:
# Load data
# The CSV location can be overridden by setting the LOAN_DATA_PATH environment
# variable, so the notebook also runs on machines where the default path below
# does not exist. Without the variable the original location is used unchanged.
file_path = os.environ.get("LOAN_DATA_PATH", r"C:\Users\hp\Desktop\work\loan_data.csv")
df = pd.read_csv(file_path)


In [None]:
# Check dataset info
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appends a stray "None" line.
df.info()

# Check for missing values
print(df.isnull().sum())

# Summary statistics
print(df.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            381 non-null    object 
 1   Gender             376 non-null    object 
 2   Married            381 non-null    object 
 3   Dependents         373 non-null    object 
 4   Education          381 non-null    object 
 5   Self_Employed      360 non-null    object 
 6   ApplicantIncome    381 non-null    int64  
 7   CoapplicantIncome  381 non-null    float64
 8   LoanAmount         381 non-null    float64
 9   Loan_Amount_Term   370 non-null    float64
 10  Credit_History     351 non-null    float64
 11  Property_Area      381 non-null    object 
 12  Loan_Status        381 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 38.8+ KB
None
Loan_ID               0
Gender                5
Married               0
Dependents            8
Education  

In [None]:
# Handle missing values
def _fill_with_mode(series):
    """Return `series` with missing entries replaced by its most frequent value.

    A column without any observed value has no mode; it is returned unchanged
    instead of failing on `mode()[0]`.
    """
    modes = series.mode()
    if modes.empty:
        return series
    return series.fillna(modes.iloc[0])


def preprocess_data(df):
    """Clean the loan table and add the engineered income/instalment features.

    Steps:
      * drop the `Loan_ID` column when present;
      * convert `Dependents` to float, reading '3+' as 3;
      * impute missing values (mode for `Gender`, `Married`, `Self_Employed`
        and `Credit_History`; median for `Dependents`, `LoanAmount` and
        `Loan_Amount_Term`);
      * add `TotalIncome`, `EMI` and `BalanceIncome`.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw loan data. It is not modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        Preprocessed copy of `df`.

    Notes
    -----
    Imputation statistics are computed from the frame passed in. When this is
    called on the full dataset before a train/test split, the held-out rows
    contribute to those statistics.
    """
    # drop() returns a new frame, so the assignments below never touch the
    # caller's DataFrame.
    df = df.drop(columns=['Loan_ID'], errors='ignore')

    # Convert 'Dependents' to numerical ('3+' is read as 3)
    df['Dependents'] = pd.to_numeric(df['Dependents'].replace({'3+': '3'})).astype(float)

    # Fill missing values
    df['Gender'] = _fill_with_mode(df['Gender'])
    df['Married'] = _fill_with_mode(df['Married'])
    df['Dependents'] = df['Dependents'].fillna(df['Dependents'].median())
    df['Self_Employed'] = _fill_with_mode(df['Self_Employed'])
    df['Credit_History'] = _fill_with_mode(df['Credit_History'])

    # LoanAmount feeds EMI and BalanceIncome; an unfilled gap would otherwise
    # surface as NaN in both engineered columns.
    df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())

    # A term of 0 would make EMI infinite, so it is treated as missing and
    # imputed together with the real gaps.
    term = df['Loan_Amount_Term']
    term = term.mask(term == 0)
    df['Loan_Amount_Term'] = term.fillna(term.median())

    # Feature Engineering
    df['TotalIncome'] = np.log1p(df['ApplicantIncome'] + df['CoapplicantIncome'])
    df['EMI'] = df['LoanAmount'] / df['Loan_Amount_Term']
    # NOTE(review): TotalIncome is log-transformed while EMI is on its raw
    # scale, so this difference mixes two scales. The formula is kept as-is
    # here; confirm the intended units before changing it.
    df['BalanceIncome'] = df['TotalIncome'] - df['EMI']

    return df

# Rebind df to the preprocessed frame (Loan_ID dropped, gaps imputed,
# engineered columns added). The raw table is no longer reachable under this
# name afterwards; re-run the load cell to get it back.
df = preprocess_data(df)

In [None]:
# Define features and target
# The raw income columns are dropped because TotalIncome already combines them.
X = df.drop(columns=['Loan_Status', 'ApplicantIncome', 'CoapplicantIncome'])
y = df['Loan_Status'].map({'N': 0, 'Y': 1})

# .map() turns every label outside the mapping into NaN without complaint;
# fail here with a clear message rather than later inside the split or a model.
if y.isnull().any():
    unexpected_labels = sorted(df.loc[y.isnull(), 'Loan_Status'].astype(str).unique())
    raise ValueError(f"Unexpected Loan_Status values: {unexpected_labels}")
y = y.astype(int)

# Identify feature types
numeric_features = ['LoanAmount', 'Loan_Amount_Term', 'Credit_History',
                    'TotalIncome', 'EMI', 'BalanceIncome']
# 'Dependents' was cast to float during preprocessing but is deliberately
# listed here, so it is one-hot encoded as a category rather than scaled.
categorical_features = ['Gender', 'Married', 'Dependents', 'Education',
                        'Self_Employed', 'Property_Area']

# Every listed feature must exist in X, otherwise the ColumnTransformer fails
# at fit time with a less direct error.
missing_features = [col for col in numeric_features + categorical_features
                    if col not in X.columns]
if missing_features:
    raise ValueError(f"Features missing from X: {missing_features}")

In [None]:
# Create preprocessing pipeline
# Numeric columns are standardised; categorical columns are one-hot encoded
# with the first level of each column dropped.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(drop='first'), categorical_features),
])

# Handle class imbalance
smote = SMOTE(random_state=42, sampling_strategy='auto')

# Define models with class weights
# Insertion order is the order in which the models are trained and reported.
models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000, class_weight='balanced')
models['Random Forest'] = RandomForestClassifier(random_state=42, class_weight='balanced')
models['SVM'] = SVC(random_state=42, probability=True, class_weight='balanced')


In [None]:
# Model Training & Evaluation
# -----------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# The fold splitter does not depend on the model, so it is built once.
cv = StratifiedKFold(n_splits=5)

best_model = None
best_name = None
best_score = float('-inf')   # mean CV ROC-AUC of the selected model
best_test_score = None       # test ROC-AUC of the selected model (reporting only)

for name, model in models.items():
    # Create pipeline with SMOTE
    pipeline = make_imb_pipeline(preprocessor, smote, model)

    # Cross-validation on the training split only
    scores = cross_val_score(pipeline, X_train, y_train,
                             cv=cv, scoring='roc_auc')
    cv_score = np.mean(scores)

    # Train final model
    pipeline.fit(X_train, y_train)

    # Evaluation on the held-out split
    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)[:, 1]
    test_score = roc_auc_score(y_test, y_proba)

    print(f"\n{name} Performance:")
    print(f"CV ROC-AUC: {cv_score:.3f} (±{np.std(scores):.3f})")
    print(f"Test ROC-AUC: {test_score:.3f}")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Track best model. Selection uses the cross-validated score: picking the
    # winner by its test-set score would make the reported test metric an
    # optimistic estimate, since the test data would have steered the choice.
    if cv_score > best_score:
        best_score = cv_score
        best_test_score = test_score
        best_name = name
        best_model = pipeline

# Save best model
if best_model is not None:
    joblib.dump(best_model, 'best_loan_model.pkl')
    # Read the estimator from the last pipeline step; a hard-coded step name
    # such as 'svc' raises KeyError whenever a different model wins.
    final_estimator = best_model.steps[-1][1]
    print(f"\nSaved best model ({best_name}: {type(final_estimator).__name__}) "
          f"with CV ROC-AUC: {best_score:.3f} (test ROC-AUC: {best_test_score:.3f})")


Logistic Regression Performance:
CV ROC-AUC: 0.767 (±0.034)
Test ROC-AUC: 0.831
              precision    recall  f1-score   support

           0       0.80      0.55      0.65        22
           1       0.84      0.95      0.89        55

    accuracy                           0.83        77
   macro avg       0.82      0.75      0.77        77
weighted avg       0.83      0.83      0.82        77

Confusion Matrix:
[[12 10]
 [ 3 52]]

Random Forest Performance:
CV ROC-AUC: 0.764 (±0.048)
Test ROC-AUC: 0.831
              precision    recall  f1-score   support

           0       0.88      0.64      0.74        22
           1       0.87      0.96      0.91        55

    accuracy                           0.87        77
   macro avg       0.87      0.80      0.83        77
weighted avg       0.87      0.87      0.86        77

Confusion Matrix:
[[14  8]
 [ 2 53]]

SVM Performance:
CV ROC-AUC: 0.777 (±0.062)
Test ROC-AUC: 0.845
              precision    recall  f1-score   suppo

In [None]:
# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ])

# Use only Random Forest
model = make_imb_pipeline(
    preprocessor,
    SMOTE(sampling_strategy='auto', random_state=42),
    RandomForestClassifier(
        n_estimators=200,
        class_weight='balanced',
        max_depth=5,
        random_state=42
    )
)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Train model
model.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split so the saved model has a
# recorded test metric.
test_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print(f"Test ROC-AUC: {test_auc:.3f}")

# Feature Importance Extraction
# -----------------------------------------------
# Best-effort: a failure here is reported but does not stop the model from
# being saved below.
try:
    # Get the fitted preprocessor and forest from the pipeline
    fitted_preprocessor = model.named_steps['columntransformer']
    rf_model = model.named_steps['randomforestclassifier']

    # Get feature names in the transformer's output order: the 'num' columns
    # first, then the one-hot columns produced by 'cat'.
    numeric_feats = list(numeric_features)
    cat_encoder = fitted_preprocessor.named_transformers_['cat']
    cat_feats = list(cat_encoder.get_feature_names_out(categorical_features))
    all_features = numeric_feats + cat_feats

    # Get importances
    importances = rf_model.feature_importances_

    # zip() stops at the shorter input, so a count mismatch would silently
    # attach importances to the wrong names; surface it instead.
    if len(all_features) != len(importances):
        raise ValueError(
            f"expected {len(importances)} feature names, got {len(all_features)}"
        )

    print("\nTop 10 Feature Importances:")
    ranked = sorted(zip(all_features, importances),
                    key=lambda pair: pair[1], reverse=True)
    for feat, imp in ranked[:10]:
        print(f"{feat}: {imp:.4f}")

except Exception as e:
    print(f"Feature importance error: {str(e)}")

# Save model
joblib.dump(model, 'loan_model_rf.pkl')


Top 10 Feature Importances:
Credit_History: 0.4066
TotalIncome: 0.0865
BalanceIncome: 0.0755
Property_Area_Semiurban: 0.0629
LoanAmount: 0.0608
Married_Yes: 0.0566
EMI: 0.0522
Loan_Amount_Term: 0.0440
Gender_Male: 0.0382
Property_Area_Urban: 0.0291


['loan_model_rf.pkl']