# Loan Status Prediction - Industry Prototype

## Importing the Library 

In [117]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_validate, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import joblib
import warnings
warnings.filterwarnings("ignore")

## Loading Dataset with Necessary columns 

In [118]:
df = pd.read_csv("loan_data.csv")
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [119]:
df.drop(columns=["Loan_ID"],inplace=True)

In [120]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## Target and Features Loading 

In [121]:
X = df.drop(columns=["Loan_Status"])
y = df["Loan_Status"].map({"Y":1,"N":0})


In [122]:
y.head()

0    1
1    0
2    1
3    1
4    1
Name: Loan_Status, dtype: int64

## Auto Detect column types

In [123]:
num_cols = X.select_dtypes(include=["int64","float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

## Preprocessing 

In [124]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num",Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]),num_cols),
        ("cat",Pipeline([
            ("imputer",SimpleImputer(strategy="most_frequent")),
            ("encoder", OneHotEncoder(handle_unknown="ignore"))
        ]), cat_cols)
    ]
)

## Models && Hyperparameters

In [125]:
models = {
    "LogisticRegression":{
        "model": LogisticRegression(max_iter=500),
        "params" : {"model__C":[0.01,0.1,1,10]}
    },
    "RandomForest": {
        "model": RandomForestClassifier(),
        "params": {"model__n_estimators":[50,100,200],"model__max_depth":[None,5,10]}
    },
    "GradientBoosting": {
        "model": GradientBoostingClassifier(),
        "params": {"model__n_estimators":[50,100,200],"model__learning_rate":[0.01,0.1,0.2]}
    },
    "SVC": {
        "model": SVC(probability=True),
        "params": {"model__C":[0.1,1,10],"model__kernel":["linear","rbf"]}
    },
    "DecisionTree":{
        "model":DecisionTreeClassifier(),
        "params": {"model__max_depth":[None,5,10]}
    },
    "KNN": {
        "model": KNeighborsClassifier(),
        "params": {"model__n_neighbors":[3,5,7]}
    }
    
}

## Cross Validataion

In [126]:
cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
scoring = ["accuracy","precision","recall","f1","roc_auc"]

results = []
best_models = {}
best_params_dict = {}
for name, config in models.items():
    print(f"\n🔍Tuning {name}...")

    pipe = Pipeline([
        ("preprocessor",preprocessor),
        ("model", config["model"])
    ])

    search = RandomizedSearchCV(
        pipe, config["params"],
        n_iter=5,cv=cv,scoring="f1",
        n_jobs=-1,random_state=42

    )
    search.fit(X,y)

    #Save the best model & params
    best_model = search.best_estimator_
    best_models[name] = best_model
    best_params_dict[name] = search.best_params_

    # Cross-validate best model
    scores = cross_validate(best_model,X,y,cv=cv,scoring=scoring)

    results.append({
        "Model": name,
        "Best Params": search.best_params_,
        "Accuracy": np.mean(scores["test_accuracy"]),
        "Precision": np.mean(scores["test_precision"]),
        "Recall": np.mean(scores["test_recall"]),
        "F1": np.mean(scores["test_f1"]),
        "ROC-AUC": np.mean(scores["test_roc_auc"]),
    })



🔍Tuning LogisticRegression...

🔍Tuning RandomForest...

🔍Tuning GradientBoosting...

🔍Tuning SVC...

🔍Tuning DecisionTree...

🔍Tuning KNN...


## Display results

In [127]:
results_df = pd.DataFrame(results)
results_df.head()

Unnamed: 0,Model,Best Params,Accuracy,Precision,Recall,F1,ROC-AUC
0,LogisticRegression,{'model__C': 0.01},0.811102,0.792205,0.983417,0.877443,0.751064
1,RandomForest,"{'model__n_estimators': 200, 'model__max_depth...",0.807837,0.791383,0.978655,0.875045,0.754345
2,GradientBoosting,"{'model__n_estimators': 100, 'model__learning_...",0.804585,0.787201,0.981064,0.873459,0.717574
3,SVC,"{'model__kernel': 'linear', 'model__C': 0.1}",0.809476,0.790695,0.983417,0.876517,0.705772
4,DecisionTree,{'model__max_depth': 5},0.781807,0.777122,0.957311,0.857774,0.678637


In [128]:
results_df = results_df.sort_values(by="F1",ascending=False)
print("\n📊 Model Comparison:\n", results_df)


📊 Model Comparison:
                 Model                                        Best Params  \
0  LogisticRegression                                 {'model__C': 0.01}   
3                 SVC       {'model__kernel': 'linear', 'model__C': 0.1}   
1        RandomForest  {'model__n_estimators': 200, 'model__max_depth...   
2    GradientBoosting  {'model__n_estimators': 100, 'model__learning_...   
4        DecisionTree                            {'model__max_depth': 5}   
5                 KNN                          {'model__n_neighbors': 7}   

   Accuracy  Precision    Recall        F1   ROC-AUC  
0  0.811102   0.792205  0.983417  0.877443  0.751064  
3  0.809476   0.790695  0.983417  0.876517  0.705772  
1  0.807837   0.791383  0.978655  0.875045  0.754345  
2  0.804585   0.787201  0.981064  0.873459  0.717574  
4  0.781807   0.777122  0.957311  0.857774  0.678637  
5  0.780168   0.783194  0.940784  0.854713  0.743730  


## Saving Best Model

In [129]:
best_model_name = results_df.iloc[0]["Model"]
final_model = best_models[best_model_name]
joblib.dump(final_model,"best_model.joblib")

print(f"\n✅ Best model '{best_model_name}' saved as best_model.joblib")
print(f"📝 Best hyperparameters (in memory): {best_params_dict[best_model_name]}")

# Save model comparison results
results_df.to_csv("model_results.csv", index=False)




✅ Best model 'LogisticRegression' saved as best_model.joblib
📝 Best hyperparameters (in memory): {'model__C': 0.01}
