In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_auc_score

In [2]:
# Load data
df = pd.read_csv("train_u6lujuX_CVtuZ9i.csv")

In [3]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB
None


In [4]:
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [5]:
df.drop(columns=["Loan_ID"], inplace=True)

In [6]:
# Separate target
target = df["Loan_Status"]
df.drop(columns=["Loan_Status"], inplace=True)

In [7]:
# Define column types
cat_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']

In [8]:
# Impute missing values
cat_imputer = SimpleImputer(strategy="most_frequent")
num_imputer = SimpleImputer(strategy="mean")

In [9]:
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])
df[num_cols] = num_imputer.fit_transform(df[num_cols])

In [10]:
df.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [11]:
# Feature engineering
df["Total_Income"] = df["ApplicantIncome"] + df["CoapplicantIncome"]
df["Income_Loan_Ratio"] = df["Total_Income"] / (df["LoanAmount"] + 1)

In [12]:
# Encode categoricals
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

In [13]:
# Encode target
target_le = LabelEncoder()
y = target_le.fit_transform(target)

In [14]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Total_Income,Income_Loan_Ratio
0,1,0,0,0,0,5849.0,0.0,146.412162,360.0,1.0,2,5849.0,39.677866
1,1,1,1,0,0,4583.0,1508.0,128.0,360.0,1.0,0,6091.0,47.217054
2,1,1,0,0,1,3000.0,0.0,66.0,360.0,1.0,2,3000.0,44.776119
3,1,1,0,1,0,2583.0,2358.0,120.0,360.0,1.0,2,4941.0,40.834711
4,1,0,0,0,0,6000.0,0.0,141.0,360.0,1.0,2,6000.0,42.253521


In [15]:
# Save target encoder
joblib.dump(target_le, "target_encoder.pkl")

['target_encoder.pkl']

In [16]:
# Save encoders
joblib.dump(encoders,"encoders.pkl")

['encoders.pkl']

In [17]:
# Prepare model input
X = df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# -----------------------------
# Model 1: Logistic Regression
# -----------------------------


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


log_reg = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42)
log_reg.fit(X_train_scaled, y_train)
y_pred_log = log_reg.predict(X_test_scaled)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log))
print("\nClassification Report:\n", classification_report(y_test, y_pred_log))
print("Accuracy:", accuracy_score(y_test, y_pred_log))
print("roc-auc-score:", roc_auc_score(y_test, log_reg.predict_proba(X_test_scaled)[:,1]))


Confusion Matrix:
 [[20 23]
 [ 7 73]]

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.47      0.57        43
           1       0.76      0.91      0.83        80

    accuracy                           0.76       123
   macro avg       0.75      0.69      0.70       123
weighted avg       0.75      0.76      0.74       123

Accuracy: 0.7560975609756098
roc-auc-score: 0.7308139534883721


In [24]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))
print("\nClassification Report:\n", classification_report(y_test, y_pred_knn))
print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print("roc-auc-score:", roc_auc_score(y_test, knn.predict_proba(X_test)[:,1]))


Confusion Matrix:
 [[ 5 38]
 [15 65]]

Classification Report:
               precision    recall  f1-score   support

           0       0.25      0.12      0.16        43
           1       0.63      0.81      0.71        80

    accuracy                           0.57       123
   macro avg       0.44      0.46      0.43       123
weighted avg       0.50      0.57      0.52       123

Accuracy: 0.5691056910569106
roc-auc-score: 0.5068313953488371


In [25]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Scale data for SVM
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svm = SVC(kernel='rbf', class_weight='balanced', random_state=42)
svm.fit(X_train_scaled, y_train)
y_pred_svm = svm.predict(X_test_scaled)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("roc-auc-score:", roc_auc_score(y_test, svm.decision_function(X_test_scaled)))



Confusion Matrix:
 [[18 25]
 [ 6 74]]

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.42      0.54        43
           1       0.75      0.93      0.83        80

    accuracy                           0.75       123
   macro avg       0.75      0.67      0.68       123
weighted avg       0.75      0.75      0.73       123

Accuracy: 0.7479674796747967
roc-auc-score: 0.7380813953488372


In [26]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("roc-auc-score:", roc_auc_score(y_test, rf.predict_proba(X_test)[:,1]))




Confusion Matrix:
 [[21 22]
 [ 6 74]]

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.49      0.60        43
           1       0.77      0.93      0.84        80

    accuracy                           0.77       123
   macro avg       0.77      0.71      0.72       123
weighted avg       0.77      0.77      0.76       123

Accuracy: 0.7723577235772358
roc-auc-score: 0.770203488372093


In [None]:
# Model Comparison
# Logistic Regression
log_acc = accuracy_score(y_test, y_pred_log)
log_prec = precision_score(y_test, y_pred_log)
log_rec = recall_score(y_test, y_pred_log)
log_f1 = f1_score(y_test, y_pred_log)
log_auc = roc_auc_score(y_test, log_reg.predict_proba(X_test_scaled)[:,1])

# KNN
knn_acc = accuracy_score(y_test, y_pred_knn)
knn_prec = precision_score(y_test, y_pred_knn)
knn_rec = recall_score(y_test, y_pred_knn)
knn_f1 = f1_score(y_test, y_pred_knn)
knn_auc = roc_auc_score(y_test, knn.predict_proba(X_test)[:,1])

# SVM
svm_acc = accuracy_score(y_test, y_pred_svm)
svm_prec = precision_score(y_test, y_pred_svm)
svm_rec = recall_score(y_test, y_pred_svm)
svm_f1 = f1_score(y_test, y_pred_svm)
svm_auc = roc_auc_score(y_test, svm.decision_function(X_test_scaled))

# Random Forest
rf_acc = accuracy_score(y_test, y_pred_rf)
rf_prec = precision_score(y_test, y_pred_rf)
rf_rec = recall_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf)
rf_auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:,1])

# Put results in DataFrame
results_df = pd.DataFrame({
    "Model": ["Logistic Regression", "KNN", "SVM", "Random Forest"],
    "Accuracy": [log_acc, knn_acc, svm_acc, rf_acc],
    "Precision": [log_prec, knn_prec, svm_prec, rf_prec],
    "Recall": [log_rec, knn_rec, svm_rec, rf_rec],
    "F1 Score": [log_f1, knn_f1, svm_f1, rf_f1],
    "ROC AUC": [log_auc, knn_auc, svm_auc, rf_auc]
})

print(results_df)

                 Model  Accuracy  Precision  Recall  F1 Score   ROC AUC
0  Logistic Regression  0.756098   0.760417  0.9125  0.829545  0.730814
1                  KNN  0.569106   0.631068  0.8125  0.710383  0.506831
2                  SVM  0.747967   0.747475  0.9250  0.826816  0.738081
3        Random Forest  0.772358   0.770833  0.9250  0.840909  0.770203


In [29]:
# Pick best model (by F1 Score, then ROC AUC)
best_idx = results_df.sort_values(by=["F1 Score","ROC AUC","Accuracy"], ascending=False).index[0]
best_model_name = results_df.iloc[best_idx]["Model"]
print(f"\nBest Model: {best_model_name}")


Best Model: Random Forest


In [30]:
# Save the best model
if best_model_name == "Logistic Regression":
    joblib.dump(log_reg, "best_model.pkl")
elif best_model_name == "KNN":
    joblib.dump(knn, "best_model.pkl")
elif best_model_name == "SVM":
    joblib.dump(svm, "best_model.pkl")
else:
    joblib.dump(rf, "best_model.pkl")

print("✅ Best model saved as best_model.pkl")

✅ Best model saved as best_model.pkl
