In [6]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the Kaggle loan training set (only the training CSV is read here)
df = pd.read_csv(filepath_or_buffer="loan-train.csv")

In [None]:
# Check for null values: info() prints each column's dtype and non-null count,
# so any column whose count is below the total number of entries has missing data.
df.info()

In [None]:
# Clean the training data by imputing missing values:
#   - categorical / discrete columns get their most common value (mode)
#   - LoanAmount gets its median
df = df.copy()  # work on a copy so the assignments below never write into a view

mode_filled_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'Loan_Amount_Term',
    'Credit_History',
]
for column in mode_filled_columns:
    # mode() returns a Series of the most frequent values; [0] picks the first one
    df[column] = df[column].fillna(df[column].mode()[0])

# median() skips NaN by default, so it is computed from the observed amounts only
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())

In [None]:
# Check again for null values after imputation: compare each column's
# non-null count in the info() output with the total number of entries.
df.info()

In [None]:
# Encode the target label as integers: "Y" -> 1, "N" -> 0
# (any value that is not a key of the mapping becomes NaN under Series.map)
status_to_int = {'Y': 1, 'N': 0}
df['Loan_Status'] = df['Loan_Status'].map(status_to_int)

In [None]:
# One-hot encode the categorical features; drop_first=True drops the first
# level of each feature so the resulting indicator columns are not redundant.
df = pd.get_dummies(
    data=df,
    columns=[
        'Gender',
        'Married',
        'Education',
        'Self_Employed',
        'Property_Area',
        'Dependents',
    ],
    drop_first=True,
)

In [None]:
# Drop the Loan_ID column so it is not passed to the model as a feature.
# Rebinding `df` (instead of inplace=True) leaves the previous frame object untouched.
df = df.drop(columns=['Loan_ID'])

In [None]:
# Separate the target (Loan_Status) from the feature columns
y = df['Loan_Status']
X = df.drop(columns=['Loan_Status'])

In [None]:
# Step 5: Hold out 20% of the rows for validation (fixed seed for a reproducible split)
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Step 6: Scale the data
# The scaler is fitted on the training split only; the validation split is
# transformed with those same statistics so no validation data leaks into the fit.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

In [None]:
# Step 7: Fit a logistic-regression classifier on the scaled training features
# (max_iter is set to 1000 iterations for the solver)
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Step 8: Predict labels for the scaled validation features
y_pred = model.predict(X=X_valid_scaled)

In [None]:
# Plot the confusion matrix as an annotated heatmap (rows: actual, columns: predicted)
validation_confusion = confusion_matrix(y_valid, y_pred)
ax = sns.heatmap(validation_confusion, annot=True, fmt='d', cmap='Blues')
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Print the classification report for the validation predictions under a header line
print(
    "\nClassification Report:",
    classification_report(y_valid, y_pred),
    sep="\n",
)

In [None]:
# Step 10: Show up to the first 5 validation rows the model got wrong.
# Indices are positions within the validation split, hence .iloc below.
misclassified_indices = np.flatnonzero(y_pred != y_valid)[:5]
true_labels = y_valid.iloc[misclassified_indices].values
predicted_labels = y_pred[misclassified_indices]

print("\n5 Misclassified Samples:")
print(X_valid.iloc[misclassified_indices])
print("\nTrue Labels:", true_labels)
print("Predicted Labels:", predicted_labels)