In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Train and evaluate several classifiers on the loan approval dataset.
#
# Steps: load the CSV, label-encode the categorical columns, split into
# train/test sets, standardize the features, then fit, persist and score
# each model. The fitted encoders, the scaler and every model are written
# to .pkl files in the current working directory.

# Load the dataset
df = pd.read_csv('loan_approval_dataset.csv')

# Strip any extra spaces from the column names
df.columns = df.columns.str.strip()

# Check the column names
print(df.columns)

# Data Preprocessing
# Fail fast when an expected column is absent. Merely printing a warning and
# carrying on would surface later as an unrelated error (a KeyError from
# `drop`, or the scaler choking on unencoded strings).
categorical_columns = ['education', 'self_employed', 'loan_status']
missing_columns = [col for col in categorical_columns if col not in df.columns]
if missing_columns:
    raise ValueError(
        "One or more columns are missing or incorrectly named: "
        f"{missing_columns}"
    )

# Encode categorical variables with one encoder per column. Re-fitting a
# single shared encoder would overwrite the earlier mappings, leaving no way
# to encode new inputs or decode predictions consistently.
label_encoders = {}
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# Keep the historical name bound to the target encoder (the last one fitted),
# so code that relies on `label_encoder` keeps working.
label_encoder = label_encoders['loan_status']

# Save the encoders alongside the scaler and models so the same preprocessing
# can be reproduced when the saved models are loaded.
joblib.dump(label_encoders, 'label_encoders.pkl')

# Define features and target variable
X = df.drop(columns=['loan_id', 'loan_status'])
y = df['loan_status']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the numerical features. The scaler is fitted on the training
# split only, and the test split is transformed with those same statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Save the scaler
joblib.dump(scaler, 'scaler.pkl')

# Model Building and Evaluation
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'KNN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(random_state=42),
}

for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Save the model to a .pkl file, e.g. 'random_forest_model.pkl'
    joblib.dump(model, f'{model_name.replace(" ", "_").lower()}_model.pkl')

    # Predictions
    y_pred = model.predict(X_test)

    # Evaluation
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f'--- {model_name} ---')
    print(f'Accuracy: {accuracy}')
    print('Classification Report:')
    print(report)
    print('\n')

Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status'],
      dtype='object')
--- Random Forest ---
Accuracy: 0.977751756440281
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       536
           1       0.98      0.96      0.97       318

    accuracy                           0.98       854
   macro avg       0.98      0.97      0.98       854
weighted avg       0.98      0.98      0.98       854



--- SVM ---
Accuracy: 0.9238875878220141
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.93      0.94       536
           1       0.88      0.92      0.90       318

    accuracy                           0.92       854
   macro avg       0.92      0.9