In [1]:
#Import every library needed
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
import joblib

# Load the CSV file containing the loan dataset
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact location exists — consider a path relative to the notebook.
ruta_csv = r'C:\Users\Usuario\Documents\Formación\Code\Python\Aplicaciones propias\Loan Predictor\loan_approval_dataset.csv'
df = pd.read_csv(ruta_csv)

# Print general information about the dataset structure. df.info() writes its
# report to stdout and returns None, so it is called as its own statement instead
# of being bundled into a tuple (which would display as "(None, <frame>)").
df.info()

# Show the first few rows as the cell's displayed result
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


(None,
    loan_id   no_of_dependents   education      self_employed   income_annum  \
 0        1                  2   Graduate       No                   9600000   
 1        2                  0   Not Graduate   Yes                  4100000   
 2        3                  3   Graduate       No                   9100000   
 3        4                  3   Graduate       No                   8200000   
 4        5                  5   Not Graduate   Yes                  9800000   
 
     loan_amount   loan_term   cibil_score   residential_assets_value  \
 0      29900000          12           778                    2400000   
 1      12200000           8           417                    2700000   
 2      29700000          20           506                    7100000   
 3      30700000           8           467                   18200000   
 4      24200000          20           382                   12400000   
 
     commercial_assets_value   luxury_assets_value   bank_asset_value  

In [2]:
# Make a copy of the original DataFrame for processing so `df` stays untouched
df_processed = df.copy()

# Clean column names by removing leading/trailing spaces. This is done before any
# column is referenced by name so that every lookup below uses the clean names.
df_processed.columns = df_processed.columns.str.strip()

# Remove the unique identifier column (not useful for prediction).
# Reassigning instead of using inplace=True keeps the cell safe to re-run.
df_processed = df_processed.drop(columns=['loan_id'])

# Convert monetary amounts from cents to dollars.
# NOTE(review): the "cents" unit is taken from the original comment — confirm
# against the data source.
# Every monetary column is listed (including commercial_assets_value) so that all
# of them share one unit; inputs passed to `scaler` at prediction time must use
# the same converted unit.
columns_to_convert = [
    'income_annum',
    'loan_amount',
    'residential_assets_value',
    'commercial_assets_value',
    'luxury_assets_value',
    'bank_asset_value'
]
df_processed[columns_to_convert] = df_processed[columns_to_convert] / 100

# Encode categorical variables using one-hot encoding.
# drop_first=True keeps one indicator column per two-level feature.
df_processed = pd.get_dummies(df_processed, columns=['education', 'self_employed'], drop_first=True)

# Encode the target variable (loan_status) into integer codes.
# `le` is kept so predictions can later be mapped back to the original labels.
le = LabelEncoder()
df_processed['loan_status'] = le.fit_transform(df_processed['loan_status'])

# Split features and target
X = df_processed.drop(columns=['loan_status'])
y = df_processed['loan_status']

# Split data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numerical features to normalize the input values.
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate classifiers to benchmark, all with default hyperparameters
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "SVM": SVC(random_state=42),
}

# Fit each candidate on the scaled training split, predict the scaled test split,
# and keep the full classification report (as a nested dict) under the model name
results = {}
for name, model in models.items():
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    report = classification_report(y_test, y_pred, output_dict=True)
    results[name] = report

# One summary row per model: overall accuracy plus the metrics reported for the
# class encoded as 1 by the label encoder
summary = pd.DataFrame(
    [
        {
            "Accuracy": model_report["accuracy"],
            "Precision (1)": model_report["1"]["precision"],
            "Recall (1)": model_report["1"]["recall"],
            "F1-Score (1)": model_report["1"]["f1-score"],
        }
        for model_report in results.values()
    ],
    index=list(results),
)

# Display the summary results
print(summary)


                     Accuracy  Precision (1)  Recall (1)  F1-Score (1)
Random Forest        0.975410       0.977492    0.955975      0.966614
Logistic Regression  0.905152       0.878594    0.864780      0.871632
SVM                  0.923888       0.882175    0.918239      0.899846


In [3]:

# Untuned estimators that serve as the starting point for each grid search
base_models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "SVM": SVC(random_state=42),
}

# Hyperparameter values to try for each estimator (keys match `base_models`)
param_grids = {
    "Random Forest": {
        "n_estimators": [100, 200],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5],
    },
    "Logistic Regression": {
        "C": [0.1, 1.0, 10.0],
        "penalty": ["l2"],
        "solver": ["lbfgs"],
    },
    "SVM": {
        "C": [0.1, 1.0, 10.0],
        "kernel": ["linear", "rbf"],
    },
}

# Run a 5-fold, accuracy-scored grid search per estimator on the scaled training
# split, keeping the refitted best estimator and its mean cross-validation score
best_models, best_scores = {}, {}
for name, estimator in base_models.items():
    grid = GridSearchCV(estimator, param_grid=param_grids[name], scoring='accuracy', cv=5)
    grid.fit(X_train_scaled, y_train)
    best_models[name], best_scores[name] = grid.best_estimator_, grid.best_score_

# Display the cross-validation scores together with the complete parameter set
# of each winning estimator
(
    best_scores,
    {label: tuned.get_params() for label, tuned in best_models.items()},
)


({'Random Forest': np.float64(0.9777452415812592),
  'Logistic Regression': np.float64(0.9191800878477308),
  'SVM': np.float64(0.9484626647144948)},
 {'Random Forest': {'bootstrap': True,
   'ccp_alpha': 0.0,
   'class_weight': None,
   'criterion': 'gini',
   'max_depth': None,
   'max_features': 'sqrt',
   'max_leaf_nodes': None,
   'max_samples': None,
   'min_impurity_decrease': 0.0,
   'min_samples_leaf': 1,
   'min_samples_split': 5,
   'min_weight_fraction_leaf': 0.0,
   'monotonic_cst': None,
   'n_estimators': 200,
   'n_jobs': None,
   'oob_score': False,
   'random_state': 42,
   'verbose': 0,
   'warm_start': False},
  'Logistic Regression': {'C': 0.1,
   'class_weight': None,
   'dual': False,
   'fit_intercept': True,
   'intercept_scaling': 1,
   'l1_ratio': None,
   'max_iter': 1000,
   'multi_class': 'deprecated',
   'n_jobs': None,
   'penalty': 'l2',
   'random_state': 42,
   'solver': 'lbfgs',
   'tol': 0.0001,
   'verbose': 0,
   'warm_start': False},
  'SVM': {'C

In [4]:

# Members of the ensemble: the best-tuned estimator of each family from Grid Search
ensemble_members = [
    ('rf', best_models['Random Forest']),
    ('lr', best_models['Logistic Regression']),
    ('svm', best_models['SVM']),
]

# Majority-vote ensemble; voting can be set to 'soft' if all models support predict_proba
voting_clf = VotingClassifier(estimators=ensemble_members, voting='hard')

# Fit the ensemble on the scaled training split, then predict the scaled test split
voting_clf.fit(X_train_scaled, y_train)
y_pred_voting = voting_clf.predict(X_test_scaled)

# Full classification report for the ensemble, as a nested dict
report_voting = classification_report(y_test, y_pred_voting, output_dict=True)

# Single-row summary: overall accuracy plus the metrics reported for the class
# encoded as 1 by the label encoder
ensemble_summary = pd.DataFrame(
    [
        {
            "Accuracy": report_voting["accuracy"],
            "Precision (1)": report_voting["1"]["precision"],
            "Recall (1)": report_voting["1"]["recall"],
            "F1-Score (1)": report_voting["1"]["f1-score"],
        }
    ],
    index=["Voting Classifier"],
)

# Display the evaluation results
print(ensemble_summary)


                   Accuracy  Precision (1)  Recall (1)  F1-Score (1)
Voting Classifier  0.959016       0.949206    0.940252      0.944708


In [5]:
# Retrain the final ensemble model on the entire dataset to maximize available training information.
# Note: this also refits `scaler` in place on all rows, so after this cell the
# scaler no longer reflects the training split only.
voting_clf.fit(scaler.fit_transform(X), y)

# Save the trained ensemble model to a file for future use
joblib.dump(voting_clf, "loan_approval_model.pkl")

# Save the fitted label encoder as well: the model outputs integer codes, and the
# encoder is required to map them back to the original loan_status labels when the
# model is loaded outside this notebook session.
joblib.dump(le, "label_encoder.pkl")

# Save the fitted scaler to ensure consistent preprocessing during prediction
# (kept as the last statement so the cell still displays ['scaler.pkl'])
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [6]:
def predict_loan(data, model, scaler, feature_columns=None, label_encoder=None):
    """
    Receives a dictionary with loan application data, processes it, and returns the prediction.

    The input is one-hot encoded, aligned to the column layout used at training
    time, scaled, and passed to the model. Dummy columns are matched to the
    training columns ignoring whitespace, so a category that differs from the
    training spelling only by spaces is not silently treated as the baseline.

    Parameters:
        data (dict): Dictionary with input data for a single loan application.
            Values are used as given: no unit conversion is applied here, so
            numeric inputs must be in the same units the scaler was fitted on.
        model: Trained machine learning model (e.g., voting ensemble).
        scaler: Pre-fitted scaler for numeric feature normalization.
        feature_columns (sequence of str, optional): Ordered training feature
            names. Defaults to ``scaler.feature_names_in_`` when the scaler has
            it, otherwise to the notebook-level ``X.columns``.
        label_encoder (optional): Fitted encoder whose ``inverse_transform``
            maps predicted codes back to labels. Defaults to the notebook-level
            ``le``.

    Returns:
        The predicted loan status label, exactly as stored in the label encoder.
        NOTE(review): presumably "Approved"/"Rejected", possibly with surrounding
        whitespace if the raw CSV values carry it — confirm.

    Warns:
        UserWarning: if a training feature is absent from ``data`` and is not a
            dummy column of a categorical key that was provided; such features
            are filled with 0 before scaling.
    """
    import warnings

    def _canon(name):
        # Whitespace-insensitive form of a column name, used only for matching
        return "".join(str(name).split())

    # Resolve the training column layout without requiring notebook globals
    if feature_columns is None:
        feature_columns = getattr(scaler, "feature_names_in_", None)
    if feature_columns is None:
        feature_columns = X.columns
    feature_columns = list(feature_columns)

    if label_encoder is None:
        label_encoder = le

    # Convert the input dictionary to a one-row DataFrame and one-hot encode
    # its categorical (string) values
    new_df = pd.get_dummies(pd.DataFrame([data]))

    # Rename produced columns to the exact spelling used during training when
    # they differ only by whitespace (e.g. "education_Not Graduate" versus
    # "education_ Not Graduate")
    training_spelling = {_canon(col): col for col in feature_columns}
    new_df = new_df.rename(columns=lambda col: training_spelling.get(_canon(col), col))

    # A training column may be legitimately absent when it is the dummy of a
    # category other than the one supplied; anything else absent is a feature
    # the caller did not provide
    categorical_prefixes = tuple(
        _canon(key) + "_" for key, value in data.items() if isinstance(value, str)
    )
    missing = [
        col for col in feature_columns
        if col not in new_df.columns
        and not _canon(col).startswith(categorical_prefixes)
    ]
    if missing:
        warnings.warn(
            "predict_loan: input is missing feature(s) %s; they are filled with 0"
            % ", ".join(repr(str(col)) for col in missing),
            stacklevel=2,
        )

    # Ensure the input has the same columns, in the same order, as the training data
    new_df = new_df.reindex(columns=feature_columns, fill_value=0)

    # Scale the features with the pre-fitted scaler
    scaled = scaler.transform(new_df)

    # Generate the prediction
    result = model.predict(scaled)

    # Convert numeric prediction back to original label
    return label_encoder.inverse_transform(result)[0]
