In [1]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

In [2]:
import pandas as pd
import os

# Locate the project's data directory relative to the current working
# directory. The path is assembled with os.path.join so the notebook is not
# tied to Windows-style "\\" separators.
PROJECT_DIR_NAME = "loan_approval_prediction"
project_parent = os.getcwd().split(PROJECT_DIR_NAME)[0]

# The trailing "" makes join() end the path with a separator, so
# `data_folder + <file name>` keeps working for any cell that relies on it.
data_folder = os.path.join(project_parent, PROJECT_DIR_NAME, "data", "")
df = pd.read_csv(os.path.join(data_folder, 'credit_risk_dataset.csv'))


# Data Cleaning

## Remove Duplicates 

In [None]:
# Boolean mask: True for every row that repeats an earlier row exactly.
duplicates = df.duplicated(keep='first')

# Summing the mask counts the True entries, i.e. the repeated rows.
num_duplicates = duplicates.sum()
print('no. duplicates: ', num_duplicates)

# Keep only the first occurrence of each repeated row.
df = df.drop_duplicates(keep='first')

## Impute missing values and change data types as appropriate - not including categorical data, yet

In [4]:
# Fill missing values in person_emp_length and loan_int_rate with -1, and add a
# 'missing indicator' column next to each one to explicitly call out the rows
# whose value was missing.
for value_col, flag_col in (
    ('person_emp_length', 'missing_emp_length'),
    ('loan_int_rate', 'missing_loan_rate'),
):
    df[value_col] = df[value_col].fillna(-1)
    df[flag_col] = (df[value_col] == -1).astype(int)

# Narrower dtypes for the numeric columns (categorical columns are handled
# later in the notebook).
compact_dtypes = {
    'person_age': 'uint8',
    'person_income': 'uint32',
    'loan_amnt': 'uint32',
    'loan_int_rate': 'float32',
    'loan_status': 'uint8',
    'loan_percent_income': 'float32',
    'cb_person_cred_hist_length': 'uint8',
    'missing_emp_length': 'uint8',
    'missing_loan_rate': 'uint8',
}
for column_name, compact_dtype in compact_dtypes.items():
    df[column_name] = df[column_name].astype(compact_dtype)

In [None]:
# Print column dtypes, non-null counts and the memory footprint of `df`
# after the dtype changes made earlier in the notebook.
# Results show a saving in memory of > 0.9 MB (figure recorded by the author;
# re-check it against this output after a fresh run).
# This step is more relevant to large datasets and pipelines.
df.info()

# Scaling & One-Hot Encoding

In [None]:
# Continuous variables are rescaled with min-max scaling; the author chose it
# because the underlying data is not normally distributed.
from sklearn.preprocessing import MinMaxScaler

columns_to_scale = ['person_age', 'person_income', 'loan_amnt']

min_max_scaler = MinMaxScaler()
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# train/test split performed later in the notebook, so test rows influence the
# scaling range - consider fitting on the training rows only.
df[columns_to_scale] = min_max_scaler.fit_transform(df[columns_to_scale])
df.sample(5)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Columns holding text categories that need to become 0/1 indicator columns.
categorical_columns = ['person_home_ownership', 'loan_intent', 'loan_grade']

# drop='first' omits one level per feature; sparse_output=False returns a
# dense array that can be wrapped in a DataFrame directly.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_data = encoder.fit_transform(df[categorical_columns])
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(categorical_columns),
)

# Both halves get a fresh 0..n-1 index before being placed side by side, so
# rows are matched by position rather than by the original index labels.
remaining_columns = df.drop(columns=categorical_columns).reset_index(drop=True)
df_final = pd.concat([remaining_columns, encoded_df.reset_index(drop=True)], axis=1)

df_final.sample(5)

In [8]:
# df_final = df_final[~df_final["person_age"].isna()]

In [None]:
# Sanity check: difference in row count between the cleaned frame and the
# encoded frame.
len(df) - len(df_final)

In [None]:
# Converting Ys and Ns to 1s and 0s.
# An explicit lookup table plus a final cast guarantees a numeric dtype
# (Series.replace on a string column depends on implicit downcasting, which
# newer pandas versions deprecate). The 1 -> 1 and 0 -> 0 entries make the
# cell safe to re-run; any other value becomes NaN and makes the cast fail
# loudly instead of slipping into the models.
default_flag_codes = {'Y': 1, 'N': 0, 1: 1, 0: 0}
df_final['cb_person_default_on_file'] = (
    df_final['cb_person_default_on_file'].map(default_flag_codes).astype('uint8')
)
df_final.head()

In [None]:
# Mark rows whose person_emp_length / loan_int_rate was missing.
#
# Missing values in both columns were already replaced by -1 earlier in the
# notebook, so checking isnull() alone finds nothing here and would reset
# 'missing_emp_length' to 0 for every row. A value therefore counts as missing
# when it is NaN *or* equal to the -1 sentinel.
MISSING_SENTINEL = -1

emp_length_missing = (
    df_final['person_emp_length'].isnull()
    | (df_final['person_emp_length'] == MISSING_SENTINEL)
)
int_rate_missing = (
    df_final['loan_int_rate'].isnull()
    | (df_final['loan_int_rate'] == MISSING_SENTINEL)
    # Also mark loans with 0 interest as a missing value
    | (df_final['loan_int_rate'] == 0)
)

df_final['missing_emp_length'] = emp_length_missing.astype('uint8')
# NOTE(review): 'missing_int_rate' largely duplicates the 'missing_loan_rate'
# column created during cleaning - consider keeping only one of them.
df_final['missing_int_rate'] = int_rate_missing.astype('uint8')

# Let NaNs equal -1 to indicate missing value (a no-op when they were already
# imputed; kept so this cell also works on un-imputed data).
df_final['person_emp_length'] = df_final['person_emp_length'].fillna(MISSING_SENTINEL)
df_final['loan_int_rate'] = df_final['loan_int_rate'].fillna(MISSING_SENTINEL)

df_final.info()

In [None]:
# Print the column dtypes, non-null counts and memory usage of the encoded frame.
df_final.info()

In [None]:
# Show every row that still has at least one missing value.
has_missing_value = df_final.isna().any(axis='columns')
df_final.loc[has_missing_value]

In [None]:
# Pairwise correlations between the columns of the encoded frame.
# numeric_only=True keeps the call from raising if a non-numeric column is
# still present in df_final.
correlation_matrix = df_final.corr(numeric_only=True)

# Plot heatmap. The figure grows with the number of columns so the per-cell
# annotations stay legible on the wide, one-hot encoded frame.
n_features = len(correlation_matrix.columns)
fig_width = max(10, 0.6 * n_features)
fig, ax = plt.subplots(figsize=(fig_width, 0.8 * fig_width))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    annot_kws={'size': 7},
    ax=ax,
)
ax.set_title('Correlation matrix of df_final')
plt.show()


In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Fit an ordinary-least-squares model of loan_status on three continuous
# predictors and print its type-2 ANOVA table.
# NOTE(review): loan_status is cast to uint8 earlier and appears to be a 0/1
# label, so this is a linear model on a binary target - confirm that is intended.
formula = 'loan_status ~ person_income + loan_int_rate + loan_percent_income'
model = ols(formula, data=df_final).fit()

anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)

# Columns noted by the author:
# 'loan_int_rate', 'loan_percent_income', 'person_home_ownership_RENT', 'person_income'

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Prepare the data (features and labels)
X = df_final.drop(columns='loan_status')  # Features: every column except the target
y = df_final['loan_status']               # Target label

# Step 2: Split the data into training and testing sets (70% / 30%).
# stratify=y keeps the proportion of each loan_status class the same in both
# splits, so the test metrics are not skewed by an unlucky draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Step 3: Create and train the Logistic Regression model.
# No ColumnTransformer is needed here: scaling and one-hot encoding were
# already applied to df_final in the preprocessing cells.
classifier = LogisticRegression(solver='liblinear')  # 'liblinear' is a good choice for smaller datasets
classifier.fit(X_train, y_train)

# Step 4: Make predictions on the held-out test set
y_pred = classifier.predict(X_test)

# Step 5: Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Logistic regression with scikit-learn's default settings.
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows and score them.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Counts of actual vs. predicted classes.
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Support-vector classifier; with no arguments SVC uses an RBF kernel.
model = SVC()
model.fit(X_train, y_train)

# Predictions for the held-out rows and the overall hit rate.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Detailed metrics: each heading is printed, followed by the metric's output.
for heading, metric in (
    ("\nClassification Report:", classification_report),
    ("\nConfusion Matrix:", confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred))


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Create and train the Decision Tree model.
# random_state is fixed (same seed as the train/test split) so the tree - and
# therefore every metric below - is identical on each run of the notebook.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 2: Make predictions on the test set
y_pred = model.predict(X_test)

# Step 3: Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Optional: Print detailed evaluation metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Create and train the Random Forest model.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature draws - and therefore the reported metrics - are
# reproducible across runs.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 2: Make predictions on the test set
y_pred = model.predict(X_test)

# Step 3: Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Optional: Print detailed evaluation metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


In [21]:
# !pip uninstall  numpy

In [22]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# # Define the parameter grid
# param_grid = {
#     'n_estimators': [50, 100, 200],        # Number of trees
#     'max_depth': [None, 10, 20, 30],        # Depth of trees
#     'min_samples_split': [2, 5, 10],        # Minimum samples to split a node
#     'min_samples_leaf': [1, 2, 4],          # Minimum samples per leaf
#     'criterion': ['gini', 'entropy']       # Splitting criteria
# }

# # Create the Random Forest model
# rf = RandomForestClassifier(random_state=42)

# # Perform Grid Search
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
#                            cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# # Fit the grid search to the training data
# grid_search.fit(X_train, y_train)

# # Best parameters from Grid Search
# print("Best Parameters from GridSearchCV:")
# print(grid_search.best_params_)

# # Train the model with the best parameters
# best_rf_model = grid_search.best_estimator_

# # Evaluate the tuned model on the test set
# y_pred_best = best_rf_model.predict(X_test)
# best_accuracy = accuracy_score(y_test, y_pred_best)
# print(f"Random Forest Accuracy (with best parameters): {best_accuracy:.2f}")

# print("\nClassification Report (Best Model):")
# print(classification_report(y_test, y_pred_best))

# print("\nConfusion Matrix (Best Model):")
# print(confusion_matrix(y_test, y_pred_best))



In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Gradient-boosted trees (XGBoost) with the library's default hyperparameters.
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)

# Predict the held-out rows.
xgb_y_pred = xgb_model.predict(X_test)

# Compute all three evaluation artefacts first, then report them together.
xgb_accuracy = accuracy_score(y_test, xgb_y_pred)
xgb_report = classification_report(y_test, xgb_y_pred)
xgb_confusion = confusion_matrix(y_test, xgb_y_pred)

print("XGBoost Model Accuracy:", xgb_accuracy)
print("\nXGBoost Classification Report:\n", xgb_report)
print("\nXGBoost Confusion Matrix:\n", xgb_confusion)


In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier with default hyperparameters; fit() returns the fitted
# estimator, so it is built and trained in one expression.
lgb_model = LGBMClassifier().fit(X_train, y_train)

# Predictions for the held-out rows.
lgb_y_pred = lgb_model.predict(X_test)

# Overall accuracy, then per-class metrics and the confusion matrix.
lgb_accuracy = accuracy_score(y_test, lgb_y_pred)
print("LightGBM Model Accuracy:", lgb_accuracy)
print("\nLightGBM Classification Report:\n", classification_report(y_test, lgb_y_pred))
print("\nLightGBM Confusion Matrix:\n", confusion_matrix(y_test, lgb_y_pred))


In [None]:
# With Hyperparameter tuning using GridSearchCV
# Search space for the (commented-out) grid search further down.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

# Hard-coded hyperparameters used instead of re-running the search
# (presumably the result of an earlier grid-search run - confirm).
# The key must be spelled 'n_estimators': a misspelled key is not recognised
# as the tree-count setting, so the model would silently train with the
# default number of trees instead of 300.
best_params = {
    'colsample_bytree': 0.9,
    'learning_rate': 0.1,
    'max_depth': 5,
    'min_child_weight': 1,
    'n_estimators': 300,
    'subsample': 0.9
}

best_model = XGBClassifier(**best_params)
best_model.fit(X_train, y_train)

# Grid Search (left disabled; uncomment to re-run the search).
# The estimator must be an XGBClassifier because param_grid holds XGBoost
# parameters.
# grid_search = GridSearchCV(estimator=XGBClassifier(), param_grid=param_grid, scoring='accuracy', cv=3, verbose=2, n_jobs=1)

# grid_search.fit(X_train, y_train)

# best_params = grid_search.best_params_
# best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Best Hyperparameters:", best_params)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)