<a href="https://colab.research.google.com/github/Michael-HK/mlops-loan_approval_model/blob/main/loan_approval_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install dabl
!pip install imbalanced-learn

In [None]:
import warnings
from collections import Counter
from pickle import dump

import dabl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import RocCurveDisplay, confusion_matrix, roc_auc_score
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, train_test_split

warnings.filterwarnings('ignore')

In [None]:
%%javascript
// Replace the output area's scroll check with a function that always answers
// "no". JavaScript's boolean literal is lowercase `false`; `False` is an
// undefined identifier and throws a ReferenceError when the function runs.
IPython.OutputArea.prototype._should_scroll = function() {
    return false;
}

In [None]:
# Location of the training CSV (a Kaggle input path); change this single
# constant when running the notebook in another environment.
DATA_PATH = "/kaggle/input/loan-approval-prediction/Training Dataset.csv"

df = pd.read_csv(DATA_PATH)
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
#============= check shape ==================
# Bare last expression: displays the (rows, columns) tuple of the loaded frame.
df.shape

In [None]:
# Check for data types of the columns
# info() prints each column's dtype together with its non-null count.
df.info()

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
##==== check for nan ===========

# Number of missing values in each column.
df.isnull().sum()

In [None]:
##========== fill nan present in the whole datasets =================

# Numeric columns: impute missing values with the column mean.
# NOTE(review): Credit_History and Loan_Amount_Term look discrete; a mean fill
# introduces fractional values - confirm this is intended.
for col in ['LoanAmount', 'Credit_History', 'Loan_Amount_Term']:
  df[col] = df[col].fillna(df[col].mean())

# Categorical columns: impute with the most frequent value.
# The result is assigned back rather than using fillna(inplace=True) on a
# selected column: that form operates on a temporary object and leaves df
# unchanged when pandas Copy-on-Write is active.
for col in ['Gender', 'Married', 'Dependents', 'Self_Employed']:
  df[col] = df[col].fillna(df[col].value_counts().idxmax())

In [None]:
##========== replace N and Y with binary ==========
def Replace(a):
  """Encode a loan-status label as a binary integer.

  Parameters
  ----------
  a : object
      Raw label value.

  Returns
  -------
  int
      0 when ``a`` equals 'N', 1 for every other value.
  """
  if a == 'N':
    return 0
  else:
    return 1

# The target column is named 'Loan_Status' (capital L).
# Encode it only while it still holds the raw labels: Replace maps every value
# other than 'N' to 1, so re-running this cell on an already-encoded column
# would otherwise turn every row into 1.
if not pd.api.types.is_numeric_dtype(df['Loan_Status']):
  df['Loan_Status'] = df['Loan_Status'].apply(Replace)

In [None]:
#======== drop ID column ==================
# drop() returns a new frame, so the result has to be assigned back for the
# column to actually disappear; errors='ignore' keeps the cell re-runnable
# once the column is already gone.
df = df.drop(columns=['Loan_ID'], errors='ignore')
df.head()

**Initial data exploration**

In [None]:
# Use matplotlib's 'ggplot' style for the figures drawn from here on.
plt.style.use('ggplot')
# Suppress warnings emitted while plotting.
warnings.filterwarnings('ignore')
# dabl's automatic overview plots, with Loan_Status as the target column.
dabl.plot(df, target_col = 'Loan_Status')

**Data Analysis**

In [None]:
cat_var = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']

fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(15,15))

## create a pie chart for each categorical variable
# zip stops at the shorter sequence, so an extra variable can never index past
# the available axes.
for ax, var in zip(axs.flat, cat_var):
  # count the number of occurrences of each category
  cat_counts = df[var].value_counts()

  # draw the pie and title it
  ax.pie(cat_counts, labels=cat_counts.index, autopct='%1.1f%%', startangle=90)
  ax.set_title(f'{var} Distribution')

# adjust spacing
fig.tight_layout()

# show the plot
plt.show()

In [None]:
#==============create figure with subplots====================

# Object-typed columns to plot; the Loan_ID identifier is excluded if it is
# still present because it is not a category.
obj_cols = df.select_dtypes(include=['object']).columns.drop('Loan_ID', errors='ignore')

fig, axis = plt.subplots(nrows=3, ncols=3, figsize=(15,15))
# plt.subplots returns a 2-D array for a 3 x 3 grid; flatten it so that each
# column maps onto exactly one Axes.
axis = axis.flatten()

for ax, var in zip(axis, obj_cols):
  sns.countplot(x=var, hue='Loan_Status', data=df, ax=ax)
  ax.tick_params(axis='x', rotation=90)

# Hide the panels that did not receive a plot.
for ax in axis[len(obj_cols):]:
  ax.set_visible(False)

fig.tight_layout()

In [None]:
#========create figure with subplots========

# Object-typed columns to plot; the Loan_ID identifier is excluded if it is
# still present because it is not a category.
obj_cols = df.select_dtypes(include=['object']).columns.drop('Loan_ID', errors='ignore')

fig, axis = plt.subplots(nrows=3, ncols=3, figsize=(15,15))
# plt.subplots returns a 2-D array for a 3 x 3 grid; flatten it so that each
# column maps onto exactly one Axes.
axis = axis.flatten()

for ax, var in zip(axis, obj_cols):
  # multiple='fill' normalises every bar to 1, showing the share of each
  # Loan_Status value per category. The seaborn keyword is `element` and its
  # bar option is spelled 'bars'.
  sns.histplot(x=var, hue='Loan_Status', data=df, ax=ax, multiple='fill', kde=False, element='bars')
  ax.tick_params(axis='x', rotation=90)

# Hide the panels that did not receive a plot.
for ax in axis[len(obj_cols):]:
  ax.set_visible(False)

fig.tight_layout()

In [None]:
# Compare the distribution of each feature between Loan_Status 0 and 1.
# histplot(kde=True, stat='density') replaces the deprecated distplot; rows are
# selected with a single .loc call instead of chained indexing.
for col in ['ApplicantIncome', 'LoanAmount']:
  fig, ax = plt.subplots(figsize=(12,5))
  for status in (0, 1):
    sns.histplot(df.loc[df['Loan_Status'] == status, col], kde=True, stat='density',
                 label=f'Loan_Status={status}', ax=ax)
  ax.legend()
  plt.show()

In [None]:
# Continuous features whose spread is inspected with box plots.
con_var = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']

# 2 x 3 grid, flattened so each feature is paired with one panel
# (five features, so the last panel stays empty).
fig, grid = plt.subplots(nrows=2, ncols=3, figsize=(20,10))
axs = grid.flatten()

for panel, feature in zip(axs, con_var):
  sns.boxplot(data=df, x=feature, ax=panel)

fig.tight_layout()
plt.show()

In [None]:
# Continuous features compared across the two Loan_Status values.
con_var = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']

fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(20,10))
# Flatten the 2 x 3 grid so it can be indexed with a single integer.
axs = axs.flatten()

for i, var in enumerate(con_var):
  # One box per Loan_Status value on the x axis, feature values on the y axis.
  sns.boxplot(y=var, x='Loan_Status', data=df, ax=axs[i])

fig.tight_layout()
plt.show()

In [None]:
# Continuous features whose distribution shape is inspected with violin plots.
con_var = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']

# 2 x 3 grid, flattened so each feature is paired with one panel
# (five features, so the last panel stays empty).
fig, grid = plt.subplots(nrows=2, ncols=3, figsize=(20,10))
axs = grid.flatten()

for panel, feature in zip(axs, con_var):
  sns.violinplot(data=df, x=feature, ax=panel)

fig.tight_layout()
plt.show()

In [None]:
# Continuous features compared across the two Loan_Status values.
con_var = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']

fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(20,10))
# Flatten the 2 x 3 grid so it can be indexed with a single integer.
axs = axs.flatten()

for i, var in enumerate(con_var):
  # One violin per Loan_Status value on the x axis, feature values on the y axis.
  sns.violinplot(y=var, x='Loan_Status', data=df, ax=axs[i])

fig.tight_layout()
plt.show()

In [None]:
##==== check if label is unbalanced =================
plt.figure(figsize=(8,4))
# Name the column through x= / data= keywords (as the other countplot calls in
# this notebook do) instead of passing the Series positionally, which recent
# seaborn releases interpret as the `data` argument.
sns.countplot(x='Loan_Status', data=df)
plt.show()

**Encoding categorical variables**

In [None]:
from sklearn.preprocessing import LabelEncoder

encode_cols = ['Education','Dependents','Self_Employed', 'Gender', 'Married', 'Property_Area']

# Fit one encoder per column. A single shared LabelEncoder is refit on every
# column and afterwards only remembers the classes of the last one, which makes
# the saved object useless for the other columns.
label_encoders = {}
for col in encode_cols:
  label_encoders[col] = LabelEncoder()
  df[col] = label_encoders[col].fit_transform(df[col])

# Save the fitted encoders as a dict keyed by column name. `dump` is the name
# imported from pickle at the top of the notebook; the context manager closes
# the file even if pickling fails.
with open('label_encoder.pkl', 'wb') as output:
  dump(label_encoders, output)

# To encode new data later: load the dict from 'label_encoder.pkl' and call
# label_encoders[col].transform(new_data[col]) for each column.

In [None]:
##========== TODO: use a chi-square test to check how the categorical input features affect the target value ==============


**Correlation heatmap**

In [None]:
plt.figure(figsize=(20, 16))
# Correlate numeric columns only: string columns cannot be correlated and make
# DataFrame.corr() raise on recent pandas versions.
sns.heatmap(df.select_dtypes(include='number').corr(), fmt='.2g', annot=True)

In [None]:
##========= display top influencing factors ====================================

# Only numeric columns can be correlated with the target.
numeric_df = df.select_dtypes(include='number')

# Correlation of every numeric column with Loan_Status, strongest positive
# first. `ascending` is an argument of sort_values, not of heatmap.
target_corr = numeric_df.corr()[['Loan_Status']].sort_values(by='Loan_Status', ascending=False)

plt.figure(figsize=(8, 12))
heatmap = sns.heatmap(target_corr)
heatmap.set_title('Feature correlating with target', fontdict={'fontsize':18})

corr = numeric_df.corrwith(df['Loan_Status']).sort_values(ascending = False).to_frame()
corr.columns = ['Loan_Status']

plt.subplots(figsize = (5,5))
# 'coolwarm' stands in for the `colors` palette, which is not defined in this
# notebook; seaborn's keyword for the cell borders is `linewidths`.
sns.heatmap(corr, annot = True, cmap='coolwarm', linewidths = 0.4, linecolor = 'black');
plt.title('LOAN_STATUS Correlation')

In [None]:
## Feature matrix (x) and single-column target frame (y) used for modelling
x = df.loc[:, ['Dependents','Education','Self_Employed','ApplicantIncome','LoanAmount','Credit_History']]
y = df.loc[:, ['Loan_Status']]

In [None]:
##======== handling unbalanced dataset =============
# NOTE(review): resampling happens before the train/test split, so synthetic
# rows can end up in the test set - consider resampling the training part only.
# NOTE(review): with a float strategy SMOTE refuses to run when the current
# minority/majority ratio is already above the requested one - confirm that
# 0.1 suits this dataset.

# Fixed seeds (same value as the train/test split) make the resampling repeatable.
over = SMOTE(sampling_strategy=0.1, random_state=40)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=40)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# transform the dataset
sample_X, sample_y = pipeline.fit_resample(x, y)
# y is a single-column frame; flatten it so Counter tallies the class labels
# rather than iterating over the column names.
Counter(np.ravel(sample_y))

In [None]:
#==============split==================
# Hold out 25% of the resampled rows for testing; the fixed random_state makes
# the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(sample_X, sample_y, test_size=0.25,random_state=40)

In [None]:
#======= Remove outliers using Z-score test ================
from scipy import stats

# Score only the candidate columns that are actually part of the training
# features; selecting a column that is absent would raise a KeyError.
cols = [c for c in ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
        if c in x_train.columns]

#=======set threshold==========
threshold=3

# Absolute z-score of every value, as a plain 2-D array (rows x cols).
z_scores = np.abs(np.asarray(stats.zscore(x_train[cols])))

#========== flag every row holding at least one outlying value ================
outlier_mask = (z_scores > threshold).any(axis=1)

# remove the same rows from the training features and the training target
x_train = x_train[~outlier_mask]
y_train = y_train[~outlier_mask]

## Modeling

In [None]:
def model(classifier, x_train, y_train):
  """Fit ``classifier`` and report its cross-validation and ROC-AUC results.

  Parameters
  ----------
  classifier : estimator
      Object exposing ``fit`` and ``predict``.
  x_train, y_train :
      Training features and training target.

  The held-out evaluation reads the notebook-level ``x_test`` and ``y_test``.
  Prints the mean cross-validation score and the ROC-AUC score, then draws the
  ROC curve.
  """
  # 'figure.figsize' is the rc key that controls the figure size.
  sns.set(style='whitegrid', rc={'figure.figsize':(5,3)})
  # y_train may arrive as a single-column frame; flatten it to 1-D.
  y_train = np.ravel(y_train)
  classifier.fit(x_train, y_train)
  prediction = classifier.predict(x_test)
  cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
  # cross_val_score returns one score per fold; a percentage format needs a
  # single number, so report the mean.
  cv_scores = cross_val_score(classifier, x_train, y_train, cv=cv)
  print('Cross Validation Score : ', '{0:.2%}'.format(cv_scores.mean()))
  print('ROC_AUC Score : ','{0:.2%}'.format(roc_auc_score(y_test,prediction)))
  #plot_roc_curve
  RocCurveDisplay.from_estimator(classifier, x_test, y_test)
  plt.title('ROC_AUC_Plot')
  plt.show()

def model_evaluation(classifier=None):
  """Draw the annotated confusion matrix of ``classifier`` on the test data.

  Parameters
  ----------
  classifier : estimator
      Fitted object exposing ``predict``. It has a default only so the original
      zero-argument signature still parses; a value is required.

  Raises
  ------
  ValueError
      If no classifier is supplied.

  Reads the notebook-level ``x_test`` and ``y_test``.
  """
  if classifier is None:
    raise ValueError('model_evaluation requires a fitted classifier')

  #Confusion_Matrix
  cm = confusion_matrix(y_test, classifier.predict(x_test))
  name = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
  counts = [value for value in cm.flatten()]
  # Share of all test rows that falls into each cell of the matrix.
  percentages = ['{0:.2%}'.format(value) for value in cm.flatten() / np.sum(cm)]
  labels = [f'{n}\n{c}\n{p}' for n, c, p in zip(name, counts, percentages)]
  # The four cell names assume a binary problem, i.e. a 2 x 2 matrix.
  labels = np.asarray(labels).reshape(2, 2)
  # fmt='' because the annotations are already formatted strings.
  sns.heatmap(cm, annot=labels, fmt='', cmap='Blues')
  plt.title('Confusion Matrix')
  plt.show()

## Conclusion

In [None]:
import shap
# Build a SHAP tree explainer and compute SHAP values for the test features.
# NOTE(review): `dtree` is not defined anywhere in the visible notebook -
# confirm a fitted tree model is created before this cell runs.
explainer = shap.TreeExplainer(dtree)
shap_values = explainer.shap_values(x_test)
# Summary plot of the SHAP values against the test features.
shap.summary_plot(shap_values, x_test)

In [None]:
#compute SHAP values
# NOTE(review): `dtree` is not defined anywhere in the visible notebook -
# confirm a fitted tree model is created before this cell runs.
explainer = shap.TreeExplainer(dtree)
shap_values = explainer.shap_values(x_test)
# NOTE(review): indexing with [1] presumably selects the values of class 1 and
# so assumes shap_values is a per-class sequence - verify against the installed
# shap version.
# summary_plot's keyword for the column labels is `feature_names`.
shap.summary_plot(shap_values[1], x_test.values, feature_names = x_test.columns, plot_type='violin')