<a href="https://colab.research.google.com/github/MahdiFaourr/MahdiFaourr/blob/main/loan_acceptance_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install missing libraries
!pip install opendatasets

In [None]:
# Import the necessary libraries and functions for this notebook
import opendatasets as od
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split,cross_validate,GridSearchCV
from sklearn.metrics import accuracy_score,precision_score,recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier, KernelDensity
from joblib import dump

In [None]:
# Random seed
SEED=15

In [None]:
# Download the data from kaggle
od.download("https://www.kaggle.com/datasets/ninzaami/loan-predication")

In [None]:
# Read the data into a pandas dataframe.
# The path is relative to the working directory: od.download() stores the
# dataset in a "loan-predication" folder there by default, so this also works
# outside of Colab's /content directory.
loan_data=pd.read_csv("loan-predication/train_u6lujuX_CVtuZ9i (1).csv")

In [None]:
# Display the first 5 rows from the data
loan_data.head()

In [None]:
# The shape of the data
print(f"The data contains {loan_data.shape[0]} rows and {loan_data.shape[1]} columns.")

In [None]:
# Data-info
loan_data.info()

In [None]:
# Get the number of missing values in each column
loan_data.isnull().sum()

In [None]:
# Impute categorical attributes with their most frequent value
# (the first mode of each column)
categorical_cols = loan_data.select_dtypes(include=['object']).columns
categorical_modes = loan_data[categorical_cols].mode().iloc[0]
loan_data[categorical_cols] = loan_data[categorical_cols].fillna(categorical_modes)
# Impute numerical attributes with their column mean
numerical_cols = loan_data.select_dtypes(include=['int64', 'float64']).columns
numerical_means = loan_data[numerical_cols].mean()
loan_data[numerical_cols] = loan_data[numerical_cols].fillna(numerical_means)

In [None]:
# Check again the number of missing values in each column
loan_data.isnull().sum()
# Well, no missing values now

In [None]:
# Get some statistical measures for the numerical columns
loan_data.describe()

In [None]:
# Get some statistical measures for the categorical columns
loan_data.describe(include='O')

In [None]:
# Convert all categorical columns into numerical columns.
# Each column gets its own fitted LabelEncoder, stored in `label_encoders`
# under the column name, so the category -> integer mapping of every column
# stays available (a single shared encoder only remembers the last column).
label_encoders={}
encoder=LabelEncoder()
for categorical_column in categorical_cols:
  encoder=LabelEncoder()
  loan_data[categorical_column]=encoder.fit_transform(loan_data[categorical_column])
  label_encoders[categorical_column]=encoder

In [None]:
# Check again the data-info
loan_data.info()
# Well, no categorical column left

In [None]:
# Cross-tabulate every categorical feature against 'Loan_Status' and plot the
# counts as grouped bars (one bar colour per Loan_Status value)
for categorical_column in categorical_cols:
  if categorical_column in ['Loan_Status','Loan_ID']:
    continue  # skip the target itself and the row identifier
  status_crosstab = pd.crosstab(index=loan_data[categorical_column], columns=loan_data['Loan_Status'])

  # Draw on an explicit Axes: DataFrame.plot opens its own figure when no `ax`
  # is passed, which left the plt.figure(...) canvas empty and its figsize unused
  fig, ax = plt.subplots(figsize=(8, 6))
  status_crosstab.plot(kind='bar', color=['skyblue', 'salmon'], legend=False, rot=0, ax=ax)
  ax.set_xlabel(f"{categorical_column}")
  ax.set_ylabel('Count')
  ax.set_title(f'Count of {categorical_column} by Loan Status')
  plt.show()

In [None]:
# Plot the distribution of each numerical column for the Loan_Status == 0 class
filtered_data_0=loan_data[loan_data['Loan_Status']==0]
for numerical_column in numerical_cols:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(data=filtered_data_0, x=numerical_column, kde=True, ax=ax)
    # Name the class in the title so these figures can be told apart from the
    # ones drawn for the other class
    ax.set_title(f'Distribution of {numerical_column} (Loan_Status = 0)')
    ax.set_xlabel(numerical_column)
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# Plot the distribution of each numerical column for the Loan_Status == 1 class
filtered_data_1=loan_data[loan_data['Loan_Status']==1]
for numerical_column in numerical_cols:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(data=filtered_data_1, x=numerical_column, kde=True, ax=ax)
    # Name the class in the title so these figures can be told apart from the
    # ones drawn for the other class
    ax.set_title(f'Distribution of {numerical_column} (Loan_Status = 1)')
    ax.set_xlabel(numerical_column)
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
from sklearn.neighbors import KernelDensity

# Choose bandwidth parameter
bandwidth =0.1

# For every numerical column: fit a KDE, draw samples from it and compare a
# histogram of the data with a histogram of the samples
for col in numerical_cols:
    # KernelDensity expects a 2-D array, so use a single-feature column vector
    data = loan_data[col].values.reshape(-1, 1)

    # Fit KDE model
    kde = KernelDensity(bandwidth=bandwidth)
    kde.fit(data)

    # Generate samples from KDE (seeded so the figure is reproducible)
    samples = kde.sample(100, random_state=SEED)

    # Create the figure inside the loop: plt.show() ends the current figure, so
    # a single figure created before the loop only sized the first plot
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.hist(data, bins=30, density=True, alpha=0.5, label=f'{col} (Data)')
    ax.hist(samples, bins=50, density=True, alpha=0.5, label=f'{col} (KDE)')
    ax.set_xlabel('Value')
    ax.set_ylabel('Density')
    ax.set_title(f'Kernel Density Estimation (KDE) for {col}')
    ax.legend()
    plt.show()


In [None]:
# Remove the identifier column, then rescale every remaining column by
# dividing it by its own maximum (kept in `loan_data_max`)
loan_data = loan_data.drop(columns=['Loan_ID'])
loan_data_max = loan_data.max()
loan_data = loan_data / loan_data_max

In [None]:
loan_data['Loan_Status'].value_counts()

In [None]:
# Randomly duplicate minority-class rows until the minority/majority ratio
# reaches 0.6. The sampler is seeded so the resampled set is reproducible.
# NOTE(review): resampling happens before any train/test split, so copies of
# the same row can end up on both sides of a later split - confirm this is acceptable.
over_sampler=RandomOverSampler(sampling_strategy=0.6,random_state=SEED)
# Features are every column except the target; both are taken as numpy arrays
x=loan_data.drop('Loan_Status',axis=1).values
y=loan_data['Loan_Status'].values
x_new,y_new=over_sampler.fit_resample(x,y)

In [None]:
# Compare three baseline classifiers with 3-fold cross-validation on the
# resampled data, printing the mean accuracy, precision and recall of each
lr=LogisticRegression()
lda=LinearDiscriminantAnalysis()
KNN=KNeighborsClassifier(n_neighbors=5)
models=[lr,lda,KNN]
scoring_names=['accuracy','precision','recall']
for model in models:
  results=cross_validate(model,x_new,y_new,cv=3,scoring=scoring_names)
  print("-"*50)
  print(f'{model}:')
  # cross_validate reports each scorer under the key 'test_<name>'
  for scoring_name in scoring_names:
        metric=f'test_{scoring_name}'
        print(f'{metric}: {np.mean(results[metric])}')




In [None]:
# Define parameter grid.
# Shrinkage is only supported by the 'lsqr' and 'eigen' solvers, so 'svd' gets
# its own sub-grid without shrinkage; a single cartesian grid would also try
# svd + 'auto', which LinearDiscriminantAnalysis rejects and the search would
# report as a failed fit.
param_grid = [
    {'solver': ['svd'], 'shrinkage': [None]},
    {'solver': ['lsqr', 'eigen'], 'shrinkage': [None, 'auto']},
]

# Perform grid search (3-fold cross-validation, scored on accuracy)
grid_search = GridSearchCV(lda, param_grid, cv=3, scoring='accuracy')
grid_search.fit(x_new,y_new)

# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

In [None]:
# Re-create the LDA model with fixed hyperparameters (lsqr solver, automatic shrinkage)
# NOTE(review): presumably the winner of the LDA grid search - confirm against its printed best parameters
lda=LinearDiscriminantAnalysis(shrinkage='auto',solver='lsqr')

In [None]:
# Hold out 20% of the resampled data for testing. The split is seeded so it is
# reproducible and stratified so both parts keep the same class ratio.
x_train,x_test,y_train,y_test=train_test_split(x_new,y_new,test_size=0.2,random_state=SEED,stratify=y_new)
# Fit LDA on the training part only
lda.fit(x_train,y_train)

In [None]:
# Test the performance of LDA model using different metrics.
# scikit-learn metrics take (y_true, y_pred) in that order; with the arguments
# swapped, precision and recall trade places.
y_hat=lda.predict(x_test)
accuracy=accuracy_score(y_test,y_hat)
precision=precision_score(y_test,y_hat)
recall=recall_score(y_test,y_hat)
separator='-'*50
print(separator)
print(f"accuracy:{accuracy*100}%.")
print(separator)
print(f"precision: {precision*100}%.")
print(separator)
print(f"recall: {recall*100}%.")

In [None]:
# Save the model to a file
dump(lda,'lda_model.joblib')


In [None]:
# Candidate hyperparameters for the KNN classifier
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)

# 5-fold cross-validated grid search scored on accuracy
# (fit() returns the fitted search object itself)
grid_search = GridSearchCV(KNN, param_grid, cv=5, scoring='accuracy').fit(x_new, y_new)

# Report the winning combination and its mean cross-validated score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

In [None]:
# Re-create KNN with fixed hyperparameters (9 neighbours, distance weighting)
# and fit it on the training split
# NOTE(review): presumably the winner of the KNN grid search - confirm against its printed best parameters
KNN=KNeighborsClassifier(algorithm='auto',n_neighbors=9,weights='distance')
KNN.fit(x_train,y_train)

In [None]:
# Test the performance of KNN model using different metrics.
# scikit-learn metrics take (y_true, y_pred) in that order; with the arguments
# swapped, precision and recall trade places.
y_hat=KNN.predict(x_test)
accuracy=accuracy_score(y_test,y_hat)
precision=precision_score(y_test,y_hat)
recall=recall_score(y_test,y_hat)
separator='-'*50
print(separator)
print(f"accuracy:{accuracy*100}%.")
print(separator)
print(f"precision: {precision*100}%.")
print(separator)
print(f"recall: {recall*100}%.")

In [None]:
# Save the trained model to a file
dump(KNN, 'knn_model.joblib')

In [None]:
# Compute the confusion matrix of the held-out labels against `y_hat`
# (`y_hat` is whatever the most recently executed prediction cell produced)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_hat)
# Render the matrix as an annotated seaborn heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', cbar=False, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# This function predicts the loan status (using the KNN model by default)
def loan_status_predictor(input_array, model=None):
  """Classify a single applicant and return a human-readable verdict.

  Parameters
  ----------
  input_array : sequence of numbers
      Feature values of one applicant. They are passed to the model unchanged,
      so they must already be encoded and scaled like the training features.
  model : object with a ``predict`` method, optional
      Fitted classifier to use. Defaults to the notebook's global ``KNN``.

  Returns
  -------
  str
      "Loan Approved" when the predicted label equals 1, otherwise "Loan Rejected".
  """
  if model is None:
    model=KNN
  # predict() expects a batch, so wrap the single row and unwrap the single label
  prediction=model.predict([input_array])[0]
  if prediction==1:
    return "Loan Approved"
  return "Loan Rejected"

In [None]:
# Example usage 1: one applicant's feature row passed to the predictor
example_applicant_1 = [1.0, 0.0, 0.000000, 0.0, 0.0, 0.072210, 0.000000, 0.209160, 0.75, 1.0, 1.0]
loan_status_predictor(example_applicant_1)

In [None]:
# Example usage 2: another applicant's feature row passed to the predictor
example_applicant_2 = [0.0, 0.0, 0.000000, 0.0, 1.0, 0.056580, 0.00000, 0.190000, 0.750, 0.0, 0.5]
loan_status_predictor(example_applicant_2)