<a href="https://colab.research.google.com/github/Navin1130/Dissertation-/blob/main/Run_Dissertation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install libraries
!pip install xgboost shap lime scikit-learn pandas matplotlib seaborn

In [None]:
# Attach Google Drive to the Colab runtime at /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

In [None]:
# Read the semicolon-delimited CSV from the mounted Drive folder
data_path = '/content/drive/MyDrive/bank-full.csv'
df = pd.read_csv(filepath_or_buffer=data_path, delimiter=';')

# Preview the first five rows
df.head(5)

In [None]:
# Summary statistics of the loaded frame (DataFrame.describe with default arguments)
df.describe()

In [None]:
# Missing-value count per column
df.isna().sum()

In [None]:
# Number of rows flagged by DataFrame.duplicated()
df.duplicated().sum()

# **EDA**

In [None]:
# Bar chart of how many rows fall into each value of the target column 'y'
target_ax = sns.countplot(data=df, x='y')
target_ax.set_title('Target Variable Distribution (Loan Approval)')
target_ax.set_xlabel('Loan Approved')
target_ax.set_ylabel('Count')
plt.show()

# Percentage share of each class
class_share = df['y'].value_counts(normalize=True)
print(class_share * 100)

In [None]:
# Histograms of every int64/float64 column
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
numeric_frame = df[numerical_cols]

numeric_frame.hist(bins=15, figsize=(12, 8), color='skyblue', edgecolor='black')
plt.suptitle('Distributions of Numerical Features')
plt.tight_layout()
plt.show()


In [None]:
# Count plot of each categorical feature, split by the target 'y'.
categorical_cols = df.select_dtypes(include=['object']).columns

# While 'y' still holds strings it is selected as an object column too.
# Plotting it against its own hue only repeats the class-distribution chart,
# so the target is left out of the loop.
feature_cols = [col for col in categorical_cols if col != 'y']

for col in feature_cols:
    plt.figure(figsize=(8, 4))
    sns.countplot(data=df, x=col, hue='y')
    plt.title(f'{col} vs Loan Approval')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


In [None]:
# Box plots of selected numeric columns, one figure per column, grouped by target class
boxplot_cols = ['age', 'balance', 'duration', 'campaign', 'pdays']
for col in boxplot_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(data=df, x='y', y=col, ax=ax)
    ax.set_title(f'{col} by Loan Approval Status')
    fig.tight_layout()
    plt.show()


In [None]:
# Bivariate analysis: row-normalised share (in %) of each target value per job and per education level
cross = pd.crosstab(df['job'], df['y'], normalize='index') * 100
cross2 = pd.crosstab(df['education'], df['y'], normalize='index') * 100

# Both tables are drawn the same way; only the colormap and the title differ
stacked_charts = (
    (cross, 'viridis', 'Loan Approval Rate by Job'),
    (cross2, 'Set2', 'Loan Approval Rate by Education'),
)
for table, cmap_name, chart_title in stacked_charts:
    table.plot(kind='bar', stacked=True, colormap=cmap_name)
    plt.title(chart_title)
    plt.ylabel('Percentage')
    plt.xticks(rotation=45)
    plt.legend(title='Loan Approved')
    plt.tight_layout()
    plt.show()


In [None]:
# Correlation matrix (heatmap)
# Build a separate frame whose 'y' is mapped to 1/0 so the target can enter the correlation
df_corr = df.assign(y=df['y'].map({'yes': 1, 'no': 0}))

# Restrict the correlation to int64/float64 columns
numerical_cols = df_corr.select_dtypes(include=['int64', 'float64']).columns
corr_matrix = df_corr[numerical_cols].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

# **Data Preprocessing**

In [None]:
# Encode target as 1/0.
# The guard keeps this cell re-runnable: mapping an already-numeric 'y' a second
# time would replace every label with NaN, because 0/1 are not keys of the mapping.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = df['y'].map({'yes': 1, 'no': 0})

# Labels outside {'yes', 'no'} become NaN under map(); stop here instead of
# letting them flow silently into the later steps.
if df['y'].isna().any():
    raise ValueError("Column 'y' contains values other than 'yes'/'no'; reload the dataset.")

# One-hot encode the remaining object columns (first level of each is dropped)
categorical_cols = df.select_dtypes(include=['object']).columns
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [None]:
# Split FIRST, then handle imbalance.
# Upsampling before the split puts copies of the same minority row into both
# partitions, so the models are scored on rows they were trained on (data leakage).
# X / y now hold the full encoded data set; only the training part is rebalanced.
X = df_encoded.drop("y", axis=1)
y = df_encoded["y"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y  # stratify keeps the class ratio in both parts
)

# Handle imbalance on the training partition only; the test set keeps the
# original class distribution.
train_df = X_train.assign(y=y_train)
majority = train_df[train_df.y == 0]
minority = train_df[train_df.y == 1]
minority_upsampled = resample(minority, replace=True, n_samples=len(majority), random_state=42)

# Shuffle so the balanced training rows are not ordered by class
df_balanced = pd.concat([majority, minority_upsampled]).sample(frac=1, random_state=42)

X_train = df_balanced.drop("y", axis=1)
y_train = df_balanced["y"]


# **XGBoost Model**

In [None]:
# Implement model

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize and train the XGBoost model.
# `use_label_encoder` is intentionally not passed: it is deprecated, and current
# XGBoost releases ignore it and emit an "unused parameter" warning.
model = XGBClassifier(eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)

# Make predictions on the held-out test features
y_pred = model.predict(X_test)

In [None]:
# Score the XGBoost predictions against the test labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

print('Accuracy: {:.4f}'.format(accuracy))
print('Classification Report:', report, sep='\n')
print('Confusion Matrix:')
conf_matrix

In [None]:
# Heatmap of conf_matrix with shared class labels on both axes
class_labels = ['No (0)', 'Yes (1)']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')
plt.show()

# **Random Forest Model**

In [None]:
# Random Forest Classifier Model

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Build the forest with a fixed seed and fit it on the training split
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X=X_train, y=y_train)

# Predict the test split
y_pred_rf = rf_model.predict(X_test)

# Score the predictions against the test labels
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
report_rf = classification_report(y_true=y_test, y_pred=y_pred_rf)
conf_matrix_rf = confusion_matrix(y_true=y_test, y_pred=y_pred_rf)

print('Random Forest Accuracy: {:.4f}'.format(accuracy_rf))
print('Random Forest Classification Report:', report_rf, sep='\n')
print('Random Forest Confusion Matrix:')
conf_matrix_rf

In [None]:
# Heatmap of the Random Forest confusion matrix with shared class labels on both axes
class_labels = ['No (0)', 'Yes (1)']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix_rf,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')
plt.show()

# **K-Nearest Neighbors Model**

In [None]:
# K-Nearest Neighbors Classifier Model

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Initialise and train the KNN model.
# KNN ranks neighbours by distance, so unscaled features with large ranges would
# drown out the 0/1 dummy columns. The scaler lives inside the pipeline, so it is
# fitted on the training data only and re-applied automatically in predict().
knn_model = make_pipeline(StandardScaler(), KNeighborsClassifier())
knn_model.fit(X_train, y_train)

# Making predictions
y_pred_knn = knn_model.predict(X_test)

# Evaluating the model
accuracy_knn = accuracy_score(y_test, y_pred_knn)
report_knn = classification_report(y_test, y_pred_knn)
conf_matrix_knn = confusion_matrix(y_test, y_pred_knn)

print(f'KNN Accuracy: {accuracy_knn:.4f}')
print('KNN Classification Report:')
print(report_knn)
print('KNN Confusion Matrix:')
conf_matrix_knn

In [None]:
# Heatmap of the KNN confusion matrix with shared class labels on both axes
class_labels = ['No (0)', 'Yes (1)']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix_knn,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Comparison of models: one bar per classifier, height = test accuracy

models = ['XGBoost', 'Random Forest', 'KNN']
accuracies = [accuracy, accuracy_rf, accuracy_knn]

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=models, y=accuracies, palette='viridis', ax=ax)
ax.set_ylabel('Accuracy')
ax.set_title('Model Comparison by Accuracy')
# Accuracy lies in [0, 1], so pin the axis to that range
ax.set_ylim(0, 1)

# Write each accuracy value above its bar
for bars in ax.containers:
    ax.bar_label(bars, fmt='%.4f')

plt.show()