---
## Step 1: Importing Data

In [None]:
# Suppressing warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Importing all the libraries used in the case study
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import precision_score, recall_score, precision_recall_curve, roc_curve, auc

In [None]:
# Show every column when a DataFrame is displayed, then read the bank CSV into `bank`
pd.set_option('display.max_columns', None)
bank = pd.read_csv(filepath_or_buffer='/kaggle/input/bank-csv/bank.csv')
bank

---
## Step 2: Inspecting Data

In [None]:
# counting null values of every column
bank.isnull().sum()

In [None]:
# checking datatypes and null values in each columns
bank.info()

In [None]:
# shape of data
bank.shape

In [None]:
# statistical aspects of the dataframe
bank.describe(include='all')

---
## Step 3: Manipulating data

Changing all the **yes** and **no** columns to 1 and 0.

In [None]:
# list of variables which needs to be changed
col = ['default','housing','loan','y']

# helper used to recode the binary yes/no columns
def convert(x):
    """Map a Series of 'yes'/'no' strings to 1/0.

    Any value that is neither 'yes' nor 'no' has no entry in the lookup
    and therefore comes back as NaN.
    """
    yes_no_codes = {'yes': 1, 'no': 0}
    return x.map(yes_no_codes)

# calling the function
bank[col] = bank[col].apply(convert)

In [None]:
bank.head()

---
### Count of every categorical variable present in the data

In [None]:
# count of column job
bank['job'].astype("category").value_counts()

In [None]:
# count of column marital
bank['marital'].astype("category").value_counts()

In [None]:
# count of column education
bank['education'].astype("category").value_counts()

In [None]:
# count of column contact
bank['contact'].astype("category").value_counts()

In [None]:
# count of column month
bank['month'].astype("category").value_counts()

In [None]:
# count of column poutcome
bank['poutcome'].astype("category").value_counts()

---
### Making dataframe of all the categorical columns to make dummy variables for all these categorical variables.

In [None]:
categorical = bank.select_dtypes(include=['object'])
categorical.head()

In [None]:
# dummy variables of all categorical columns
dummies = pd.get_dummies(categorical,drop_first=True)
dummies.head()

In [None]:
# Swap the original categorical columns for their dummy-encoded counterparts:
# drop the categorical columns from 'bank' and append 'dummies' column-wise
bank = pd.concat([bank.drop(columns=categorical.columns), dummies], axis=1)

In [None]:
bank.head()

In [None]:
bank.shape

In [None]:
bank.info()

#####  Now we have 43 columns in the dataframe on which we need to perform the analysis and make prediction model

---
## Checking outliers

Collecting the columns with continuous values in the dataframe and checking outliers for it

In [None]:
# collecting all the continuous valued columns in a dataframe
check_out = bank[['age','balance','day','duration','campaign','pdays','previous']]
check_out.head()

In [None]:
# Checking outliers at 25%, 50%, 75%, 90%, 95% and 99%
check_out.describe(percentiles=[0.25,0.5,0.75,0.90,0.95,0.99])

In [None]:
# Boxplots for all the continuous columns of the dataframe, laid out on a 2x4 grid.
# A single loop replaces seven copy-pasted subplot/boxplot pairs, so the list of
# columns lives in one place and adding/removing a column is a one-token change.
continuous_cols = ['age','balance','day','duration','campaign','pdays','previous']

plt.figure(figsize=(15,10))
for position, column in enumerate(continuous_cols, start=1):
    plt.subplot(2,4,position)
    sns.boxplot(y=column,data=bank)

# Keep neighbouring axis labels from overlapping, and end on show() so the cell
# renders the figure instead of echoing the repr of the last Axes object.
plt.tight_layout()
plt.show()

##### There are a lot of outliers in the dataframe, but we can't drop them: they make up a very large share of the data and are important for building the model
---

In [None]:
# Making a heatmap to find correlation
plt.figure(figsize=(40,30))
sns.heatmap(bank.corr(),annot=True)

##### We are not going to drop any column, because none of the columns is very highly correlated with the target column
---

In [None]:
bank.head()

In [None]:
# subscription rate
round((sum(bank['y'])/len(bank.index))*100,2)

##### This shows that approximately 12% of the people have subscribed to the bank term deposit

---
##### Splitting the target variable and the predictor features into two separate objects ahead of the train-test split

In [None]:
print(bank.columns)


In [None]:
# Target vector: the 'y' column on its own
Y = bank['y']

# Feature matrix: every remaining column of 'bank' once 'y' is removed
X = bank.drop('y', axis=1)


In [None]:
X.head()

In [None]:
Y.head()

---
## Step 3: Train Test Split

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.7, test_size=0.3, random_state=100)

##### Splitting the data into train and test sets in a 7:3 ratio


---
## Step 4: Feature Scaling

In [None]:
# Standardize the continuous columns. The scaler is fitted on the training split only,
# and that same fitted transform is then applied to the test split.
num_cols = ['age','balance','day','duration','campaign','pdays','previous']
scaler = StandardScaler()

x_train[num_cols] = scaler.fit_transform(x_train[num_cols])

# x_test has to go through the same transform: the model-building steps rescale
# x_train and x_test with a single scaler fitted on this already-standardized x_train,
# which only puts the two splits on the same scale if x_test's continuous columns
# are standardized here as well. transform() (not fit_transform) is used so no
# test-set statistics enter the fit.
x_test[num_cols] = scaler.transform(x_test[num_cols])

In [None]:
x_train.head()

---
## Step 5: Model Building

##### Building the first logistic regression model

In [None]:
# Standardize every feature column (dummy indicators included).
# The scaler is fitted on x_train alone; x_test is only transformed with it.
# NOTE(review): x_train's continuous columns were already standardized in place in
# the Feature Scaling step, so x_test needs that same earlier transform for the two
# matrices produced here to be on a comparable scale — confirm.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)


In [None]:
# Building the logistic regression model
logreg = LogisticRegression()
logreg.fit(x_train_scaled, y_train)



In [None]:
# Making predictions
y_train_pred = logreg.predict(x_train_scaled)
y_test_pred = logreg.predict(x_test_scaled)


In [None]:
# Model accuracy
train_accuracy = metrics.accuracy_score(y_train, y_train_pred)
test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
print(f'Train Accuracy: {train_accuracy}')
print(f'Test Accuracy: {test_accuracy}')


# Step 6  Feature Selection using RFE:

In [None]:
# Feature selection using RFE
rfe = RFE(logreg, n_features_to_select=20)
rfe = rfe.fit(x_train_scaled, y_train)

In [None]:
# Selected features
selected_features = X.columns[rfe.support_]
print(f'Selected Features: {selected_features}')

# Step 7: Checking VIFs for the selected features

In [None]:
# VIF helper for the selected features
def calculate_vif(X):
    """Return a DataFrame listing each column of X alongside its VIF.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; its `.values` array is what gets passed to
        `variance_inflation_factor`, one column index at a time.

    Returns
    -------
    pandas.DataFrame
        Two columns: "Features" (the column names of X) and "VIF".
    """
    design = X.values
    vif_values = [variance_inflation_factor(design, col_idx) for col_idx in range(X.shape[1])]
    return pd.DataFrame({"Features": X.columns, "VIF": vif_values})

In [None]:
# Selecting only the chosen features
X_selected = x_train_scaled[:, rfe.support_]
vif = calculate_vif(pd.DataFrame(X_selected, columns=selected_features))
print(vif)

# Step 8: Metrics beyond accuracy

In [None]:
# Test-set metrics for the hard class predictions held in y_test_pred
confusion = metrics.confusion_matrix(y_test, y_test_pred)
f1_score = metrics.f1_score(y_test, y_test_pred)
recall = metrics.recall_score(y_test, y_test_pred)
precision = metrics.precision_score(y_test, y_test_pred)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1_score}')
print(f'Confusion Matrix:\n{confusion}')


# Step 9: Plotting the ROC Curve

In [None]:
# Column 1 of predict_proba is used as the positive-class score for each test row
y_test_pred_proba = logreg.predict_proba(x_test_scaled)[:, 1]

# ROC points and the area under them (the thresholds returned by roc_curve are discarded)
fpr, tpr, _ = metrics.roc_curve(y_test, y_test_pred_proba)
roc_auc = metrics.auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal as a reference line
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()


# Step 10: Finding the Optimal Cutoff Point

In [None]:
# Finding the optimal cutoff point: the threshold whose F1 score is highest
precision, recall, thresholds = precision_recall_curve(y_test, y_test_pred_proba)

# precision/recall carry one extra trailing point that has no matching entry in
# `thresholds`, so only the first len(thresholds) points are scored.
precision_at_thr = precision[:-1]
recall_at_thr = recall[:-1]

# F1 is 0/0 wherever precision and recall are both zero. A plain division yields NaN
# there, and np.argmax reports the position of a NaN as the maximum, which would pick
# a meaningless threshold. Define F1 as 0 at those points instead.
pr_sum = precision_at_thr + recall_at_thr
f1_scores = np.divide(2 * precision_at_thr * recall_at_thr, pr_sum,
                      out=np.zeros_like(pr_sum), where=pr_sum > 0)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]

# NOTE(review): the cutoff is tuned on y_test, the same data it is evaluated on in the
# following steps, so those metrics are optimistic — consider tuning on a validation split.
print(f'Optimal Threshold: {optimal_threshold}')


# Step 11: Precision and Recall with Optimal Threshold

In [None]:
# Label a row 1 when its predicted probability reaches the tuned cutoff, otherwise 0
y_test_pred_optimal = np.where(y_test_pred_proba >= optimal_threshold, 1, 0)

# Score those thresholded predictions against the true test labels
f1_score_optimal = metrics.f1_score(y_test, y_test_pred_optimal)
recall_optimal = metrics.recall_score(y_test, y_test_pred_optimal)
precision_optimal = metrics.precision_score(y_test, y_test_pred_optimal)

print(f'Precision (Optimal Threshold): {precision_optimal}')
print(f'Recall (Optimal Threshold): {recall_optimal}')
print(f'F1 Score (Optimal Threshold): {f1_score_optimal}')


# Step 12: Making Predictions on Test Data

In [None]:
# Final hard predictions: 1 where the predicted probability is at or above the tuned cutoff
y_test_pred_final = np.where(y_test_pred_proba >= optimal_threshold, 1, 0)

# Summary metrics and confusion matrix for those final predictions
final_confusion = metrics.confusion_matrix(y_test, y_test_pred_final)
final_f1_score = metrics.f1_score(y_test, y_test_pred_final)
final_recall = metrics.recall_score(y_test, y_test_pred_final)
final_precision = metrics.precision_score(y_test, y_test_pred_final)

print(f'Final Precision: {final_precision}')
print(f'Final Recall: {final_recall}')
print(f'Final F1 Score: {final_f1_score}')
print(f'Final Confusion Matrix:\n{final_confusion}')


# Recommended improvements, step by step:

**Step 1: Addressing Data Imbalance using SMOTE**
* SMOTE (Synthetic Minority Over-sampling Technique) will help create synthetic samples for the minority class.


In [None]:
from imblearn.over_sampling import SMOTE

# Class counts of the training labels before any resampling
print(f'Original class distribution: {Counter(y_train)}')

# Resample the training split only; the test split is left untouched
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

# Class counts of the resampled training labels
print(f'Class distribution after SMOTE: {Counter(y_train_smote)}')


**Step 2:  Feature Scaling:**

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit a fresh scaler on the SMOTE-resampled training features, then reuse that
# fitted scaler (no re-fitting) to transform the test features
scaler = StandardScaler().fit(x_train_smote)
x_train_scaled = scaler.transform(x_train_smote)
x_test_scaled = scaler.transform(x_test)


**3. Train Logistic Regression with Best Parameters:**

In [None]:
best_logreg = LogisticRegression(C=0.1, penalty='l1', solver='liblinear', random_state=42)
best_logreg.fit(x_train_scaled, y_train_smote)


**4. Evaluate Model Performance:**

In [None]:
# Training data evaluation
y_train_pred = best_logreg.predict(x_train_scaled)
train_accuracy = best_logreg.score(x_train_scaled, y_train_smote)
train_precision = precision_score(y_train_smote, y_train_pred)
train_recall = recall_score(y_train_smote, y_train_pred)
train_f1 = metrics.f1_score(y_train_smote, y_train_pred)

print(f'Training Accuracy: {train_accuracy}')
print(f'Training Precision: {train_precision}')
print(f'Training Recall: {train_recall}')
print(f'Training F1 Score: {train_f1}')

In [None]:
# Test data evaluation
y_test_pred = best_logreg.predict(x_test_scaled)
test_accuracy = best_logreg.score(x_test_scaled, y_test)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = metrics.f1_score(y_test, y_test_pred)

print(f'Test Accuracy: {test_accuracy}')
print(f'Test Precision: {test_precision}')
print(f'Test Recall: {test_recall}')
print(f'Test F1 Score: {test_f1}')

**5. Plot the ROC Curve and Determine Optimal Threshold:**

In [None]:
# Column 1 of best_logreg's predict_proba is used as the positive-class score
y_test_pred_proba = best_logreg.predict_proba(x_test_scaled)[:, 1]

# ROC points, their thresholds, and the area under the curve
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_pred_proba)
roc_auc = metrics.auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal as a reference line
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()


In [None]:
# Finding the optimal cutoff point: the threshold whose F1 score is highest
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(y_test, y_test_pred_proba)

# precision/recall carry one extra trailing point that has no matching entry in
# `thresholds`, so only the first len(thresholds) points are scored.
precision_at_thr = precision[:-1]
recall_at_thr = recall[:-1]

# F1 is 0/0 wherever precision and recall are both zero. A plain division yields NaN
# there, and np.argmax reports the position of a NaN as the maximum, which would pick
# a meaningless threshold. Define F1 as 0 at those points instead.
pr_sum = precision_at_thr + recall_at_thr
f1_scores = np.divide(2 * precision_at_thr * recall_at_thr, pr_sum,
                      out=np.zeros_like(pr_sum), where=pr_sum > 0)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]

# NOTE(review): the cutoff is tuned on y_test, the same data it is evaluated on in the
# following cells, so those metrics are optimistic — consider tuning on a validation split.
print(f'Optimal Threshold: {optimal_threshold}')


In [None]:
# Label a row 1 when its predicted probability reaches the tuned cutoff, otherwise 0
y_test_pred_optimal = np.where(y_test_pred_proba >= optimal_threshold, 1, 0)

# Score the thresholded predictions against the true test labels
f1_score_optimal = metrics.f1_score(y_test, y_test_pred_optimal)
recall_optimal = metrics.recall_score(y_test, y_test_pred_optimal)
precision_optimal = metrics.precision_score(y_test, y_test_pred_optimal)

print(f'Precision (Optimal Threshold): {precision_optimal}')
print(f'Recall (Optimal Threshold): {recall_optimal}')
print(f'F1 Score (Optimal Threshold): {f1_score_optimal}')

In [None]:
# Final summary for the thresholded test predictions held in y_test_pred_optimal
final_confusion = metrics.confusion_matrix(y_test, y_test_pred_optimal)
final_f1_score = metrics.f1_score(y_test, y_test_pred_optimal)
final_recall = metrics.recall_score(y_test, y_test_pred_optimal)
final_precision = metrics.precision_score(y_test, y_test_pred_optimal)

print(f'Final Precision: {final_precision}')
print(f'Final Recall: {final_recall}')
print(f'Final F1 Score: {final_f1_score}')
print(f'Final Confusion Matrix:\n{final_confusion}')