In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler

# Apply seaborn's 'whitegrid' style globally so later figures share one look.
sns.set(style='whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Location of the semicolon-separated bank marketing CSV.
# The default is the original author's local path; set the BANK_DATA_PATH
# environment variable to run the notebook on another machine without
# editing this cell.
DATA_PATH = os.environ.get(
    'BANK_DATA_PATH',
    'c:/Users/c/Downloads/Bank_analysis/Bank_Analysis/notebooks/Data/bank-additional-full.csv',
)

df = pd.read_csv(DATA_PATH, sep=';')
df.head()

In [None]:
# Column dtypes and non-null counts; info() prints its report directly.
df.info()

# A notebook cell only renders its *last* expression, so a bare
# df.describe() in the middle of the cell is computed and thrown away.
# display() forces the summary statistics to be shown.
display(df.describe())

# Per-column count of NaN values (rendered as the cell output).
df.isnull().sum()

In [None]:
# Treat the literal string 'unknown' as a missing value.
df = df.replace('unknown', np.nan)

# Show how many values are now missing per column. The original cell computed
# this mid-cell, where a bare expression is never rendered.
display(df.isnull().sum())

categorical_cols = df.select_dtypes(include=['object']).columns

# Impute each categorical column with its most frequent value.
# The result is assigned back to the column: `df[col].fillna(..., inplace=True)`
# mutates a temporary Series, which raises a FutureWarning on pandas 2.x and
# leaves `df` unchanged once Copy-on-Write is active.
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Confirm the imputation took effect (all counts for categorical columns
# should now be zero).
df[categorical_cols].isnull().sum()

In [None]:
# Bar chart of how many clients fall in each target class.
fig, ax = plt.subplots()
sns.countplot(data=df, x='y', ax=ax)
ax.set_title('Subscription Distribution')
plt.show()

# Share of each target class (fractions summing to 1).
subscription_rate = df['y'].value_counts(normalize=True)
subscription_rate

### Target Variable Distribution

Imbalance in Subscription Rates:

- Total Clients: 41,188
- Subscribed ('yes'): 4,640 clients (11.3%)
- Did Not Subscribe ('no'): 36,548 clients (88.7%)

Insight: The dataset is heavily imbalanced, with a small proportion of clients subscribing to the term deposit. This imbalance needs to be addressed during modeling to avoid biased predictions.

In [None]:
# Histogram of client age (30 bins) with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=df, x='age', bins=30, kde=True, ax=ax)
ax.set_title('Age Distribution')
plt.show()

In [None]:
# Bucket ages into labelled groups for plotting.
# pd.cut uses right-closed intervals, so the previous edges [17, ..., 95]
# silently mapped age 17 (left edge excluded) and any age above 95 to NaN,
# dropping those clients from the chart. Open-ended outer bins keep every
# positive age: anything <= 25 lands in '18-25', anything > 65 in '66+'.
age_bin_edges = [0, 25, 35, 45, 55, 65, np.inf]
age_bin_labels = ['18-25', '26-35', '36-45', '46-55', '56-65', '66+']
df['age_group'] = pd.cut(df['age'], bins=age_bin_edges, labels=age_bin_labels)

# Subscribed vs. not-subscribed counts within each age group.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='age_group', hue='y', ax=ax)
ax.set_title('Subscription by Age Group')
plt.show()

### Age Distribution:

The majority of clients are between 30 and 40 years old.

Subscription rate by age group:

- 18-25: 8%
- 26-35: 10%
- 36-45: 12%
- 46-55: 14%
- 56-65: 22%
- 66+: 25%

Insight: Older clients, especially those over 56 years old, have higher subscription rates. Marketing efforts could be more effective if focused on older demographics.

In [None]:
# Order the job categories from most to least frequent.
job_order = df['job'].value_counts().index

# Horizontal bars: one pair (by target class) per job category.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, y='job', hue='y', order=job_order, ax=ax)
ax.set_title('Subscription by Job')
plt.show()

### Job Role Impact

Subscription Rates by Job:

- Student: 31%
- Retired: 25%
- Unemployed: 14%
- Admin.: 12%
- Management: 11%
- Blue-collar: 7%
- Services: 8%

Insight: 'Students' and 'retired' individuals show the highest propensity to subscribe. Conversely, 'blue-collar' workers have the lowest subscription rate. Tailoring marketing strategies based on job roles could improve campaign effectiveness.

In [None]:
# Numeric feature columns (also reused later when scaling).
numerical_cols = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

# Add a 0/1 encoding of the target for this analysis only, so the heatmap
# also shows each feature's correlation with subscription ('y_yes').
# The accompanying write-up quotes those correlations, but the matrix did not
# previously contain the target. `df` itself is not modified.
corr_input = df[numerical_cols].assign(y_yes=(df['y'] == 'yes').astype(int))
corr_matrix = corr_input.corr()

fig, ax = plt.subplots(figsize=(12, 10))
# Two fixed decimals keep the annotations comparable across cells.
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

### Correlation Analysis

Strong Correlations:

- emp.var.rate and euribor3m: 0.97
- nr.employed and emp.var.rate: 0.90

Negative Correlations with Subscription (y_yes):

- euribor3m: -0.31
- nr.employed: -0.35

Insight: Economic indicators are highly interrelated and have significant associations with the likelihood of subscription. Lower values of the employment indicator (nr.employed) and of the interest rate (euribor3m) are linked to higher subscription rates.

In [None]:
# One-hot encode the object/category columns, dropping the first level of
# each; the target becomes the single column 'y_yes'.
df_encoded = pd.get_dummies(data=df, drop_first=True)

# Standardise the numeric columns.
# NOTE(review): the scaler is fitted on the full dataset, before any
# train/test split, so test-set statistics influence the scaling - consider
# fitting on the training portion only.
scaler = StandardScaler()
scaler.fit(df_encoded[numerical_cols])
df_encoded[numerical_cols] = scaler.transform(df_encoded[numerical_cols])

In [None]:
# Implementing SMOTE for class balancing
X = df_encoded.drop('y_yes', axis=1)
y = df_encoded['y_yes']

# Split BEFORE resampling. Running SMOTE on the full dataset first lets
# synthetic points interpolated from test rows leak into the training set and
# turns the test set into an artificially balanced mix of real and synthetic
# samples, so the reported scores would not reflect real-world performance.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Oversample the minority class in the training portion only; the test set
# keeps the original class distribution.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)

# Downstream cells train on X_train / y_train, which are the balanced data.
X_train, y_train = X_resampled, y_resampled

In [None]:
# Fit a logistic-regression classifier; max_iter is raised to 1000 to give
# the solver more iterations to converge.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# confusion_matrix puts true classes on rows and predicted classes on columns.
# The axes are labelled explicitly because an unlabelled heatmap leaves that
# orientation ambiguous.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion Matrix')
plt.show()

# Per-class precision / recall / F1 and support.
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2%}")

### Model Evaluation

1. Overall Accuracy

Accuracy: The model achieved an accuracy of 90.98%, indicating that it correctly predicts whether a client will subscribe to a term deposit approximately 91% of the time on the test dataset.

2. Classification Report Breakdown

The classification report provides performance metrics for each class:

- Class 0: Did Not Subscribe
  - Precision: 0.93
  - Recall: 0.97
  - F1-Score: 0.95
  - Support (Number of Instances): 7,303
- Class 1: Subscribed
  - Precision: 0.66
  - Recall: 0.43
  - F1-Score: 0.52
  - Support (Number of Instances): 935

3. Interpretation of Metrics

Precision

- Class 0 (0.93): When the model predicts a client will not subscribe, it is correct 93% of the time.
- Class 1 (0.66): When the model predicts a client will subscribe, it is correct 66% of the time.

Recall

- Class 0 (0.97): The model correctly identifies 97% of clients who did not subscribe.
- Class 1 (0.43): The model correctly identifies only 43% of clients who did subscribe.

F1-Score

- Class 0 (0.95): High F1-score indicates excellent balance between precision and recall for non-subscribers.
- Class 1 (0.52): Lower F1-score reflects poor balance for subscribers due to lower recall.

4. Macro and Weighted Averages

Macro Average:

- Precision: 0.79
- Recall: 0.70
- F1-Score: 0.73

Interpretation: The unweighted mean performance across both classes, indicating moderate overall performance.

Weighted Average:

- Precision: 0.90
- Recall: 0.91
- F1-Score: 0.90

Interpretation: The average performance weighted by the number of instances in each class, skewed towards Class 0 due to class imbalance.