In [2]:
import pandas as pd                            # For handling data in tabular form
from sklearn.model_selection import train_test_split   # For splitting dataset into training and testing
from sklearn.preprocessing import LabelEncoder         # For encoding categorical variables to numeric
from sklearn.ensemble import RandomForestClassifier    # Random Forest classifier
from imblearn.over_sampling import SMOTE               # SMOTE for oversampling minority class
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score  # Evaluation metrics

In [4]:
df = pd.read_csv('bank-full.csv', sep=';')

In [6]:
# 📌 Initialize a dictionary to store label encoders for each categorical feature
label_encoders = {}

# 📌 Encode categorical variables
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le  # Store each encoder in case we need to decode later


In [8]:
X = df.drop('y', axis=1)  
y = df['y'] 

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [12]:
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [14]:
rf_model = RandomForestClassifier(n_estimators=500, random_state=42)  # 100 trees in the forest
rf_model.fit(X_train_res, y_train_res)

In [18]:
y_pred = rf_model.predict(X_test)

# 📌 Step 9: Evaluate the model's performance
print("🔍 Accuracy of the model: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n📑 Classification Report:\n", classification_report(y_test, y_pred))

🔍 Accuracy of the model: 88.28%

📊 Confusion Matrix:
 [[10949  1028]
 [  562  1025]]

📑 Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.91      0.93     11977
           1       0.50      0.65      0.56      1587

    accuracy                           0.88     13564
   macro avg       0.73      0.78      0.75     13564
weighted avg       0.90      0.88      0.89     13564



In [15]:
# 📌 Import mutual_info_classif for feature importance ranking
from sklearn.feature_selection import mutual_info_classif
import numpy as np

# 📌 Calculate Mutual Information (MI) scores
mi_scores = mutual_info_classif(X, y, random_state=42)

# 📌 Create a DataFrame to view feature importance nicely
mi_df = pd.DataFrame({'Feature': X.columns, 'MI Score': mi_scores})

# 📌 Sort features by MI Score descending
mi_df = mi_df.sort_values(by='MI Score', ascending=False)

# 📌 Display feature importance scores
print(mi_df)


      Feature  MI Score
11   duration  0.070507
15   poutcome  0.036672
10      month  0.027404
13      pdays  0.025954
5     balance  0.022288
6     housing  0.019413
8     contact  0.017775
14   previous  0.012734
1         job  0.012238
0         age  0.011020
9         day  0.006068
2     marital  0.006001
3   education  0.004320
12   campaign  0.003924
7        loan  0.003559
4     default  0.000115


In [16]:
# 📌 Select top 10 features
top_features = mi_df['Feature'].head(10).tolist()

# 📌 Create new training and testing sets with selected features
X_train_top = X_train[top_features]
X_test_top = X_test[top_features]

# 📌 Apply SMOTE on the reduced training set
X_train_res_top, y_train_res_top = smote.fit_resample(X_train_top, y_train)

# 📌 Train Random Forest on selected features
rf_model_top = RandomForestClassifier(n_estimators=1000, random_state=42)
rf_model_top.fit(X_train_res_top, y_train_res_top)

# 📌 Predict and evaluate
y_pred_top = rf_model_top.predict(X_test_top)

# 📌 Evaluate model
print("Accuracy with top 10 features: {:.2f}%".format(accuracy_score(y_test, y_pred_top) * 100))
print("\nClassification Report:\n", classification_report(y_test, y_pred_top))


Accuracy with top 10 features: 86.91%

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.89      0.92     11977
           1       0.46      0.68      0.55      1587

    accuracy                           0.87     13564
   macro avg       0.71      0.79      0.74     13564
weighted avg       0.90      0.87      0.88     13564

