# Importing Packages

In [38]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.model_selection import train_test_split
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
import plotly.figure_factory as ff
import plotly.graph_objects as go
from sklearn import tree
import graphviz

### Loading the dataset

In [1]:
  
# fetch dataset 
bank_marketing = fetch_ucirepo(id=222) 
  
# data (as pandas dataframes) 
X = bank_marketing.data.features 
y = bank_marketing.data.targets 
  
# metadata 
print(bank_marketing.metadata) 
  
# variable information 
print(bank_marketing.variables) 


{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'title': 'A data-driven approach to predict the success of bank telemarketing'

In [2]:
X

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,


In [3]:
y

Unnamed: 0,y
0,no
1,no
2,no
3,no
4,no
...,...
45206,yes
45207,yes
45208,yes
45209,no


### Cheacking for nulls

In [6]:
X.isnull().sum()

age                0
job              288
marital            0
education       1857
default            0
balance            0
housing            0
loan               0
contact        13020
day_of_week        0
month              0
duration           0
campaign           0
pdays              0
previous           0
poutcome       36959
dtype: int64

### Filling NaNs with modes

In [8]:
# Function to fill missing values with mode for specified columns
def fill_with_mode(df, columns):
    for column in columns:
        mode_value = df[column].mode()[0]  # Find the mode
        df[column].fillna(mode_value, inplace=True)  # Fill NaN with mode
    return df

# List of columns to fill
columns_to_fill = ['job', 'education', 'contact', 'poutcome']

# Fill the missing values with mode
X = fill_with_mode(X, columns_to_fill)
X

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column].fillna(mode_value, inplace=True)  # Fill NaN with mode


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,cellular,5,may,261,1,-1,0,failure
1,44,technician,single,secondary,no,29,yes,no,cellular,5,may,151,1,-1,0,failure
2,33,entrepreneur,married,secondary,no,2,yes,yes,cellular,5,may,76,1,-1,0,failure
3,47,blue-collar,married,secondary,no,1506,yes,no,cellular,5,may,92,1,-1,0,failure
4,33,blue-collar,single,secondary,no,1,no,no,cellular,5,may,198,1,-1,0,failure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,failure
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,failure
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,failure


### Plotting a pie chart to check class imbalance

In [14]:
# Count the occurrences of each class
class_counts = y['y'].value_counts().reset_index()
class_counts.columns = ['class', 'count']

# Create the interactive pie chart with adjusted width, height, and percentage display
fig = px.pie(
    class_counts, 
    values='count', 
    names='class', 
    title='Class Imbalance in y Variable',
    width=800,  # Adjust width here
    height=600  # Adjust height here
)

# Center the title
fig.update_layout(title={'text': 'Class Imbalance in y Variable', 'x': 0.5})

# Show the figure
fig.show()

### Lable encoding for SMOTE analysis, in order to conduct resampling 

In [22]:
label_encoders = {}
for column in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])
    label_encoders[column] = le

# Encode the target variable if it's categorical
y_le = LabelEncoder()
y = y_le.fit_transform(y)

# Display the first few rows of the encoded dataframe
X.head()
# y[:5]

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome
0,58,4,1,2,0,2143,1,0,0,5,8,261,1,-1,0,0
1,44,9,2,1,0,29,1,0,0,5,8,151,1,-1,0,0
2,33,2,1,1,0,2,1,1,0,5,8,76,1,-1,0,0
3,47,1,1,1,0,1506,1,0,0,5,8,92,1,-1,0,0
4,33,1,2,1,0,1,0,0,0,5,8,198,1,-1,0,0


### Fitting SMOTE

In [23]:
from imblearn.over_sampling import SMOTE
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

### Traing decision tree classifier

In [28]:
from sklearn.metrics import classification_report, confusion_matrix

# Train a Decision Tree classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_res, y_train_res)

# Make predictions on the test set
y_pred = clf.predict(X_test)



### Plotting confusion matrix and classification report using Plotly

In [42]:
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred, output_dict=True)

# Plot confusion matrix using Plotly
fig_cm = ff.create_annotated_heatmap(z=cm, x=['Predicted No', 'Predicted Yes'], y=['Actual No', 'Actual Yes'], colorscale='Viridis')
fig_cm.update_layout(title='Confusion Matrix', xaxis_title='Predicted Label', yaxis_title='True Label')

# Plot classification report using Plotly
fig_cr = go.Figure(data=[go.Table(
    header=dict(values=['Metric', 'No', 'Yes', 'Accuracy', 'Macro Avg']),
    cells=dict(values=[
        list(cr.keys()), 
        [cr['0']['precision'], cr['1']['precision'], '', cr['macro avg']['precision']],
        [cr['0']['recall'], cr['1']['recall'], '', cr['macro avg']['recall']],
        [cr['0']['f1-score'], cr['1']['f1-score'], '', cr['macro avg']['f1-score']],
        ['', '', cr['accuracy'], '', '']
    ])
)])
fig_cr.update_layout(title='Classification Report')

# Show figures
fig_cm.show()
fig_cr.show()