# Prodigy InfoTech Task 03

We were taksed on creating a Decision Tree Classifier to predict whether a customer will purchase a product/service or not based on the provided dataset

## 1. Install Packages needed

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style('darkgrid')
import plotly.express as px

import warnings
warnings.filterwarnings("ignore")

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn import metrics
from prettytable import PrettyTable 

## 2. Import Data
Importing the "bank-additional-full.xlsx" dataset

In [2]:
df = pd.read_excel("bank-additional-full.xlsx")
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


## 3. Explore data

The dataset was explored to find out it's characteristics and answer questions like "how many entries it has", "does it have any missing values" and "how many unique elemens are in each column"

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [4]:
df.isna().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [5]:
cols = list(df.columns)

for col in cols:
    print("There is a total of " + str(df[col].nunique()) + " elements in the "+ col + " column.")

There is a total of 78 elements in the age column.
There is a total of 12 elements in the job column.
There is a total of 4 elements in the marital column.
There is a total of 8 elements in the education column.
There is a total of 3 elements in the default column.
There is a total of 3 elements in the housing column.
There is a total of 3 elements in the loan column.
There is a total of 2 elements in the contact column.
There is a total of 10 elements in the month column.
There is a total of 5 elements in the day_of_week column.
There is a total of 1544 elements in the duration column.
There is a total of 42 elements in the campaign column.
There is a total of 27 elements in the pdays column.
There is a total of 8 elements in the previous column.
There is a total of 3 elements in the poutcome column.
There is a total of 10 elements in the emp.var.rate column.
There is a total of 26 elements in the cons.price.idx column.
There is a total of 26 elements in the cons.conf.idx column.
Ther

## 4. Data Splitting

+ Here the dataset has to be split into the Independent and Dependent variabes and encoded. 

+ Then the independent variable was Normalized.

+ Then the variable sets were further split into their respective Training and Testing sets.

#### Independent var.

In [22]:
X = df.iloc[:,:-1]
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [23]:
label_encoders = {}
for column in X.select_dtypes(include=['object']).columns:
    encoder = LabelEncoder()
    X[column] = encoder.fit_transform(X[column])
    label_encoders[column] = encoder
    
X.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,56,3,1,0,0,0,0,1,6,1,261,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
1,57,7,1,3,1,0,0,1,6,1,149,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
2,37,7,1,3,0,2,0,1,6,1,226,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
3,40,0,1,1,0,0,0,1,6,1,151,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
4,56,7,1,3,0,0,2,1,6,1,307,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0


### Dependent Var.

In [9]:
y = df.iloc[:, -1]
y

0         no
1         no
2         no
3         no
4         no
        ... 
41183    yes
41184     no
41185     no
41186    yes
41187     no
Name: y, Length: 41188, dtype: object

In [10]:
label_encoder_y= LabelEncoder()
y= label_encoder_y.fit_transform(y)
y

array([0, 0, 0, ..., 0, 1, 0])

### Normalizing

In [24]:
st= StandardScaler()
X_norm = st.fit_transform(X)
X = pd.DataFrame(X_norm, columns=X.columns)

### Train/Test Split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Decision Tree Model

In [26]:
dt = DecisionTreeClassifier()
dt = dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

In [27]:
table = PrettyTable(["Metric", "Score"]) 

table.add_row(["Accuracy", metrics.accuracy_score(y_test, y_pred)])
table.add_row(["F1", metrics.f1_score(y_test, y_pred)])
table.add_row(["Recall", metrics.recall_score(y_test, y_pred)])
table.add_row(["Precision", metrics.precision_score(y_test, y_pred)])
table

Metric,Score
Accuracy,0.8909929594561787
F1,0.5332640332640333
Recall,0.5457446808510639
Precision,0.5213414634146342
