# Train Three Different Models and Use Evaluation Metrics to Pick the Best Performing Model

<b> Load the necessary libraries </b>

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

<b> Read in the data </b>

In [2]:
# url path
url_path = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter06/Dataset/bank-additional-full.csv'

In [3]:
# load data
df = pd.read_csv(url_path, sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


<b> Explore the data </b>

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [5]:
df.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,258.28501,2.567593,962.475454,0.172963,0.081886,93.575664,-40.5026,3.621291,5167.035911
std,10.42125,259.279249,2.770014,186.910907,0.494901,1.57096,0.57884,4.628198,1.734447,72.251528
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


<b> Convert categorical variables </b>

In [6]:
cat_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']

In [7]:
_df = pd.get_dummies(df, columns=cat_cols, prefix=cat_cols, drop_first=True)
_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 54 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            41188 non-null  int64  
 1   duration                       41188 non-null  int64  
 2   campaign                       41188 non-null  int64  
 3   pdays                          41188 non-null  int64  
 4   previous                       41188 non-null  int64  
 5   emp.var.rate                   41188 non-null  float64
 6   cons.price.idx                 41188 non-null  float64
 7   cons.conf.idx                  41188 non-null  float64
 8   euribor3m                      41188 non-null  float64
 9   nr.employed                    41188 non-null  float64
 10  y                              41188 non-null  object 
 11  job_blue-collar                41188 non-null  uint8  
 12  job_entrepreneur               41188 non-null 

<b> Prepare the X and y variables </b>

In [8]:
X = _df.drop(['y'], axis=1).values
y = _df['y'].apply(lambda x: 0 if x == 'no' else 1)
y = y.values

<b> Split the data into training and evaluation sets </b>

In [9]:
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.3, random_state=0)

X_val, X_test, y_val, y_test = train_test_split(X_eval, y_eval, random_state=0)

<b> Create an instance of LogisticRegression </b>

In [10]:
lr_model = LogisticRegression()

<b> Fit the training data to the LogisticRegression model </b>

In [11]:
lr_model.fit(X_train, y_train)

LogisticRegression()

<b> Use the evaluation set to make a prediction </b>

In [12]:
lr_pred = lr_model.predict(X_val)

<b> Use the prediction from the LogisticRegression model to compute the classification report </b>

In [13]:
print(classification_report(y_val, lr_pred))

              precision    recall  f1-score   support

           0       0.93      0.98      0.95      8220
           1       0.67      0.40      0.50      1047

    accuracy                           0.91      9267
   macro avg       0.80      0.69      0.72      9267
weighted avg       0.90      0.91      0.90      9267



<b> Create an instance of DecisionTreeClassifier </b>

In [14]:
dt_model = DecisionTreeClassifier(max_depth=6)

<b> Fit the training data to the DecisionTreeClassifier model </b>

In [15]:
dt_model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=6)

<b> Using the DecisionTreeClassifier model, make a prediction on the evaluation dataset </b>

In [16]:
dt_pred = dt_model.predict(X_val)

<b> Use the prediction from the DecisionTreeClassifier model to compute the classification report </b>

In [17]:
print(classification_report(y_val, dt_pred))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95      8220
           1       0.66      0.54      0.60      1047

    accuracy                           0.92      9267
   macro avg       0.80      0.75      0.78      9267
weighted avg       0.91      0.92      0.91      9267



<b> Create an instance of RandomForestClassifier </b>

In [18]:
rf_model = RandomForestClassifier()

<b> Fit the training data to the RandomForestClassifier model </b>

In [19]:
rf_model.fit(X_train, y_train)

RandomForestClassifier()

<b> Using the RandomForestClassifier model, make a prediction on the evaluation dataset </b>

In [20]:
rf_pred = rf_model.predict(X_val)

<b> Using the prediction from the random forest classifier, compute the classification report </b>

In [21]:
print(classification_report(y_val, rf_pred))

              precision    recall  f1-score   support

           0       0.94      0.97      0.95      8220
           1       0.68      0.47      0.56      1047

    accuracy                           0.92      9267
   macro avg       0.81      0.72      0.76      9267
weighted avg       0.91      0.92      0.91      9267



<b> Compare the classification report from the linear regression model with the classification report from the random forest classifier to decide which model to keep or improve upon </b>

The Random Forest model is the best

<b> Compare the R2 scores of all three models </b>

In [22]:
print(f'     Logistic Score: {lr_model.score(X_val, y_val)}\n \
    Decision Tree Score: {dt_model.score(X_val, y_val)}\n \
    Random Forest Score: {rf_model.score(X_val, y_val)}')

     Logistic Score: 0.9100032372936225
     Decision Tree Score: 0.9172331930506097
     Random Forest Score: 0.9156145462393439
