In [19]:
import numpy as np
import pandas as pd

## Importing The Train and Test sets ##

In [20]:
dataset_train = pd.read_csv("hayes-roth.data", names=["name","hobby","age","edu_level","material_status","class"], header=None)
dataset_test = pd.read_csv("hayes-roth.test", names=["hobby","age","edu_level","material_status","class"], header=None)

In [21]:
dataset_train.head(), dataset_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   name             132 non-null    int64
 1   hobby            132 non-null    int64
 2   age              132 non-null    int64
 3   edu_level        132 non-null    int64
 4   material_status  132 non-null    int64
 5   class            132 non-null    int64
dtypes: int64(6)
memory usage: 6.3 KB


(   name  hobby  age  edu_level  material_status  class
 0    92      2    1          1                2      1
 1    10      2    1          3                2      2
 2    83      3    1          4                1      3
 3    61      2    4          2                2      3
 4   107      1    1          3                4      3,
 None)

In [22]:
dataset_test.head(), dataset_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   hobby            28 non-null     int64
 1   age              28 non-null     int64
 2   edu_level        28 non-null     int64
 3   material_status  28 non-null     int64
 4   class            28 non-null     int64
dtypes: int64(5)
memory usage: 1.2 KB


(   hobby  age  edu_level  material_status  class
 0      1    1          1                2      1
 1      1    1          2                1      1
 2      1    2          1                1      1
 3      1    1          1                3      1
 4      1    1          3                1      1,
 None)

In [23]:
X_train = dataset_train.drop(columns=["name", "class"]).copy()
y_train = dataset_train["class"].copy()
X_test = dataset_test.drop(columns=["class"]).copy()
y_test = dataset_test["class"].copy()

## Data preprocessing ##

In [24]:
X_train[["hobby","edu_level","material_status"]] = X_train[["hobby","edu_level","material_status"]].astype(str)
X_test[["hobby","edu_level","material_status"]] = X_test[["hobby","edu_level","material_status"]].astype(str)

## Decision Tree Model

In [25]:
from sklearn.tree import DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier(random_state=0)
dt_classifier.fit(X_train, y_train)

DecisionTreeClassifier(random_state=0)

In [26]:
from sklearn.metrics import confusion_matrix, classification_report
y_pred = dt_classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
str1 = "Witout Hyperparameter Tuning (Depth=10)"
print(20*'-' + str1 + (50-len(str1))*'-' + '\n')
print(classification_report(y_test, y_pred))
str1 = "Training Set Report"
print(20*'-' + str1 + (50-len(str1))*'-' + '\n')
print(classification_report(y_train, dt_classifier.predict(X_train)))


[[11  3  0]
 [ 1 12  0]
 [ 0  0  1]]
--------------------Witout Hyperparameter Tuning (Depth=10)-----------

              precision    recall  f1-score   support

           1       0.92      0.79      0.85        14
           2       0.80      0.92      0.86        13
           3       1.00      1.00      1.00         1

    accuracy                           0.86        28
   macro avg       0.91      0.90      0.90        28
weighted avg       0.87      0.86      0.86        28

--------------------Training Set Report-------------------------------

              precision    recall  f1-score   support

           1       0.84      0.94      0.89        51
           2       0.93      0.82      0.87        51
           3       1.00      1.00      1.00        30

    accuracy                           0.91       132
   macro avg       0.93      0.92      0.92       132
weighted avg       0.91      0.91      0.91       132



## 5-Cross Validation for Tuing Depth Hyperparameter

In [27]:
from sklearn.model_selection import GridSearchCV

params = {'max_depth': range(1,11)}
scoring = 'accuracy'

dt_classifier_grid = DecisionTreeClassifier()
dt_gridsearch = GridSearchCV(dt_classifier_grid, param_grid = params, scoring = scoring)
dt_gridsearch.fit(X_train, y_train)

print("Optimal Depth:", dt_gridsearch.best_estimator_.max_depth)
print("validation accuracy score:", dt_gridsearch.best_score_)

Optimal Depth: 6
validation accuracy score: 0.8638176638176638


In [28]:
print("validation accuracy score for Descision Tree with Depth 10:", dt_gridsearch.cv_results_['mean_test_score'][9])

validation accuracy score for Descision Tree with Depth 10: 0.8336182336182336


In [29]:
dt_optimaldepth_classifier = DecisionTreeClassifier(max_depth = 6, random_state = 0)
dt_optimaldepth_classifier.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=6, random_state=0)

In [30]:
y_pred_opt = dt_optimaldepth_classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred_opt))
str1 = "With Hyperparameter Tuning (Depth=6)"
print(20*'-' + str1 + (50-len(str1))*'-' + '\n')
print(classification_report(y_test, y_pred_opt))
str1 = "Training Set Report"
print(20*'-' + "Training Set Report" + (50-len(str1))*'-' + '\n')
print(classification_report(y_train, dt_optimaldepth_classifier.predict(X_train)))

[[ 7  7  0]
 [ 0 13  0]
 [ 0  0  1]]
--------------------With Hyperparameter Tuning (Depth=6)--------------

              precision    recall  f1-score   support

           1       1.00      0.50      0.67        14
           2       0.65      1.00      0.79        13
           3       1.00      1.00      1.00         1

    accuracy                           0.75        28
   macro avg       0.88      0.83      0.82        28
weighted avg       0.84      0.75      0.73        28

--------------------Training Set Report-------------------------------

              precision    recall  f1-score   support

           1       1.00      0.65      0.79        51
           2       0.74      1.00      0.85        51
           3       1.00      1.00      1.00        30

    accuracy                           0.86       132
   macro avg       0.91      0.88      0.88       132
weighted avg       0.90      0.86      0.86       132

