In [1]:
import numpy as np
import pandas as pd

## Importing The Train and Test sets ##

In [2]:
dataset_train = pd.read_csv("hayes-roth.data", names=["name","hobby","age","edu_level","material_status","class"], header=None)
dataset_test = pd.read_csv("hayes-roth.test", names=["hobby","age","edu_level","material_status","class"], header=None)

In [3]:
dataset_train.head(), dataset_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   name             132 non-null    int64
 1   hobby            132 non-null    int64
 2   age              132 non-null    int64
 3   edu_level        132 non-null    int64
 4   material_status  132 non-null    int64
 5   class            132 non-null    int64
dtypes: int64(6)
memory usage: 6.3 KB


(   name  hobby  age  edu_level  material_status  class
 0    92      2    1          1                2      1
 1    10      2    1          3                2      2
 2    83      3    1          4                1      3
 3    61      2    4          2                2      3
 4   107      1    1          3                4      3,
 None)

In [4]:
dataset_test.head(), dataset_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   hobby            28 non-null     int64
 1   age              28 non-null     int64
 2   edu_level        28 non-null     int64
 3   material_status  28 non-null     int64
 4   class            28 non-null     int64
dtypes: int64(5)
memory usage: 1.2 KB


(   hobby  age  edu_level  material_status  class
 0      1    1          1                2      1
 1      1    1          2                1      1
 2      1    2          1                1      1
 3      1    1          1                3      1
 4      1    1          3                1      1,
 None)

In [5]:
X_train = dataset_train.drop(columns=["name", "class"]).copy()
y_train = dataset_train["class"].copy()
X_test = dataset_test.drop(columns=["class"]).copy()
y_test = dataset_test["class"].copy()

## Data preprocessing ##

In [6]:
X_train[["hobby","edu_level","material_status"]] = X_train[["hobby","edu_level","material_status"]].astype(str)
X_test[["hobby","edu_level","material_status"]] = X_test[["hobby","edu_level","material_status"]].astype(str)

## Decision Tree Model

In [24]:
from sklearn.tree import DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier(random_state=0)
dt_classifier.fit(X_train, y_train)

DecisionTreeClassifier(random_state=0)

In [35]:
from sklearn.metrics import confusion_matrix, classification_report
y_pred = dt_classifier.predict(X_test)
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))
print(30*'-' + "Training Set Report" + 30*'-' + '\n')
print(classification_report(dt_classifier.predict(X_train), y_train))
print("\nwithout hyperparameter DT Depth is", dt_classifier.get_depth())

[[11  1  0]
 [ 3 12  0]
 [ 0  0  1]]
              precision    recall  f1-score   support

           1       0.79      0.92      0.85        12
           2       0.92      0.80      0.86        15
           3       1.00      1.00      1.00         1

    accuracy                           0.86        28
   macro avg       0.90      0.91      0.90        28
weighted avg       0.87      0.86      0.86        28

------------------------------Training Set Report------------------------------

              precision    recall  f1-score   support

           1       0.94      0.84      0.89        57
           2       0.82      0.93      0.87        45
           3       1.00      1.00      1.00        30

    accuracy                           0.91       132
   macro avg       0.92      0.93      0.92       132
weighted avg       0.91      0.91      0.91       132


without hyperparameter DT Depth is 10


## 5-Cross Validation for tuing Depth hyperparameter

In [23]:
from sklearn.model_selection import GridSearchCV

params = {'max_depth': range(1,11)}
scoring = 'accuracy'

dt_classifier_grid = DecisionTreeClassifier()
dt_gridsearch = GridSearchCV(dt_classifier_grid, param_grid = params, scoring = scoring)
dt_gridsearch.fit(X_train, y_train)

print("Optimal Depth:", dt_gridsearch.best_estimator_.max_depth)
print("accuracy score:", dt_gridsearch.best_score_)

Optimal Depth: 6
accuracy score: 0.8638176638176638


In [39]:
dt_gridsearch.cv_results_['mean_test_score'] # for max_depth=10 mean accuracy score for val sets is 0.83

array([0.43219373, 0.51538462, 0.60626781, 0.61339031, 0.63589744,
       0.86381766, 0.86353276, 0.81823362, 0.83361823, 0.83361823])

In [21]:
dt_optimaldepth_classifier = DecisionTreeClassifier(max_depth = 6, random_state = 0)
dt_optimaldepth_classifier.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=6, random_state=0)

In [36]:
y_pred = dt_optimaldepth_classifier.predict(X_test)
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))
print(30*'-' + "Training Set Report" + 30*'-' + '\n')
print(classification_report(dt_optimaldepth_classifier.predict(X_train), y_train))
print("\nusing hyperparameter tuning DT Depth is", dt_optimaldepth_classifier.get_depth())

[[ 7  0  0]
 [ 7 13  0]
 [ 0  0  1]]
              precision    recall  f1-score   support

           1       0.50      1.00      0.67         7
           2       1.00      0.65      0.79        20
           3       1.00      1.00      1.00         1

    accuracy                           0.75        28
   macro avg       0.83      0.88      0.82        28
weighted avg       0.88      0.75      0.77        28

------------------------------Training Set Report------------------------------

              precision    recall  f1-score   support

           1       0.65      1.00      0.79        33
           2       1.00      0.74      0.85        69
           3       1.00      1.00      1.00        30

    accuracy                           0.86       132
   macro avg       0.88      0.91      0.88       132
weighted avg       0.91      0.86      0.87       132


using hyperparameter tuning DT Depth is 6
