# Decision Tree Model Application on Custom Dataset

## Data Loading and Preprocessing

In [2]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the dataset (the CSV uses ';' as its field delimiter)
data = pd.read_csv('dataset.csv', delimiter=';')

# Encode the target labels as integers; label_encoder.classes_ keeps the
# mapping back to the original label values.
label_encoder = LabelEncoder()
data['Target'] = label_encoder.fit_transform(data['Target'])

# Separate the feature columns from the target column
X = data.drop(columns=['Target'])
y = data['Target']

# Train-test split FIRST, on the unscaled features. The same test_size and
# random_state as before are used, so the row partition itself is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the scaler on the training split only, then apply the learned
# mean/variance to the test split. Fitting on the full dataset (as was done
# previously) leaks test-set statistics into the training features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training-set statistics. Kept so that any
# later cell that refers to X_scaled continues to work.
X_scaled = scaler.transform(X)


## Decision Tree Model Training and Evaluation

In [3]:

def evaluate_decision_tree(criterion, X_train, y_train, X_test, y_test, random_state=42):
    """Fit a DecisionTreeClassifier and score it on the train and test splits.

    Parameters
    ----------
    criterion : str
        Split-quality criterion passed to DecisionTreeClassifier
        (this notebook uses 'gini' and 'entropy').
    X_train, y_train : array-like
        Features and labels the tree is fitted on.
    X_test, y_test : array-like
        Held-out features and labels used for evaluation only.
    random_state : int, default 42
        Seed passed to the classifier so repeated runs give the same tree.

    Returns
    -------
    clf : DecisionTreeClassifier
        The fitted classifier.
    y_pred_train, y_pred_test : ndarray
        Predictions of ``clf`` on the train and test features.
    results : dict
        Accuracy and confusion matrix for both splits, keyed by
        "Train Accuracy", "Test Accuracy", "Train Confusion Matrix" and
        "Test Confusion Matrix".
    """
    clf = DecisionTreeClassifier(criterion=criterion, random_state=random_state)
    clf.fit(X_train, y_train)

    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)

    results = {
        "Train Accuracy": accuracy_score(y_train, y_pred_train),
        "Test Accuracy": accuracy_score(y_test, y_pred_test),
        "Train Confusion Matrix": confusion_matrix(y_train, y_pred_train),
        "Test Confusion Matrix": confusion_matrix(y_test, y_pred_test),
    }
    return clf, y_pred_train, y_pred_test, results


# Decision tree with the 'gini' criterion
clf_gini, y_pred_train_gini, y_pred_test_gini, results_gini = evaluate_decision_tree(
    'gini', X_train, y_train, X_test, y_test
)

# Decision tree with the 'entropy' criterion
clf_entropy, y_pred_train_entropy, y_pred_test_entropy, results_entropy = evaluate_decision_tree(
    'entropy', X_train, y_train, X_test, y_test
)

# Individual metric names are kept so that any later cell that refers to them
# still works; they alias the values stored in the per-criterion result dicts.
train_accuracy_gini = results_gini["Train Accuracy"]
test_accuracy_gini = results_gini["Test Accuracy"]
train_conf_matrix_gini = results_gini["Train Confusion Matrix"]
test_conf_matrix_gini = results_gini["Test Confusion Matrix"]

train_accuracy_entropy = results_entropy["Train Accuracy"]
test_accuracy_entropy = results_entropy["Test Accuracy"]
train_conf_matrix_entropy = results_entropy["Train Confusion Matrix"]
test_conf_matrix_entropy = results_entropy["Test Confusion Matrix"]

# Collecting the results
# NOTE(review): in the recorded run both criteria reach a train accuracy of 1.0
# but a test accuracy of only about 0.68, which is consistent with the
# unconstrained trees (default max_depth=None) overfitting. Consider limiting
# max_depth / min_samples_leaf or using ccp_alpha pruning in a follow-up.
decision_tree_results = {
    "Gini Criterion": results_gini,
    "Entropy Criterion": results_entropy,
}

decision_tree_results


{'Gini Criterion': {'Train Accuracy': 1.0,
  'Test Accuracy': 0.6844879518072289,
  'Train Confusion Matrix': array([[ 980,    0,    0],
         [   0,  549,    0],
         [   0,    0, 1567]], dtype=int64),
  'Test Confusion Matrix': array([[311,  62,  68],
         [ 73,  89,  83],
         [ 53,  80, 509]], dtype=int64)},
 'Entropy Criterion': {'Train Accuracy': 1.0,
  'Test Accuracy': 0.6746987951807228,
  'Train Confusion Matrix': array([[ 980,    0,    0],
         [   0,  549,    0],
         [   0,    0, 1567]], dtype=int64),
  'Test Confusion Matrix': array([[308,  77,  56],
         [ 74,  87,  84],
         [ 63,  78, 501]], dtype=int64)}}