In [58]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [59]:
# Read the drug dataset; the path is relative to the notebook's working directory.
DATA_FILE = "drug.csv"
drug_data = pd.read_csv(DATA_FILE)

In [60]:
# Count the missing values in each column before any cleaning is applied
missing_per_column = drug_data.isnull().sum()
print("Number of missing values:\n", missing_per_column)

Number of missing values:
 Age            0
Sex            0
BP             2
Cholesterol    2
Na_to_K        1
Drug           0
dtype: int64


In [61]:
# Feature types
# Columns are classified by whether pandas considers them numeric instead of
# by comparing against the object dtype ('O'). Text columns are not guaranteed
# to load as object dtype (pandas also has a dedicated string dtype), and with
# an `== 'O'` test such columns would be misfiled as numerical, leaving
# nothing for the encoding step to encode.
# 'Drug' is the prediction target, so it is kept out of both feature lists.
is_numeric = pd.api.types.is_numeric_dtype
feature_columns = [col for col in drug_data.columns if col != 'Drug']
categorical_features = [col for col in feature_columns if not is_numeric(drug_data[col])]
numerical_features = [col for col in feature_columns if is_numeric(drug_data[col])]
print(f"The categorical features are:\n{categorical_features}\n\n")
print(f"The numerical features are:\n{numerical_features}")

The categorical features are:
['Sex', 'BP', 'Cholesterol']


The numerical features are:
['Age', 'Na_to_K']


In [62]:
# Missing-value strategy: discard every row that has at least one empty cell.
# axis=0 / how="any" are the dropna defaults, spelled out to make that explicit.
drug_data.dropna(axis=0, how="any", inplace=True)

# Confirm that no missing values remain in any column
remaining_missing = drug_data.isnull().sum()
print(remaining_missing)

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64


In [63]:
# Separate the target column ('Drug') from the predictor columns
drug_target = drug_data['Drug']
features = drug_data.drop(columns=['Drug'])


In [64]:
# Encode categorical features
# Each column gets its own LabelEncoder, stored in `feature_encoders`, so the
# category -> integer mapping of every feature stays available afterwards
# (e.g. feature_encoders['BP'].classes_ or .inverse_transform(...)).
# Refitting a single shared encoder per column would overwrite the previous
# column's mapping on every iteration.
# `label_encoder` is still created here because the target-encoding cell
# uses that name.
label_encoder = LabelEncoder()
feature_encoders = {}
for feature in categorical_features:
    encoder = LabelEncoder()
    features[feature] = encoder.fit_transform(features[feature])
    feature_encoders[feature] = encoder

In [65]:
# Turn the drug names in the target into integer class codes.
# fit_transform refits `label_encoder`, replacing whatever mapping it held before.
target_codes = label_encoder.fit_transform(drug_target)
drug_target = target_codes

In [66]:
# Collect one record per experiment in a plain list and build the DataFrame
# once at the end. Growing a DataFrame with pd.concat inside the loop copies
# all previous rows on every iteration, and concatenating onto an empty frame
# triggers a pandas FutureWarning and leaves the columns with object dtype.
experiment_records = []

# Perform the experiment five times
for i in range(5):
    # Split the data into training (70%) and testing (30%) sets; the loop
    # index is the split seed, so every experiment sees a different partition
    X_train, X_test, y_train, y_test = train_test_split(
        features, drug_target, test_size=0.3, random_state=i)

    # Create and train a decision tree classifier (fixed seed, so the split
    # is the only source of variation between experiments)
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X_train, y_train)

    # Make predictions on the test set and evaluate them
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Decision tree size, measured as the number of nodes in the fitted tree
    tree_size = clf.tree_.node_count

    experiment_records.append({
        'Experiment': i + 1,
        'Decision Tree Size': tree_size,
        'Accuracy': accuracy,
    })

    # Report the results for each experiment
    print(f"\nExperiment {i + 1}:")
    print(f"Decision Tree Size: {tree_size}")
    print(f"Accuracy: {accuracy}\n")

# Passing `columns` keeps the expected layout even if no record was collected
results_df = pd.DataFrame(
    experiment_records,
    columns=['Experiment', 'Decision Tree Size', 'Accuracy'])

# Identify and report the best model
# idxmax() returns the index of the first occurrence of the maximum value,
# so ties are resolved in favour of the earliest experiment.
best_model = results_df.loc[results_df['Accuracy'].idxmax()]

print("Best Model (Highest Accuracy):")
print(f"Experiment: {int(best_model['Experiment'])}")
print(f"Decision Tree Size: {int(best_model['Decision Tree Size'])}")
print(f"Accuracy: {best_model['Accuracy']}")


Experiment 1:
Decision Tree Size: 11
Accuracy: 0.9830508474576272


Experiment 2:
Decision Tree Size: 11
Accuracy: 1.0


Experiment 3:
Decision Tree Size: 11
Accuracy: 1.0


Experiment 4:
Decision Tree Size: 11
Accuracy: 1.0


Experiment 5:
Decision Tree Size: 11
Accuracy: 0.9830508474576272

Best Model (Highest Accuracy):
Experiment: 2
Decision Tree Size: 11
Accuracy: 1.0


  results_df = pd.concat([results_df, pd.DataFrame({


In [67]:
# Alternative to the DataFrame version: the best run is tracked with plain
# variables while the experiments execute.
### First Experiment
best_accuracy, best_experiment, best_tree_size = 0.0, 0, 0

# Five repetitions; the loop index is also the seed of the train/test split
for i in range(5):
    X_train, X_test, y_train, y_test = train_test_split(
        features, drug_target, test_size=0.3, random_state=i
    )

    # Build a decision tree (fixed seed) and fit it on the training portion
    dtc = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

    # Score the predictions made for the held-out portion
    y_pred = dtc.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Tree size is taken as the node count of the fitted tree
    tree_size = dtc.tree_.node_count

    # Per-experiment report of tree size and accuracy
    print(f"\nExperiment {i + 1}:")
    print(f"Decision Tree Size: {tree_size}")
    print(f"Accuracy: {accuracy}\n")

    # Keep this run only if it strictly beats the best so far, so the
    # earliest experiment wins when accuracies tie
    if accuracy > best_accuracy:
        best_accuracy, best_experiment, best_tree_size = accuracy, i + 1, tree_size



Experiment 1:
Decision Tree Size: 11
Accuracy: 0.9830508474576272


Experiment 2:
Decision Tree Size: 11
Accuracy: 1.0


Experiment 3:
Decision Tree Size: 11
Accuracy: 1.0


Experiment 4:
Decision Tree Size: 11
Accuracy: 1.0


Experiment 5:
Decision Tree Size: 11
Accuracy: 0.9830508474576272



In [68]:
# Summarise the winning run tracked by the experiment loop
best_model_report = (
    "Best Model (Highest Accuracy)",
    f"  Experiment: {best_experiment}",
    f"  Decision Tree Size: {best_tree_size}",
    f"  Accuracy: {best_accuracy}",
)
print(*best_model_report, sep="\n")

Best Model (Highest Accuracy)
  Experiment: 2
  Decision Tree Size: 11
  Accuracy: 1.0
