# Importing the packages

In [1]:
import pandas as pd
import numpy as np
from DecisionTree import DecisionTree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

# Seed

In [2]:
# Fixed seed, passed as random_state to the data splits and to both tree
# implementations below.
seed = 123

# Dataset

In [3]:
# Load the wine dataset and separate the feature columns from the target.
data = pd.read_csv("data/wine_dataset.csv")
# Features: every column up to and including "alcohol" (.loc label slices
# include the end label).
X = data.loc[:, :"alcohol"]
# Target: the "type" column.
y = data.loc[:, "type"]

In [4]:
# Hold out 30% of the rows (test_size=0.3); the remaining 70% is the training set.
# The held-out part is divided into validation and test sets in a later cell.
X_train,X_val_test,y_train,y_val_test = train_test_split(X,y,test_size=0.3, random_state=seed)

In [5]:
# Split the held-out data in half (test_size=0.5).
# train_test_split returns (train, test) pairs, so X_test/y_test receive the
# first half and X_val/y_val the second half.
X_test, X_val, y_test, y_val = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=seed)

# Implement the model with entropy and Gini.

In [6]:
# Train our own DecisionTree for every (impurity measure, pruning) combination
# and score each fitted tree on the validation split.
category = ["entropy", "gini"]
prune_list = [True, False]
results = {}

for impurity, prune in [(measure, flag) for measure in category for flag in prune_list]:
    model = DecisionTree(impurity_measure=impurity, random_state=seed)
    model.learn(X_train, y_train, impurity, prune)
    model_predictions = np.array(model.predict(X_val))
    model_accuracy = accuracy_score(y_val, model_predictions)

    # Each combination stores a single-entry dict: fitted model -> validation accuracy.
    results[(impurity, prune)] = {model: model_accuracy}

# Pick the combination with the highest validation accuracy
# (the first one encountered wins a tie).
# NOTE: despite its name, this holds the whole (impurity, prune) tuple.
best_model_impurity_measure = max(
    results, key=lambda params: list(results[params].values())
)
# Same (key, value) pair layout as an entry of results.items().
best_model_parameters = (
    best_model_impurity_measure,
    results[best_model_impurity_measure],
)

# The winning entry holds exactly one model/accuracy pair, so unpack it directly.
[(best_model_chosen, best_model_accuracy_score)] = best_model_parameters[1].items()

In [7]:
# Evaluate the model selected on the validation split against the test split,
# which has not been used for fitting or model selection above.
final_y_pred = best_model_chosen.predict(X_test)
final_model_accuracy_score = accuracy_score(y_test, final_y_pred)

# Implement impurity measure (entropy and Gini) models using Scikit-Learn!

In [8]:
# Fit scikit-learn's DecisionTreeClassifier once per impurity criterion and
# score each fitted tree on the validation split.
sklearn_results = {}
for criterion in category:
    sklearn_model = DecisionTreeClassifier(criterion=criterion, random_state=seed)
    sklearn_model.fit(X_train, y_train)
    sklearn_model_pred = sklearn_model.predict(X_val)
    sklearn_model_accuracy = accuracy_score(y_val, sklearn_model_pred)
    # Key: (fitted estimator, criterion name); value: validation accuracy.
    sklearn_results[(sklearn_model, criterion)] = sklearn_model_accuracy

# Select by accuracy only. Taking max() over (accuracy, key) tuples falls back
# to comparing the estimator objects when two accuracies are equal, which
# raises TypeError. With a key function the first best entry wins a tie.
best_sklearn_impurity_measure = max(sklearn_results, key=sklearn_results.get)

# Keep the (accuracy, (estimator, criterion)) layout that the report cell
# indexes into.
best_sklearn_model_parameters = (
    sklearn_results[best_sklearn_impurity_measure],
    best_sklearn_impurity_measure,
)
best_sklearn_model = best_sklearn_impurity_measure[0]

# Evaluate the selected scikit-learn model on the test split.
final_sklearn_pred = best_sklearn_model.predict(X_test)
final_sklearn_accuracy_score = accuracy_score(y_test, final_sklearn_pred)

# Final results

In [9]:
# Print the validation accuracy of every candidate, the selected candidates,
# and the test accuracy of both selected models.
report = ["1. Model accuracy score parameters:"]
report.extend(
    f"     - Model parameters: {params}, accuracy: {next(iter(scored.values()))}."
    for params, scored in results.items()
)

report.append("\n2. Scikit-Learn accuracy score parameters:")
report.extend(
    f"    - Scikit-Learn model '{criterion}' with accuracy score: {accuracy}."
    for (_, criterion), accuracy in sklearn_results.items()
)

# best_sklearn_model_parameters is laid out as (accuracy, (estimator, criterion)).
best_sklearn_accuracy = best_sklearn_model_parameters[0]
best_sklearn_criterion = best_sklearn_model_parameters[1][1]
report.append("\n3. Best final accuracy score parameters:")
report.append(f"    - Model: {best_model_parameters[0]}, accuracy: {best_model_accuracy_score}.")
report.append(f"    - Scikit-Learn: {best_sklearn_criterion}, accuracy: {best_sklearn_accuracy}.")

report.append("\n--> 4. Final model accuracy scores:")
report.append(f"    - Final model accuracy score: {final_model_accuracy_score}.")
report.append(f"    - Final Scikit-Learn model accuracy score: {final_sklearn_accuracy_score}.")

# One print of the joined lines emits the same text as printing each line in turn.
print("\n".join(report))

1. Model accuracy score parameters:
     - Model parameters: ('entropy', True), accuracy: 0.8354166666666667.
     - Model parameters: ('entropy', False), accuracy: 0.8666666666666667.
     - Model parameters: ('gini', True), accuracy: 0.8395833333333333.
     - Model parameters: ('gini', False), accuracy: 0.86875.

2. Scikit-Learn accuracy score parameters:
    - Scikit-Learn model 'entropy' with accuracy score: 0.8895833333333333.
    - Scikit-Learn model 'gini' with accuracy score: 0.88125.

3. Best final accuracy score parameters:
    - Model: ('gini', False), accuracy: 0.86875.
    - Scikit-Learn: entropy, accuracy: 0.8895833333333333.

--> 4. Final model accuracy scores:
    - Final model accuracy score: 0.8583333333333333.
    - Final Scikit-Learn model accuracy score: 0.8895833333333333.
