In [None]:
# Attach Google Drive to this Colab runtime at the mount point below
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# Switch the working directory to the project folder on the mounted Drive,
# so that relative file names resolve against that folder
import os

project_dir = '/content/drive/MyDrive/AI/3-B-AI/'
os.chdir(project_dir)

In [None]:
# Sanity check: True when 'sepal.csv' is a regular file in the current working directory
dataset_found = os.path.isfile('sepal.csv')
dataset_found

In [None]:
# work with a naive Bayes classifier 

# data preparation steps

# Importing the packages we use
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay
import seaborn as sns

# Read every observation from the CSV file into a DataFrame
observations = pd.read_csv('sepal.csv')

# Put the rows into a random order (fixed seed, so the order is the same on every run)
observations_shuffled = observations.sample(frac=1, random_state=99)

# Fraction of the shuffled rows held back for testing, and the matching row count
testing_fraction = 0.25
split_index = int(len(observations_shuffled) * testing_fraction)

# "Horizontal split": the first split_index rows are for testing, the remainder for training
observations_test = observations_shuffled.head(split_index)
observations_train = observations_shuffled.iloc[split_index:]

# "Vertical split" of the testing rows: the 'species' column holds the labels,
# every other column is part of the examples
test_labels = observations_test.loc[:, 'species'].to_numpy()
test_examples = observations_test.drop('species', axis='columns').to_numpy()

# The same "vertical split" for the training rows
train_labels = observations_train.loc[:, 'species'].to_numpy()
train_examples = observations_train.drop('species', axis='columns').to_numpy()

# model training and model evaluation steps

# Build a Gaussian naive Bayes model with its default settings and fit it to the
# training examples and training labels (fit() returns the fitted model itself)
model = GaussianNB().fit(train_examples, train_labels)

# Ask the fitted model to label each testing example
predictions = model.predict(test_examples)

# Element-wise comparison: True wherever a prediction equals the corresponding testing label
matches = predictions == test_labels
# Number of correct predictions
correct_predictions = sum(matches)
# Accuracy: the fraction of testing labels that were predicted correctly
accuracy = correct_predictions / len(test_labels)
# The same figure as a percentage, rounded to one decimal place
accuracy_percent = round(accuracy*100, 1)
# Report the accuracy as a single quantitative measure of overall performance
print("Accuracy:", accuracy, "(or", accuracy_percent, "%)")

# visualise the final generalisation
# Create one figure/axes pair and draw everything on it. Calling plt.figure()
# and then DecisionBoundaryDisplay.from_estimator() without ax= leaves an
# extra, empty figure behind, because from_estimator() creates its own figure
# when no axes is supplied.
fig, ax = plt.subplots()
# Shade each region of the two-feature plane by the class the model predicts there
disp = DecisionBoundaryDisplay.from_estimator(model, train_examples, ax=ax)
# Overlay the training examples, coloured by their training label
sns.scatterplot(x=train_examples[:, 0], y=train_examples[:, 1], hue=train_labels, ax=ax)
# NOTE(review): the labels assume column 0 is sepal length and column 1 is
# sepal width -- confirm against the column order in sepal.csv
ax.set_xlabel('Sepal length')
ax.set_ylabel('Sepal width')
ax.set_title('Naive Bayes: final generalisation')
plt.show()

In [None]:
# work with a Decision Tree classifier

# data preparation steps

# Importing the packages we use
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay
import seaborn as sns

# Read every observation from the CSV file into a DataFrame
observations = pd.read_csv('sepal.csv')

# Names of all columns except 'species', kept as a NumPy array so the tree
# diagram can show feature names
features = observations.drop(columns='species').columns.to_numpy()

# Put the rows into a random order (fixed seed, so the order is the same on every run)
observations_shuffled = observations.sample(frac=1, random_state=99)

# Fraction of the shuffled rows held back for testing, and the matching row count
testing_fraction = 0.25
split_index = int(len(observations_shuffled) * testing_fraction)

# "Horizontal split": the first split_index rows are for testing, the remainder for training
observations_test = observations_shuffled.head(split_index)
observations_train = observations_shuffled.iloc[split_index:]

# "Vertical split" of the testing rows: the 'species' column holds the labels,
# every other column is part of the examples
test_labels = observations_test.loc[:, 'species'].to_numpy()
test_examples = observations_test.drop('species', axis='columns').to_numpy()

# The same "vertical split" for the training rows
train_labels = observations_train.loc[:, 'species'].to_numpy()
train_examples = observations_train.drop('species', axis='columns').to_numpy()

# model training and model evaluation steps

# Build a Decision Tree model with the minimum parent size hyperparameter
# (min_samples_split) set to 100, then fit it to the training examples and
# training labels (fit() returns the fitted model itself)
model = DecisionTreeClassifier(min_samples_split=100, random_state=99).fit(train_examples, train_labels)

# Ask the fitted model to label each testing example
predictions = model.predict(test_examples)

# Element-wise comparison: True wherever a prediction equals the corresponding testing label
matches = predictions == test_labels
# Number of correct predictions
correct_predictions = sum(matches)
# Accuracy: the fraction of testing labels that were predicted correctly
accuracy = correct_predictions / len(test_labels)
# The same figure as a percentage, rounded to one decimal place
accuracy_percent = round(accuracy*100, 1)
# Report the accuracy as a single quantitative measure of overall performance
print("Accuracy:", accuracy, "(or", accuracy_percent, "%)")

# visualise the final generalisation
# Create one figure/axes pair and draw everything on it. Calling plt.figure()
# and then DecisionBoundaryDisplay.from_estimator() without ax= leaves an
# extra, empty figure behind, because from_estimator() creates its own figure
# when no axes is supplied.
fig, ax = plt.subplots()
# Shade each region of the two-feature plane by the class the model predicts there
disp = DecisionBoundaryDisplay.from_estimator(model, train_examples, ax=ax)
# Overlay the training examples, coloured by their training label
sns.scatterplot(x=train_examples[:, 0], y=train_examples[:, 1], hue=train_labels, ax=ax)
# NOTE(review): the labels assume column 0 is sepal length and column 1 is
# sepal width -- confirm against the column order in sepal.csv
ax.set_xlabel('Sepal length')
ax.set_ylabel('Sepal width')
ax.set_title('Decision tree: final generalisation')
plt.show()

# show a diagram of the resulting tree structure
# Plain Python lists of the feature names and of the classes the model learned,
# passed to plot_tree as feature_names and class_names
feature_names = features.tolist()
class_names = model.classes_.tolist()
plt.figure()
plot_tree(model, feature_names=feature_names, class_names=class_names)
plt.show()

# Show evaluation results graphically
# Read the x-coordinate from the model instead of repeating the literal 100,
# so the plotted point stays in step with the min_samples_split value the
# model was actually created with.
min_parent_size = model.min_samples_split
plt.figure()
# A single (minimum parent size, accuracy) point for the model evaluated above
plt.plot([min_parent_size], [accuracy], marker='o', linestyle='-')
plt.title('Accuracy vs. minimum parent size for DT')
plt.xlabel('Minimum parent size')
plt.ylabel('Accuracy')
plt.grid(True)
plt.show()