In [None]:
# Attach your Google Drive to this Colab runtime at the mount point below
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Make the folder on the mounted Drive the current working directory,
# so that later cells can open the dataset files by their bare file names
import os

WORKING_DIRECTORY = '/content/drive/MyDrive/AI/4-B-AI/'
os.chdir(WORKING_DIRECTORY)

In [None]:
# Check that every dataset this notebook loads is visible from the current working directory.
# The displayed result maps each file name to True (found) or False (missing):
# 'sepal.csv' is read by the classification cells, 'sepal_regression.csv' by the regression cells.
import os  # imported here as well so the check does not rely on an earlier cell having been run
{dataset: os.path.isfile(dataset) for dataset in ('sepal.csv', 'sepal_regression.csv')}

In [None]:
# work with a Decision Tree classifier

# data preparation steps

# Importing the packages we use
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay
import seaborn as sns

# Fraction of the observations that is held back for testing
testing_fraction = 0.25

# Read the full dataset from the CSV file in the working directory
observations = pd.read_csv('sepal.csv')

# Feature names = every column except the 'species' label (kept for labelling the tree diagram)
features = observations.columns.drop('species').to_numpy()

# Randomly reorder all rows (fixed seed, so the shuffle is repeatable)
observations_shuffled = observations.sample(frac=1, random_state=99)

# Number of rows that go into the test set (the fractional part is truncated)
split_index = int(len(observations_shuffled) * testing_fraction)

# "Horizontal split": the first split_index rows are for testing, the rest for training
observations_test, observations_train = (
    observations_shuffled.iloc[:split_index],
    observations_shuffled.iloc[split_index:],
)

# "Vertical split" of the training rows: feature columns vs. the 'species' label column
train_examples, train_labels = (
    observations_train.drop(columns='species').to_numpy(),
    observations_train['species'].to_numpy(),
)

# "Vertical split" of the testing rows, done in exactly the same way
test_examples, test_labels = (
    observations_test.drop(columns='species').to_numpy(),
    observations_test['species'].to_numpy(),
)

# model training and model evaluation steps

# Build a Decision Tree with the minimum parent size (min_samples_split) set to 100 and fit it
# to the training examples and labels; fit() hands back the fitted model, so we keep that
model = DecisionTreeClassifier(random_state=99, min_samples_split=100).fit(train_examples, train_labels)

# Ask the fitted model to classify each of the testing examples
predictions = model.predict(test_examples)

# Count how many predictions agree with the corresponding testing labels...
correct_predictions = (predictions == test_labels).sum()
# ...and express that count as a fraction of all testing labels: the accuracy
accuracy = correct_predictions / test_labels.shape[0]
# Report the accuracy both as a fraction and as a percentage
print("Accuracy:", accuracy, "(or", round(accuracy*100, 1), "%)")

# visualise the final generalisation
# We create the figure/axes once and pass `ax` to both plotting calls. If no `ax` is given,
# DecisionBoundaryDisplay.from_estimator() opens a new figure of its own, which would leave
# a separately created figure empty (it shows up as a blank "0 Axes" output in the notebook).
fig, ax = plt.subplots()
# Shade the feature space according to the class the model predicts at each point
disp = DecisionBoundaryDisplay.from_estimator(model, train_examples, ax=ax)
# Overlay the training examples (first feature on x, second on y), coloured by their true label
sns.scatterplot(x=train_examples[:, 0], y=train_examples[:, 1], hue=train_labels, ax=ax)
ax.set_xlabel('Sepal length')
ax.set_ylabel('Sepal width')
ax.set_title('Decision tree: final generalisation')
plt.show()

# show a diagram of the resulting tree structure
# (plot_tree draws onto the current axes, so a fresh figure is opened for it first)
plt.figure()
plot_tree(model, feature_names=features.tolist(), class_names=model.classes_.tolist())
plt.show()

# Plot the evaluation result: a single point at (minimum parent size 100, measured accuracy)
fig, ax = plt.subplots()
ax.plot([100], [accuracy], marker='o', linestyle='-')
ax.set_title('Accuracy vs. minimum parent size for DT')
ax.set_xlabel('Minimum parent size')
ax.set_ylabel('Accuracy')
ax.grid(True)
plt.show()

########### Hyperparameter investigation

# The minimum parent sizes to test: 5, 10, 15, ..., 100
min_parent_sizes = list(range(5, 101, 5))

# Accuracy score for each minimum parent size, stored in the same order as min_parent_sizes
accuracy_scores = []

# Repeat the train / evaluate / visualise steps once per candidate minimum parent size.
# Each pass builds a fresh tree whose min_samples_split comes from the loop variable, and
# appends its accuracy to accuracy_scores so the results can be plotted after the loop.
for min_parent_size in min_parent_sizes:

    # Create a Decision Tree model object and set the minimum parent size hyperparameter based on our for loop:
    model = DecisionTreeClassifier(min_samples_split=min_parent_size, random_state=99)

    # Call the model's fitting algorithm, passing in our training examples and training labels
    model.fit(train_examples, train_labels)

    # Use the trained model to generate predictions for our testing examples
    predictions = model.predict(test_examples)

    # Find the total number of model predictions that matched with the corresponding testing labels
    correct_predictions = (predictions == test_labels).sum()
    # Calculate the model's accuracy: the fraction of predictions that were correct
    accuracy = correct_predictions / len(test_labels)
    # Display the accuracy as a single quantitative measure of overall performance
    print("Accuracy:", accuracy, "(or", round(accuracy*100, 1), "%)")

    # visualise the final generalisation
    # The axes are created once and passed to both plotting calls: without an explicit `ax`,
    # DecisionBoundaryDisplay.from_estimator() opens its own figure, which would leave an
    # extra, empty figure in the output on every pass through the loop.
    fig, ax = plt.subplots()
    disp = DecisionBoundaryDisplay.from_estimator(model, train_examples, ax=ax)
    sns.scatterplot(x=train_examples[:, 0], y=train_examples[:, 1], hue=train_labels, ax=ax)
    ax.set_xlabel('Sepal length')
    ax.set_ylabel('Sepal width')
    ax.set_title('Decision tree: final generalisation')
    plt.show()

    # show a diagram of the resulting tree structure
    # (plot_tree draws onto the current axes, so a fresh figure is opened for it first)
    plt.figure()
    plot_tree(model, feature_names=features.tolist(), class_names=model.classes_.tolist())
    plt.show()

    # Add the latest accuracy score onto the end of our list of results
    accuracy_scores.append(accuracy)

# Finally, draw the evaluation graph for the whole investigation: one point per tested
# minimum parent size (x axis) against the accuracy recorded for it in the loop (y axis)
fig, ax = plt.subplots()
ax.plot(min_parent_sizes, accuracy_scores, marker='o', linestyle='-')
ax.set_title('Accuracy vs. minimum parent size for DT')
ax.set_xlabel('Minimum parent size')
ax.set_ylabel('Accuracy')
ax.grid(True)
plt.show()

########### 
# (the solution shows diagrams of all the trees it builds, and feature space visualisations of the final generalisations...
# ...scroll down to the bottom of all the output to see the final graph)

In [None]:
# work with a hard-voting ensemble classifier

# data preparation steps

# Importing the packages we use
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier 
import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay
import seaborn as sns

# Fraction of the observations that is held back for testing
testing_fraction = 0.25

# Read the full dataset from the CSV file in the working directory
observations = pd.read_csv('sepal.csv')

# Randomly reorder all rows (fixed seed, so the shuffle is repeatable)
observations_shuffled = observations.sample(frac=1, random_state=99)

# Number of rows that go into the test set (the fractional part is truncated)
split_index = int(len(observations_shuffled) * testing_fraction)

# "Horizontal split": the first split_index rows are for testing, the rest for training
observations_test, observations_train = (
    observations_shuffled.iloc[:split_index],
    observations_shuffled.iloc[split_index:],
)

# "Vertical split" of the training rows: feature columns vs. the 'species' label column
train_examples, train_labels = (
    observations_train.drop(columns='species').to_numpy(),
    observations_train['species'].to_numpy(),
)

# "Vertical split" of the testing rows, done in exactly the same way
test_examples, test_labels = (
    observations_test.drop(columns='species').to_numpy(),
    observations_test['species'].to_numpy(),
)

# model training and model evaluation steps

# The three member classifiers, each left on its default hyperparameters
# (only the decision tree takes a seed here)
model1 = KNeighborsClassifier()
model2 = GaussianNB()
model3 = DecisionTreeClassifier(random_state=99)

# Combine the k-NN, NB and DT classifiers into a hard-voting ensemble ('hard' is also the
# default voting mode) and fit it to the training examples and labels
model = VotingClassifier(
    estimators=[('knn', model1), ('nb', model2), ('dt', model3)],
    voting='hard',
).fit(train_examples, train_labels)

# Ask the fitted ensemble to classify each of the testing examples
predictions = model.predict(test_examples)

# Count how many predictions agree with the corresponding testing labels...
correct_predictions = (predictions == test_labels).sum()
# ...and express that count as a fraction of all testing labels: the accuracy
accuracy = correct_predictions / test_labels.shape[0]
# Report the accuracy both as a fraction and as a percentage
print("Accuracy:", accuracy, "(or", round(accuracy*100, 1), "%)")

# visualise the final generalisation
# We create the figure/axes once and pass `ax` to both plotting calls. If no `ax` is given,
# DecisionBoundaryDisplay.from_estimator() opens a new figure of its own, which would leave
# a separately created figure empty (it shows up as a blank "0 Axes" output in the notebook).
fig, ax = plt.subplots()
# Shade the feature space according to the class the ensemble predicts at each point
disp = DecisionBoundaryDisplay.from_estimator(model, train_examples, ax=ax)
# Overlay the training examples (first feature on x, second on y), coloured by their true label
sns.scatterplot(x=train_examples[:, 0], y=train_examples[:, 1], hue=train_labels, ax=ax)
ax.set_xlabel('Sepal length')
ax.set_ylabel('Sepal width')
ax.set_title('Ensemble: final generalisation')
plt.show()

In [None]:
# work with a bagging ensemble classifier

# data preparation steps

# Importing the packages we use
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier 
import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay
import seaborn as sns

# Fraction of the observations that is held back for testing
testing_fraction = 0.25

# Read the full dataset from the CSV file in the working directory
observations = pd.read_csv('sepal.csv')

# Randomly reorder all rows (fixed seed, so the shuffle is repeatable)
observations_shuffled = observations.sample(frac=1, random_state=99)

# Number of rows that go into the test set (the fractional part is truncated)
split_index = int(len(observations_shuffled) * testing_fraction)

# "Horizontal split": the first split_index rows are for testing, the rest for training
observations_test, observations_train = (
    observations_shuffled.iloc[:split_index],
    observations_shuffled.iloc[split_index:],
)

# "Vertical split" of the training rows: feature columns vs. the 'species' label column
train_examples, train_labels = (
    observations_train.drop(columns='species').to_numpy(),
    observations_train['species'].to_numpy(),
)

# "Vertical split" of the testing rows, done in exactly the same way
test_examples, test_labels = (
    observations_test.drop(columns='species').to_numpy(),
    observations_test['species'].to_numpy(),
)

# model training and model evaluation steps

# The base classifier that the ensemble is built from: a decision tree on default hyperparameters
model_dt = DecisionTreeClassifier(random_state=99)

# Build an ensemble of 100 such decision trees, each trained on its own bagged sample of the
# training data, and fit it to the training examples and labels
# (the base classifier is passed positionally, as the first argument)
model = BaggingClassifier(model_dt, n_estimators=100, random_state=99).fit(train_examples, train_labels)

# Ask the fitted ensemble to classify each of the testing examples
predictions = model.predict(test_examples)

# Count how many predictions agree with the corresponding testing labels...
correct_predictions = (predictions == test_labels).sum()
# ...and express that count as a fraction of all testing labels: the accuracy
accuracy = correct_predictions / test_labels.shape[0]
# Report the accuracy both as a fraction and as a percentage
print("Accuracy:", accuracy, "(or", round(accuracy*100, 1), "%)")

# visualise the final generalisation
# We create the figure/axes once and pass `ax` to both plotting calls. If no `ax` is given,
# DecisionBoundaryDisplay.from_estimator() opens a new figure of its own, which would leave
# a separately created figure empty (it shows up as a blank "0 Axes" output in the notebook).
fig, ax = plt.subplots()
# Shade the feature space according to the class the ensemble predicts at each point
disp = DecisionBoundaryDisplay.from_estimator(model, train_examples, ax=ax)
# Overlay the training examples (first feature on x, second on y), coloured by their true label
sns.scatterplot(x=train_examples[:, 0], y=train_examples[:, 1], hue=train_labels, ax=ax)
ax.set_xlabel('Sepal length')
ax.set_ylabel('Sepal width')
ax.set_title('Ensemble: final generalisation')
plt.show()

In [None]:
# work with a random forest classifier

# data preparation steps

# Importing the packages we use
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier 
import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay
import seaborn as sns

# Fraction of the observations that is held back for testing
testing_fraction = 0.25

# Read the full dataset from the CSV file in the working directory
observations = pd.read_csv('sepal.csv')

# Randomly reorder all rows (fixed seed, so the shuffle is repeatable)
observations_shuffled = observations.sample(frac=1, random_state=99)

# Number of rows that go into the test set (the fractional part is truncated)
split_index = int(len(observations_shuffled) * testing_fraction)

# "Horizontal split": the first split_index rows are for testing, the rest for training
observations_test, observations_train = (
    observations_shuffled.iloc[:split_index],
    observations_shuffled.iloc[split_index:],
)

# "Vertical split" of the training rows: feature columns vs. the 'species' label column
train_examples, train_labels = (
    observations_train.drop(columns='species').to_numpy(),
    observations_train['species'].to_numpy(),
)

# "Vertical split" of the testing rows, done in exactly the same way
test_examples, test_labels = (
    observations_test.drop(columns='species').to_numpy(),
    observations_test['species'].to_numpy(),
)

# model training and model evaluation steps

# Build a random forest classifier (default hyperparameters, fixed seed) and fit it
# to the training examples and labels; fit() hands back the fitted model, so we keep that
model = RandomForestClassifier(random_state=99).fit(train_examples, train_labels)

# Ask the fitted forest to classify each of the testing examples
predictions = model.predict(test_examples)

# Count how many predictions agree with the corresponding testing labels...
correct_predictions = (predictions == test_labels).sum()
# ...and express that count as a fraction of all testing labels: the accuracy
accuracy = correct_predictions / test_labels.shape[0]
# Report the accuracy both as a fraction and as a percentage
print("Accuracy:", accuracy, "(or", round(accuracy*100, 1), "%)")

# visualise the final generalisation
# We create the figure/axes once and pass `ax` to both plotting calls. If no `ax` is given,
# DecisionBoundaryDisplay.from_estimator() opens a new figure of its own, which would leave
# a separately created figure empty (it shows up as a blank "0 Axes" output in the notebook).
fig, ax = plt.subplots()
# Shade the feature space according to the class the forest predicts at each point
disp = DecisionBoundaryDisplay.from_estimator(model, train_examples, ax=ax)
# Overlay the training examples (first feature on x, second on y), coloured by their true label
sns.scatterplot(x=train_examples[:, 0], y=train_examples[:, 1], hue=train_labels, ax=ax)
ax.set_xlabel('Sepal length')
ax.set_ylabel('Sepal width')
ax.set_title('Random forest: final generalisation')
plt.show()

In [None]:
# work with a linear regressor

# data preparation steps

# Importing the packages we use
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
import seaborn as sns

# Fraction of the observations that is held back for testing
testing_fraction = 0.25

# Read the full regression dataset from the CSV file in the working directory
observations = pd.read_csv('sepal_regression.csv')

# Randomly reorder all rows (fixed seed, so the shuffle is repeatable)
observations_shuffled = observations.sample(frac=1, random_state=99)

# Number of rows that go into the test set (the fractional part is truncated)
split_index = int(len(observations_shuffled) * testing_fraction)

# "Horizontal split": the first split_index rows are for testing, the rest for training
observations_test, observations_train = (
    observations_shuffled.iloc[:split_index],
    observations_shuffled.iloc[split_index:],
)

# "Vertical split" of the training rows: feature columns vs. the 'petal_length' target column
train_examples, train_values = (
    observations_train.drop(columns='petal_length').to_numpy(),
    observations_train['petal_length'].to_numpy(),
)

# "Vertical split" of the testing rows, done in exactly the same way
test_examples, test_values = (
    observations_test.drop(columns='petal_length').to_numpy(),
    observations_test['petal_length'].to_numpy(),
)

# model training and model evaluation steps

# Build a linear regression model and fit it to the training examples and training values;
# fit() hands back the fitted model, so we keep that
model = LinearRegression().fit(train_examples, train_values)

# Ask the fitted model to predict a value for each of the testing examples
predictions = model.predict(test_examples)

# Mean squared error (MSE): the average of the squared differences between
# the model's predictions and the true testing values
mse = np.mean(np.square(predictions - test_values))
# Report the MSE as a single quantitative measure of overall performance
print("Mean square error (MSE):", mse)

# Build our own dense grid of examples covering the training data (plus a margin of 1 on
# every side), so the model's prediction can be drawn across the whole feature space
x_min, x_max = train_examples[:, 0].min() - 1, train_examples[:, 0].max() + 1
y_min, y_max = train_examples[:, 1].min() - 1, train_examples[:, 1].max() + 1
x_steps = np.arange(x_min, x_max, 0.01)
y_steps = np.arange(y_min, y_max, 0.01)
grid_x, grid_y = np.meshgrid(x_steps, y_steps)
# One row per grid point: (first feature, second feature)
grid_examples = np.column_stack((grid_x.ravel(), grid_y.ravel()))

# Predict a value at every grid point, then fold the flat result back into the grid's shape
grid_predictions = model.predict(grid_examples).reshape(grid_x.shape)

# Visualise the final generalisation: predicted values as filled contours, with the
# training examples drawn on top and shaded by their true values
fig, ax = plt.subplots()
contour = ax.contourf(grid_x, grid_y, grid_predictions, cmap='gray_r', alpha=0.8)
fig.colorbar(contour, ax=ax, label='Petal Length')
sns.scatterplot(x=train_examples[:, 0], y=train_examples[:, 1], hue=train_values, palette='gray_r', edgecolor='k', legend=False, ax=ax)
ax.set_xlabel('Sepal length')
ax.set_ylabel('Sepal width')
ax.set_title('Linear Regression: final generalisation')
plt.show()

In [None]:
# work with a linear regressor (matrix multiplication)

# data preparation steps

# Importing the packages we use
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Fraction of the observations that is held back for testing
testing_fraction = 0.25

# Read the full regression dataset from the CSV file in the working directory
observations = pd.read_csv('sepal_regression.csv')

# Randomly reorder all rows (fixed seed, so the shuffle is repeatable)
observations_shuffled = observations.sample(frac=1, random_state=99)

# Number of rows that go into the test set (the fractional part is truncated)
split_index = int(len(observations_shuffled) * testing_fraction)

# "Horizontal split": the first split_index rows are for testing, the rest for training
observations_test, observations_train = (
    observations_shuffled.iloc[:split_index],
    observations_shuffled.iloc[split_index:],
)

# "Vertical split" of the training rows: feature columns vs. the 'petal_length' target column
train_examples, train_values = (
    observations_train.drop(columns='petal_length').to_numpy(),
    observations_train['petal_length'].to_numpy(),
)

# "Vertical split" of the testing rows, done in exactly the same way
test_examples, test_values = (
    observations_test.drop(columns='petal_length').to_numpy(),
    observations_test['petal_length'].to_numpy(),
)

# model training and model evaluation steps

# Build a linear regression model and fit it to the training examples and training values;
# fit() hands back the fitted model, so we keep that
model = LinearRegression().fit(train_examples, train_values)

# Generate the predictions for the testing examples by hand, with one matrix multiply,
# instead of calling model.predict()

# Prepend a column of ones to the testing examples; it multiplies the intercept term
ones_column = np.ones((test_examples.shape[0], 1))
test_examples_with_intercept = np.hstack((ones_column, test_examples))
# Put the model's parameters in one vector, in matching order: intercept first, then coefficients
params = np.r_[model.intercept_, model.coef_]
# A single matrix multiply now yields one prediction per testing example
predictions_manual = np.matmul(test_examples_with_intercept, params)

# Mean squared error (MSE): the average of the squared differences between
# the manual predictions and the true testing values
mse = np.mean(np.square(predictions_manual - test_values))
# Report the MSE as a single quantitative measure of overall performance
print("Mean square error (MSE):", mse)

In [None]:
# work with a linear regressor (SGD)

# data preparation steps

# Importing the packages we use
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDRegressor
import matplotlib.pyplot as plt
import seaborn as sns

# Fraction of the observations that is held back for testing
testing_fraction = 0.25

# Read the full regression dataset from the CSV file in the working directory
observations = pd.read_csv('sepal_regression.csv')

# Randomly reorder all rows (fixed seed, so the shuffle is repeatable)
observations_shuffled = observations.sample(frac=1, random_state=99)

# Number of rows that go into the test set (the fractional part is truncated)
split_index = int(len(observations_shuffled) * testing_fraction)

# "Horizontal split": the first split_index rows are for testing, the rest for training
observations_test, observations_train = (
    observations_shuffled.iloc[:split_index],
    observations_shuffled.iloc[split_index:],
)

# "Vertical split" of the training rows: feature columns vs. the 'petal_length' target column
train_examples, train_values = (
    observations_train.drop(columns='petal_length').to_numpy(),
    observations_train['petal_length'].to_numpy(),
)

# "Vertical split" of the testing rows, done in exactly the same way
test_examples, test_values = (
    observations_test.drop(columns='petal_length').to_numpy(),
    observations_test['petal_length'].to_numpy(),
)

# model training and model evaluation steps

# Build a linear regression model that is trained with SGD (fixed seed) and fit it to the
# training examples and training values; fit() hands back the fitted model, so we keep that
model = SGDRegressor(random_state=99).fit(train_examples, train_values)

# Ask the fitted model to predict a value for each of the testing examples
predictions = model.predict(test_examples)

# Mean squared error (MSE): the average of the squared differences between
# the model's predictions and the true testing values
mse = np.mean(np.square(predictions - test_values))
# Report the MSE as a single quantitative measure of overall performance
print("Mean square error (MSE):", mse)

# Build our own dense grid of examples covering the training data (plus a margin of 1 on
# every side), so the model's prediction can be drawn across the whole feature space
x_min, x_max = train_examples[:, 0].min() - 1, train_examples[:, 0].max() + 1
y_min, y_max = train_examples[:, 1].min() - 1, train_examples[:, 1].max() + 1
x_steps = np.arange(x_min, x_max, 0.01)
y_steps = np.arange(y_min, y_max, 0.01)
grid_x, grid_y = np.meshgrid(x_steps, y_steps)
# One row per grid point: (first feature, second feature)
grid_examples = np.column_stack((grid_x.ravel(), grid_y.ravel()))

# Predict a value at every grid point, then fold the flat result back into the grid's shape
grid_predictions = model.predict(grid_examples).reshape(grid_x.shape)

# Visualise the final generalisation: predicted values as filled contours, with the
# training examples drawn on top and shaded by their true values
fig, ax = plt.subplots()
contour = ax.contourf(grid_x, grid_y, grid_predictions, cmap='gray_r', alpha=0.8)
fig.colorbar(contour, ax=ax, label='Petal Length')
sns.scatterplot(x=train_examples[:, 0], y=train_examples[:, 1], hue=train_values, palette='gray_r', edgecolor='k', legend=False, ax=ax)
ax.set_xlabel('Sepal length')
ax.set_ylabel('Sepal width')
ax.set_title('Linear Regression (SGD): final generalisation')
plt.show()