In [None]:
# Attach Google Drive to this Colab runtime so that files stored there become reachable
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# Make the course folder on the mounted Drive the current working directory,
# so that files in it can be referred to by their bare filename
import os

course_folder = '/content/drive/MyDrive/AI/8-B-AI/'
os.chdir(course_folder)

In [None]:
# Confirm the dataset file is visible from the current working directory
# (the cell's output is True when the file exists, False otherwise)
dataset_filename = 'iris.csv'
os.path.isfile(dataset_filename)

In [None]:
# try some visualisation with PCA
#
# Treats the dataset as unlabelled: fits a PCA on every feature column, plots how much
# variance the leading principal components capture, then shows the datapoints projected
# onto the first three principal components.

# data preparation steps

# Importing the packages we use
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Loading all the observations
observations = pd.read_csv('iris.csv')

# Let's assume an unsupervised learning problem, with no labels known
datapoints = observations.drop(columns='species').to_numpy()

# Create a PCA model object
pca = PCA()

# Use it to process our datapoints
pca.fit(datapoints)

# Plot the cumulative explained variance against the number of PCs
# The ratios are multiplied by 100, so the y axis is in percent and is labelled as such.
cumulative_variance_pct = np.cumsum(pca.explained_variance_ratio_) * 100
component_counts = range(1, len(cumulative_variance_pct) + 1)
fig, ax = plt.subplots()
ax.plot(component_counts, cumulative_variance_pct)
# A number of components is always a whole number, so only put ticks on whole numbers
ax.set_xticks(list(component_counts))
ax.set_xlabel('Number of Principal Components')
ax.set_ylabel('Cumulative Explained Variance (%)')
ax.set_title('Cumulative Explained Variance vs Number of Principal Components')
ax.grid(True)
plt.show()

# Transform the original datapoints to the PCA space
# (from here on, `datapoints` holds PC coordinates rather than the original features)
datapoints = pca.transform(datapoints)

# 3D Scatter plot of the datapoints with respect to the first 3 principal components
# NOTE: indexing columns 0..2 assumes the dataset has at least 3 feature columns
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(datapoints[:, 0], datapoints[:, 1], datapoints[:, 2])
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
ax.set_title('3D Scatter Plot of Datapoints in PCA Space')
plt.show()

In [None]:
# try some supervised learning with PCA
#
# Pipeline: shuffle -> train/test split -> fit PCA on the training examples only ->
# project both splits onto the first 2 PCs -> train and evaluate a decision tree ->
# plot the learned decision regions together with the training examples.

# data preparation steps

# Importing the packages we use
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay
import seaborn as sns

# Loading all the observations
observations = pd.read_csv('iris.csv')

# Shuffling all the observations (fixed random_state so the shuffle is repeatable)
observations_shuffled = observations.sample(frac=1, random_state=99)

# Setting the fraction of observations we will use for testing
testing_fraction = 0.25
split_index = int(observations_shuffled.shape[0] * testing_fraction)

# Splitting into testing observations and training observations ("horizontal split")
observations_test = observations_shuffled.iloc[:split_index]
observations_train = observations_shuffled.iloc[split_index:]

# Splitting into testing examples and testing labels ("vertical split")
test_examples = observations_test.drop(columns='species').to_numpy()
test_labels = observations_test['species'].to_numpy()

# Splitting into training examples and training labels ("vertical split")
train_examples = observations_train.drop(columns='species').to_numpy()
train_labels = observations_train['species'].to_numpy()

# Apply PCA to our training data

# Create a PCA model object
pca = PCA()

# Use it to process our training examples (the testing examples are deliberately not used here)
pca.fit(train_examples)

# Plot the cumulative explained variance against the number of PCs
# The ratios are multiplied by 100, so the y axis is in percent and is labelled as such.
cumulative_variance_pct = np.cumsum(pca.explained_variance_ratio_) * 100
fig, ax = plt.subplots()
ax.plot(range(1, len(cumulative_variance_pct) + 1), cumulative_variance_pct)
ax.set_xlabel('Number of Principal Components')
ax.set_ylabel('Cumulative Explained Variance (%)')
ax.set_title('Cumulative Explained Variance vs Number of Principal Components')
ax.grid(True)
plt.show()

# Let's assume we decide to use the first 2 PCs based on the plot above...

# Create a PCA model object that only retains the first 2 PCs
pca = PCA(n_components=2)

# Use it to process our training examples
pca.fit(train_examples)

# Transform the original training examples to the new 2D space
train_examples = pca.transform(train_examples)

# Transform the original testing examples to the new 2D space
# (using the PCA that was fitted on the training examples)
test_examples = pca.transform(test_examples)

# model training and model evaluation steps

# Create a Decision Tree model object:
model = DecisionTreeClassifier(random_state=99)

# Call the model's fitting algorithm, passing in our training examples and training labels
model.fit(train_examples, train_labels)

# Use the trained model to generate predictions for our testing examples
predictions = model.predict(test_examples)

# Find the total number of model predictions that matched with the corresponding testing labels
correct_predictions = np.sum(predictions == test_labels)
# Calculate the model's accuracy: the fraction of predictions that were correct
accuracy = correct_predictions / len(test_labels)
# Display the accuracy as a single quantitative measure of overall performance
print("Accuracy:", accuracy, "(or", round(accuracy*100, 1), "%)")

# visualise the final generalisation
# One figure/axes pair is created and handed to both plotting calls. Without an explicit
# `ax`, DecisionBoundaryDisplay.from_estimator creates its own figure, so a preceding
# plt.figure() would be left behind and rendered as an empty plot.
fig, ax = plt.subplots()
disp = DecisionBoundaryDisplay.from_estimator(model, train_examples, ax=ax)
sns.scatterplot(x=train_examples[:, 0], y=train_examples[:, 1], hue=train_labels, ax=ax)
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('Decision tree: final generalisation')
plt.show()

In [None]:
# try some supervised learning with PCA (manual approach, showing the matrix multiplications)
#
# Same pipeline as the scikit-learn-only version, except that the projection onto the
# principal components is carried out by hand: centre the data with the training means,
# then matrix-multiply with the principal components.

# data preparation steps

# Importing the packages we use
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay
import seaborn as sns

# Loading all the observations
observations = pd.read_csv('iris.csv')

# Shuffling all the observations (fixed random_state so the shuffle is repeatable)
observations_shuffled = observations.sample(frac=1, random_state=99)

# Setting the fraction of observations we will use for testing
testing_fraction = 0.25
split_index = int(observations_shuffled.shape[0] * testing_fraction)

# Splitting into testing observations and training observations ("horizontal split")
observations_test = observations_shuffled.iloc[:split_index]
observations_train = observations_shuffled.iloc[split_index:]

# Splitting into testing examples and testing labels ("vertical split")
test_examples = observations_test.drop(columns='species').to_numpy()
test_labels = observations_test['species'].to_numpy()

# Splitting into training examples and training labels ("vertical split")
train_examples = observations_train.drop(columns='species').to_numpy()
train_labels = observations_train['species'].to_numpy()

# Apply PCA to our training data

# Create a PCA model object
pca = PCA()

# Use it to process our training examples
pca.fit(train_examples)

# Plot the cumulative explained variance against the number of PCs
# The ratios are multiplied by 100, so the y axis is in percent and is labelled as such.
cumulative_variance_pct = np.cumsum(pca.explained_variance_ratio_) * 100
fig, ax = plt.subplots()
ax.plot(range(1, len(cumulative_variance_pct) + 1), cumulative_variance_pct)
ax.set_xlabel('Number of Principal Components')
ax.set_ylabel('Cumulative Explained Variance (%)')
ax.set_title('Cumulative Explained Variance vs Number of Principal Components')
ax.grid(True)
plt.show()

# Let's assume we decide to use the first 2 PCs based on the plot above...

# Calculate the means of all the training feature values (one mean per feature column)
means = np.mean(train_examples, axis=0)

# Shift the training data so that every feature has zero mean
train_examples_centred = train_examples - means

# Shift the testing data using the same transformation (i.e. the *training* means)
test_examples_centred = test_examples - means

# Note scikit-learn's .components_ attribute actually holds one PC per row... Hence us transposing it to look as we expect, below

# Transform the original training examples to the PCA space (matrix multiply between centred data and principal components)
train_examples_projected = train_examples_centred @ pca.components_.T

# Sanity check: the manual projection should agree with scikit-learn's own transform
assert np.allclose(train_examples_projected, pca.transform(train_examples)), \
    "manual PCA projection disagrees with pca.transform"

# And "delete" everything after the 2nd column
train_examples = train_examples_projected[:, :2]

# Or we could have multiplied with only the first two PCs... Let's use that approach to transform the testing examples
test_examples = test_examples_centred @ pca.components_[:2, :].T

# model training and model evaluation steps

# Create a Decision Tree model object:
model = DecisionTreeClassifier(random_state=99)

# Call the model's fitting algorithm, passing in our training examples and training labels
model.fit(train_examples, train_labels)

# Use the trained model to generate predictions for our testing examples
predictions = model.predict(test_examples)

# Find the total number of model predictions that matched with the corresponding testing labels
correct_predictions = np.sum(predictions == test_labels)
# Calculate the model's accuracy: the fraction of predictions that were correct
accuracy = correct_predictions / len(test_labels)
# Display the accuracy as a single quantitative measure of overall performance
print("Accuracy:", accuracy, "(or", round(accuracy*100, 1), "%)")

# visualise the final generalisation
# One figure/axes pair is created and handed to both plotting calls. Without an explicit
# `ax`, DecisionBoundaryDisplay.from_estimator creates its own figure, so a preceding
# plt.figure() would be left behind and rendered as an empty plot.
fig, ax = plt.subplots()
disp = DecisionBoundaryDisplay.from_estimator(model, train_examples, ax=ax)
sns.scatterplot(x=train_examples[:, 0], y=train_examples[:, 1], hue=train_labels, ax=ax)
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('Decision tree: final generalisation')
plt.show()

In [None]:
# try some supervised learning with PCA and standardisation
#
# Same pipeline as the plain PCA + decision tree cell, with an optional z-score
# standardisation step (switch: Z) applied before PCA, and a configurable number of
# retained principal components (N). The scaler and the PCA are both fitted on the
# training examples only and then re-used, unchanged, on the testing examples.

# data preparation steps

# Importing the packages we use
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay
import seaborn as sns

# Do we standardise (using z-score standardisation), or not
Z = True

# Loading all the observations
observations = pd.read_csv('iris.csv')

# Shuffling all the observations (fixed random_state so the shuffle is repeatable)
observations_shuffled = observations.sample(frac=1, random_state=99)

# Setting the fraction of observations we will use for testing
testing_fraction = 0.25
split_index = int(observations_shuffled.shape[0] * testing_fraction)

# Splitting into testing observations and training observations ("horizontal split")
observations_test = observations_shuffled.iloc[:split_index]
observations_train = observations_shuffled.iloc[split_index:]

# Splitting into testing examples and testing labels ("vertical split")
test_examples = observations_test.drop(columns='species').to_numpy()
test_labels = observations_test['species'].to_numpy()

# Splitting into training examples and training labels ("vertical split")
train_examples = observations_train.drop(columns='species').to_numpy()
train_labels = observations_train['species'].to_numpy()

# Apply PCA to our training data

if Z:
    scaler = StandardScaler()
    # z-score standardise the training examples (and store the parameters)
    train_examples = scaler.fit_transform(train_examples)

# Create a PCA model object
pca = PCA()

# Use it to process our training examples
pca.fit(train_examples)

# Plot the cumulative explained variance against the number of PCs
# The ratios are multiplied by 100, so the y axis is in percent and is labelled as such.
cumulative_variance_pct = np.cumsum(pca.explained_variance_ratio_) * 100
fig, ax = plt.subplots()
ax.plot(range(1, len(cumulative_variance_pct) + 1), cumulative_variance_pct)
ax.set_xlabel('Number of Principal Components')
ax.set_ylabel('Cumulative Explained Variance (%)')
ax.set_title('Cumulative Explained Variance vs Number of Principal Components')
ax.grid(True)
plt.show()

# Let's assume we decide to use the first 2 PCs based on the plot above...

# Create a PCA model object that only retains the first N PCs
N = 2
pca = PCA(n_components=N)

# Use it to process our training examples
pca.fit(train_examples)

# Transform the original training examples to the new ND space
train_examples = pca.transform(train_examples)

if Z:
    # z-score standardise the testing examples (using the stored parameters)
    test_examples = scaler.transform(test_examples)

# Transform the original testing examples to the new ND space
test_examples = pca.transform(test_examples)

# model training and model evaluation steps

# Create a Decision Tree model object:
model = DecisionTreeClassifier(random_state=99)

# Call the model's fitting algorithm, passing in our training examples and training labels
model.fit(train_examples, train_labels)

# Use the trained model to generate predictions for our testing examples
predictions = model.predict(test_examples)

# Find the total number of model predictions that matched with the corresponding testing labels
correct_predictions = np.sum(predictions == test_labels)
# Calculate the model's accuracy: the fraction of predictions that were correct
accuracy = correct_predictions / len(test_labels)
# Display the accuracy as a single quantitative measure of overall performance
print("Accuracy:", accuracy, "(or", round(accuracy*100, 1), "%)")

# visualise the final generalisation
# The plot is two-dimensional (PC1 against PC2) and DecisionBoundaryDisplay only supports
# models trained on exactly 2 features, so it is only drawn when N is 2. For any other N
# the accuracy above is still reported and the plot is skipped instead of raising an error.
if N == 2:
    # One figure/axes pair is created and handed to both plotting calls. Without an explicit
    # `ax`, DecisionBoundaryDisplay.from_estimator creates its own figure, so a preceding
    # plt.figure() would be left behind and rendered as an empty plot.
    fig, ax = plt.subplots()
    disp = DecisionBoundaryDisplay.from_estimator(model, train_examples, ax=ax)
    sns.scatterplot(x=train_examples[:, 0], y=train_examples[:, 1], hue=train_labels, ax=ax)
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.set_title('Decision tree: final generalisation')
    plt.show()
else:
    print("Skipping the decision boundary plot: it needs exactly 2 PCs, but N =", N)