In [None]:
# Code source: Jaques Grobler
# License: BSD 3 clause

# importing necessary python libraries
import matplotlib.pyplot as plt # for plotting
import numpy as np # for numerical computation

from sklearn import datasets, linear_model # for linear regression
from sklearn.metrics import mean_squared_error, r2_score # for error calculation

# Fetch the diabetes data as a (features, target) pair of arrays
# (return_X_y=True asks for the arrays rather than a container object).
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

# Keep a single feature: column index 2. Slicing with 2:3 (rather than
# indexing with 2) keeps the result two-dimensional, one column wide, which is
# the layout the estimator is given below.
diabetes_X = diabetes_X[:, 2:3]

# Hold out the final rows for testing; everything before them is training data.
n_test = 20

# Features: training rows first, then the held-out rows
diabetes_X_train, diabetes_X_test = diabetes_X[:-n_test], diabetes_X[-n_test:]

# Targets: split at exactly the same position so rows stay aligned
diabetes_y_train, diabetes_y_test = diabetes_y[:-n_test], diabetes_y[-n_test:]

# Create linear regression object
# The linear_model module contains implementations of various linear models.
# linear_model.LinearRegression() builds an ordinary least squares estimator,
# which assumes a linear relationship between the input variables (x) and the
# single output variable (y).
regr = linear_model.LinearRegression()

# Train the model using the training sets
# fit() learns the coefficients that minimise the residual sum of squares
# between the training targets and the model's predictions.
# Note: per the scikit-learn documentation, LinearRegression is plain ordinary
# least squares solved directly with a least-squares solver. It is not trained
# by gradient descent, so there is no learning rate, optimiser or iteration
# count to tune here.
regr.fit(diabetes_X_train, diabetes_y_train)

# Make predictions using the testing set
diabetes_y_pred = regr.predict(diabetes_X_test)

# Evaluate the predictions against the held-out targets
mse = mean_squared_error(diabetes_y_test, diabetes_y_pred)
r2 = r2_score(diabetes_y_test, diabetes_y_pred)

# The coefficients
print("Coefficients: \n", regr.coef_)
# The mean squared error
print(f"Mean squared error: {mse:.2f}")
# The coefficient of determination: 1 is perfect prediction
print(f"Coefficient of determination: {r2:.2f}")

# Plot outputs on the current axes: held-out points in black, fitted line in blue
ax = plt.gca()
ax.scatter(diabetes_X_test, diabetes_y_test, color="black")
ax.plot(diabetes_X_test, diabetes_y_pred, color="blue", linewidth=3)

# Passing an empty sequence removes the tick marks on both axes
ax.set_xticks(())
ax.set_yticks(())

plt.show()

In [None]:
# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap # for coloring the plot

from sklearn.datasets import make_circles, make_classification, make_moons # for generating datasets
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis # for QDA
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier # for ensemble methods
from sklearn.gaussian_process import GaussianProcessClassifier # for Gaussian process classifier
from sklearn.gaussian_process.kernels import RBF # for RBF kernel
from sklearn.inspection import DecisionBoundaryDisplay # for plotting decision boundaries
from sklearn.model_selection import train_test_split # for splitting the dataset into training and testing sets
from sklearn.naive_bayes import GaussianNB # for Naive Bayes
from sklearn.neighbors import KNeighborsClassifier # for KNN
from sklearn.neural_network import MLPClassifier # for neural network
from sklearn.pipeline import make_pipeline # for creating a pipeline
from sklearn.preprocessing import StandardScaler # for scaling the data
from sklearn.svm import SVC # for SVM
from sklearn.tree import DecisionTreeClassifier # for decision tree

# Display names for the classifiers compared below, in the same order as the
# `classifiers` list. Each entry is preceded by a short description of how the
# method works and which parameters matter most.
names = [
    # k-nearest neighbours: an instance-based method that labels a new point
    # with the majority class among its k closest training points.
    # Key parameter: k, the number of neighbours consulted.
    "Nearest Neighbors",
    # Linear support vector machine: finds the hyperplane that separates the
    # classes with the widest margin.
    # Key parameter: C, the regularisation strength (a small C tolerates more
    # training errors in exchange for a wider margin).
    "Linear SVM",
    # SVM with a Radial Basis Function (Gaussian) kernel: implicitly maps the
    # data into a higher-dimensional space and separates it with a hyperplane
    # there, which gives a non-linear boundary in the original space.
    # Key parameters: C, plus gamma, which controls how far the influence of a
    # single training point reaches and therefore the shape of the boundary.
    "RBF SVM",
    # Gaussian process: a non-parametric, probabilistic model that places a
    # distribution over functions; usable for regression or classification.
    # Key parameters: the kernel function and its hyperparameters.
    "Gaussian Process",
    # Decision tree: recursively splits the data on feature values; each leaf
    # holds a class (or a regression value).
    # Key parameters: maximum depth and the splitting criterion.
    "Decision Tree",
    # Random forest: an ensemble of decision trees whose predictions are
    # combined, which improves generalisation and reduces overfitting.
    # Key parameters: number of trees and the per-tree settings.
    "Random Forest",
    # Neural network (multi-layer perceptron): layers of interconnected
    # neurons with activation functions that learn complex input-output maps.
    # Key parameters: architecture (layers, neurons per layer), activation
    # function and learning rate.
    "Neural Net",
    # AdaBoost: an ensemble that combines many weak learners (usually shallow
    # decision trees), giving more weight to previously misclassified samples.
    # Key parameters: number of weak learners and their settings.
    "AdaBoost",
    # Naive Bayes: a probabilistic classifier based on Bayes' theorem that
    # assumes the features are independent; often effective on text.
    # Key parameters: the assumed feature distribution (e.g. Gaussian,
    # multinomial) and any smoothing.
    "Naive Bayes",
    # Quadratic Discriminant Analysis: models each class with its own
    # covariance matrix, which yields quadratic decision boundaries and more
    # flexibility than a linear model.
    # Key parameters: the per-class covariance estimates.
    "QDA",
]

# One estimator per entry of `names`, in the same order. Wherever an estimator
# accepts a seed, random_state=42 is passed so repeated runs match.
classifiers = [
    # k-NN voting over the 3 nearest training points
    KNeighborsClassifier(n_neighbors=3),
    # Linear-kernel SVM with a small C (strong regularisation)
    SVC(kernel="linear", C=0.025, random_state=42),
    # RBF-kernel SVM ("rbf" is SVC's default kernel, spelled out here)
    SVC(kernel="rbf", gamma=2, C=1, random_state=42),
    # Gaussian process with a scaled RBF kernel as its prior covariance
    GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0), random_state=42),
    # Single tree, limited to depth 5
    DecisionTreeClassifier(max_depth=5, random_state=42),
    # Ten depth-5 trees, each split considering one feature
    RandomForestClassifier(
        n_estimators=10, max_depth=5, max_features=1, random_state=42
    ),
    # Multi-layer perceptron with L2 penalty alpha=1 and up to 1000 iterations
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    # Boosted ensemble with default weak learners
    AdaBoostClassifier(random_state=42),
    # Gaussian naive Bayes, default settings
    GaussianNB(),
    # Quadratic discriminant analysis, default settings
    QuadraticDiscriminantAnalysis(),
]

# Build a synthetic binary classification problem that is (roughly) linearly
# separable: two features, both informative, one cluster per class.
X, y = make_classification(
    n_features=2,  # dimensions of each data point
    n_informative=2,  # features that actually carry class information
    n_redundant=0,  # no features derived as linear combinations of the others
    n_clusters_per_class=1,  # each class is a single cluster
    random_state=1,  # fixed seed: the same dataset on every run
)

# Seeded generator so the noise added below is reproducible as well.
rng = np.random.RandomState(2)

# Shift every feature value by uniform noise drawn from [0, 2) to add some
# variability to the data. The update is done in place on X.
noise = 2 * rng.uniform(size=X.shape)
X += noise

# Bundle features and labels as an (X, y) pair, the same layout that
# make_moons and make_circles return.
linearly_separable = (X, y)

# The three toy problems the classifiers are compared on.
# NOTE(review): this rebinds the name `datasets`, which the first cell imported
# as the sklearn module (`from sklearn import datasets`). Re-running that cell
# after this one would call `load_diabetes` on this list and fail. The name is
# kept because the plotting loop below reads it.
datasets = [
    make_moons(noise=0.3, random_state=0),
    make_circles(noise=0.2, factor=0.5, random_state=1),
    linearly_separable,
]

figure = plt.figure(figsize=(27, 9))

# Grid layout: one row per dataset; the first column shows the raw input data
# and every following column shows one classifier.
n_rows = len(datasets)
n_cols = len(classifiers) + 1

# Colours shared by all panels: a diverging map for the decision surface and a
# two-colour (red / blue) map for the class labels of the points.
cm = plt.cm.RdBu
cm_bright = ListedColormap(["#FF0000", "#0000FF"])


def _scatter_split(ax, X_train, X_test, y_train, y_test):
    """Draw training points (opaque) and testing points (semi-transparent) on ax."""
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k")
    ax.scatter(
        X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, edgecolors="k", alpha=0.6
    )


def _frame_axes(ax, x_min, x_max, y_min, y_max):
    """Fix the visible range of ax and remove its tick marks."""
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())


# One pass per dataset (one row of the grid)
for ds_cnt, (X, y) in enumerate(datasets):
    # Hold out 40% of the samples for testing, with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=42
    )

    # Plot range: the extent of each feature, padded by 0.5 on both sides
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5

    # Subplot numbers are 1-based and run row by row, so this row starts here
    first_panel = ds_cnt * n_cols + 1

    # Leading panel: the dataset on its own, before any classifier is applied
    ax = plt.subplot(n_rows, n_cols, first_panel)
    if ds_cnt == 0:
        ax.set_title("Input data")
    _scatter_split(ax, X_train, X_test, y_train, y_test)
    _frame_axes(ax, x_min, x_max, y_min, y_max)

    # Remaining panels of the row: one per classifier
    for panel, (name, estimator) in enumerate(
        zip(names, classifiers), start=first_panel + 1
    ):
        ax = plt.subplot(n_rows, n_cols, panel)

        # Standardise the features, then apply the classifier; the pipeline is
        # fitted on the training split only.
        clf = make_pipeline(StandardScaler(), estimator)
        clf.fit(X_train, y_train)

        # Accuracy on the held-out split
        score = clf.score(X_test, y_test)

        # Shade the panel according to the fitted pipeline's decision surface
        DecisionBoundaryDisplay.from_estimator(
            clf, X, cmap=cm, alpha=0.8, ax=ax, eps=0.5
        )

        # Overlay the data points on top of the decision surface
        _scatter_split(ax, X_train, X_test, y_train, y_test)
        _frame_axes(ax, x_min, x_max, y_min, y_max)

        # Column titles are only needed on the first row
        if ds_cnt == 0:
            ax.set_title(name)

        # Accuracy in the lower-right corner, without the leading zero
        # (e.g. ".93" rather than "0.93")
        ax.text(
            x_max - 0.3,
            y_min + 0.3,
            f"{score:.2f}".lstrip("0"),
            size=15,
            horizontalalignment="right",
        )

plt.tight_layout()
plt.show()