# Intelligent Systems 3: Probabilistic and Deep Learning

### Question 1: Regression and classification (42 marks)
a. Train and evaluate a least squares linear regression model predicting the value of
variable D from variables A, B and C.

In [None]:
# This line is needed to use matplotlib in Jupyter notebook
%matplotlib inline
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import scale, normalize
import pandas as pd
import matplotlib.pyplot as plt
from copy import deepcopy
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
np.set_printoptions(suppress=True)

def split_data(data, train_size):
    # Shuffles the data but always in the same way to valid comparisons of models
    np.random.seed(10)
    np.random.shuffle(data)

    # separates D as the target variable
    X, y = data[:,:-1], data[:,-1]

    # Splits the data into training and test sets
    X_train, X_test = X[:-train_size], X[-train_size:]
    y_train, y_test = y[:-train_size], y[-train_size:]
    return X_train, X_test, y_train, y_test, X, y

In [None]:
def least_squares_regression(data, train_size, v=0):
    """Runs linear least squares regression on the data. 
    Normalises and scales the data if specified.
    v=0 no preprocessing, v=1 normalise, v=2 scale, v=3 normalise and scale."""

    # Creates a copy of the data so the original data is not modified by the function
    data = deepcopy(data)
    
    match v:
        case 1:
            data = normalize(data, axis=0) # axis=0 normalises the data by feature
        case 2:
            data = scale(data)
        case 3:
            data = normalize(data, axis=0) # axis=0 normalises the data by feature
            data = scale(data)
    
    # Splits the data into training and test sets
    X_train, X_test, y_train, y_test, _, _ = split_data(data, train_size)

    # Train model and make predictions
    regr = linear_model.LinearRegression()
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)

    # Prints the results
    print("Coefficients of A, B & C:", regr.coef_)
    print('Mean squared error: %.2f'
        % mean_squared_error(y_test, y_pred))
    print('Coefficient of determination (R2): %.2f'
        % r2_score(y_test, y_pred))

# Imports the data for question 1 and 2
DATA = data = np.genfromtxt('data.csv', delimiter=',', skip_header=1, dtype=float)

In [None]:
print("-" * 32 + " QUESTION 1 (a) " + "-" * 32)
least_squares_regression(data=DATA, train_size=75)
print("-" * 76)

b. Repeat the above task after carrying out in turn data normalisation, data scaling and
their combination, and evaluate the benefits of each of these 3 types of data preprocessing.

In [None]:
print("-" * 32 + " QUESTION 1 (b) " + "-" * 32)

# 1. Performs Data Normalisation
print("Data Normalisation:")
least_squares_regression(data=DATA, train_size=75, v=1)
print("-" * 76)

# 2. Performs Data Scaling
print("Data scaling")
least_squares_regression(data=DATA, train_size=75, v=2)
print("-" * 76)

# 3. Performs Data Normalisation and Data Scaling
print("Norm then Scale")
least_squares_regression(data=DATA, train_size=75, v=3)
print("-" * 76)


The combination of the two (normalise then scale) gives the same outcome as just scaling. This is because both are location-scale transformations. 


You should realise that an error measure like MSE is relative. You get a lower MSE after normalising probably because the actual values in y_test and preds are way larger.

c. Try to outperform the best result of the previous step by using regularisation (e.g. L1,
L2 or Elastic Net). Show how any parameter values are tuned and evaluate the benefits of
regularisation.

In [None]:
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def regularisation(data, train_size):
    # Creates a copy of the data so the original data is not modified by the function
    data = deepcopy(data)

    # Splits the data into training and test sets
    X_train, X_test, y_train, y_test, X, y = split_data(data, train_size)

    pipe = Pipeline([('Scaler', StandardScaler()), ('Ridge', RidgeCV(alphas=np.logspace(-6, 6, 13)))])
    print('Testing complexity parameter values (i.e.alphas): ', np.logspace(-6, 6, 13))
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    print('Cross-validation got this value for the complexity parameter: ', pipe.named_steps['Ridge'].alpha_)

    for i, name in enumerate(["A", "B", "C"]):
        print('Parameter for {0} is {1}'.format(name,pipe.named_steps['Ridge'].coef_[i]))
    
    print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
    print('Coefficient of determination (R2): %.2f' % r2_score(y_test, y_pred))

    # Test of generalisation (10-fold cross-validation)
    scores = cross_val_score(pipe, X, y, cv=10)

    print("\nCross-Validation Accuracy:")
    print("min=" + str(scores.min().round(4)*100) + "%\n" +
            "max=" + str(scores.max().round(4)*100) + "%\n" +
            "mean=" + str(scores.mean().round(4)*100) + "%\n" +
            "std=" + str(scores.std().round(4)*100) + "%")
    print("\n\n")

print("-" * 32 + " QUESTION 1 (c) " + "-" * 32)
regularisation(data=DATA, train_size=75)
print("-" * 76)

# its choosing a low value for alpha so showing that it doesnt have an effect really?


4. Add a set of suitable basis functions to the original data and train a linear regression
with an appropriate type of regularisation to find out which of the new basis functions bring
benefits. Explain briefly (in no more than 4 sentences) your reasoning.

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html

https://jakevdp.github.io/PythonDataScienceHandbook/05.06-linear-regression.html

colab notebook example in week 5 lectures

In [50]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures


def polynomial_regression(data, train_size):
    data = deepcopy(data)
    
    # splits the data into training and test sets
    X_train, X_test, y_train, y_test, X, y = split_data(data, train_size)

    pipe = Pipeline([("BasisFunctions", PolynomialFeatures(3)), ("RidgeCV", RidgeCV(alphas=np.logspace(-6, 6, 13)))])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    print('Cross-validation got this value for the complexity parameter: ', pipe.named_steps['RidgeCV'].alpha_)
    for i, name in enumerate(["1 (bias)", "A", "B", "C", "A^2", "AB", "AC", "B^2", "BC", "C^2", "A^3", "(A^2)B", "(A^2)C", "A(B^2)", "ABC", "A(C^2)", "B^3", "(B^2)C", "B(C^2)", "C^3"]):
        print('Parameter for {0} is {1}'.format(name,pipe.named_steps['RidgeCV'].coef_[i]))
    print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
    print('Coefficient of determination (R2): %.2f' % r2_score(y_test, y_pred))

print("-" * 32 + " QUESTION 1 (d) " + "-" * 32)
polynomial_regression(DATA, 75)
print("-" * 76)


-------------------------------- QUESTION 1 (d) --------------------------------
Cross-validation got this value for the complexity parameter:  100.0
Parameter for 1 (bias) is 0.0
Parameter for A is 0.0001596185791835937
Parameter for B is 0.00025287887263919506
Parameter for C is 0.00018114018536565663
Parameter for A^2 is -0.0012686883315849995
Parameter for AB is -0.0004051410337876371
Parameter for AC is -0.0014317055256297118
Parameter for B^2 is 0.0019005886951444229
Parameter for BC is -0.0004470262421084614
Parameter for C^2 is -0.001615618264015961
Parameter for A^3 is 0.00655774382475216
Parameter for (A^2)B is -0.0023661436320647766
Parameter for (A^2)C is 0.007378014919337428
Parameter for A(B^2) is -0.007821892975950817
Parameter for ABC is -0.002705586058811882
Parameter for A(C^2) is 0.008300671227821521
Parameter for B^3 is 0.006716509637648471
Parameter for (B^2)C is -0.008780774728796459
Parameter for B(C^2) is -0.0030929062652758077
Parameter for C^3 is 0.00933846473

5. Implement an appropriate automated procedure that will train all of the above models
and select the model expected to perform best on unseen data with the same distribution as your
training data. You need to include a code tile at the end of this section of your Jupyter notebook
that attempts to test your final choice of model on a data set stored in a file unseendata.csv
and compute $R^2$ for it. The file will have exactly the same format as file data.csv, including
the header, but possibly a different overall number of rows. This means you can use a renamed
copy of data.csv to debug that part of your code, and to produce the corresponding content
for your PDF file (in order to demonstrate that this part of the code is in working order).


6. Starting with the data in data.csv, find the median value of variable D. Replace all
values up to and including the median value with 0, and all values greater than that with 1. Treat
the resulting values of D as class labels to train and evaluate a classifier based on logistic
regression that takes variables A, B and C as input.

In [55]:
def logistic_regression(data, train_size):
        # Replaces the last column with 1 if the value is greater than the median, 0 otherwise.
        median = np.median(data[:, 3])
        data[:, 3] = np.where(data[:, 3] > median, 1, 0)

        # Splits the data into training and test sets
        X_train, X_test, y_train, y_test, X, y = split_data(data, train_size)

        # Runs logistic regression
        logisticRegr = LogisticRegression()
        logisticRegr.fit(X_train, y_train)

        predictions = logisticRegr.predict(X_test)
        probability_predictions = logisticRegr.predict_proba(X_test)

        # Shows first 5 predictions
        print("First 5 example predictions:")
        for i in range(5):
                print("Predicted class:", predictions[i], 
                        "(Probability:" + str(probability_predictions[i][int(predictions[i])].round(4)) + ")")

        # Use score method to get out of sample accuracy of the model (2 decimal places)

        score = logisticRegr.score(X_test, y_test)
        print("\nOut of sample accuracy = " + str(score.round(2)*100) + "%")

        # Test of generalisation (10-fold cross-validation)
        scores = cross_val_score(logisticRegr, X, y, cv=10)

        print("\nCross-Validation Accuracy (Test of generalisation):")
        print("min=" + str(scores.min().round(4)*100) + "%\n" +
                "max=" + str(scores.max().round(4)*100) + "%\n" +
                "mean=" + str(scores.mean().round(4)*100) + "%\n" +
                "std=" + str(scores.std().round(4)*100) + "%")

print("-" * 32 + " QUESTION f (d) " + "-" * 32)
logistic_regression(DATA, 75)
print("-" * 76)

-------------------------------- QUESTION f (d) --------------------------------
First 5 example predictions:
Predicted class: 0.0 (Probability:0.9875)
Predicted class: 0.0 (Probability:0.9966)
Predicted class: 1.0 (Probability:0.9894)
Predicted class: 0.0 (Probability:0.8681)
Predicted class: 0.0 (Probability:0.9659)

Out of sample accuracy = 91.0%

Cross-Validation Accuracy:
min=66.67%
max=100.0%
mean=89.56%
std=11.59%
----------------------------------------------------------------------------


# Question 2: Principal Component Analysis (8 marks)
Starting with the same data.csv file from Q1, extend the table with 6 additional columns
consisting of the product of each pair of the original 4 variables A, B, C and D.
Apply principal component analysis (PCA) with a number of principal components (PCs) equal to
the number of original variables, i.e. p = 4. Label the resulting principal components in
decreasing order of variance as PC1. . .PC4 and list the linear equations showing how each of
them is calculated from the 10 input variables. Describe which variables affect most strongly
each of the 4 principal components, highlighting any notable findings and providing plausible
explanations for them.

In [None]:
from sklearn.decomposition import PCA
from itertools import combinations

data = np.loadtxt('data.csv',delimiter=',',skiprows=1)
scaler = StandardScaler()
scaler.fit(data)
scaled_data = scaler.transform(data)

# extend the table with 6 additional column consisting of the product of each pair of the original 4 variables A, B, C and D
for i, j in combinations(range(4), 2):
    scaled_data = np.hstack((scaled_data, np.atleast_2d(np.multiply(scaled_data[:, i], scaled_data[:, j])).T))


pca = PCA(n_components=4)
pca.fit(scaled_data)
newData = pca.fit_transform(scaled_data)

NewTotVar = 0
for i, dim in enumerate(('PC1','PC2','PC3','PC4')):
  Var2 = np.var(newData[:,i])
  NewTotVar += Var2

  print('Here is the sample variance for the {0} dimension'.format(dim))
  
  print('var = {0}\n\n'.format(Var2))
