# Intelligent Systems 3: Probabilistic and Deep Learning

### Question 1: Regression and classification (42 marks)
1. Train and evaluate a least squares linear regression model predicting the value of
variable D from variables A, B and C.

In [109]:
# This line is needed to use matplotlib in Jupyter notebook
%matplotlib inline
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import scale, normalize
import pandas as pd
import matplotlib.pyplot as plt
from copy import deepcopy
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
np.set_printoptions(suppress=True)

In [110]:
def least_squares_regression(data, train_size, v=0):
    """Runs linear least squares regression on the data. 
    Normalises and scales the data if specified."""
    # Creates a copy of the data so the original data is not modified by the function
    data = deepcopy(data)
    
    match v:
        case 1:
            # axis=0 normalises the data by column (by feature)
            data = normalize(data, axis=0)
        case 2:
            data = scale(data)
        case 3:
            # axis=0 normalises the data by column (by feature)
            data = normalize(data, axis=0)
            data = scale(data)
    
    # Shuffles the data but always in the same way to valid comparisons of models
    np.random.seed(0)
    np.random.shuffle(data)

    # separates D as the target variable
    X, y= data[:,:-1], data[:,-1]

    # Split the data into training/testing sets
    X_train, X_test = X[:-train_size], X[-train_size:]

    # Split the targets into training/testing sets
    y_train, y_test = y[:-train_size], y[-train_size:]

    # Create linear regression object
    regr = linear_model.LinearRegression()

    # Train the model using the training sets
    regr.fit(X_train, y_train)

    # Make predictions using the testing set
    y_pred = regr.predict(X_test)

    # The coefficients
    print("Coefficients of A, B & C:", regr.coef_)

    # The mean squared error
    print('Mean squared error: %.2f'
        % mean_squared_error(y_test, y_pred))

    # The coefficient of determination: 
    #   1 is perfect prediction
    #   0 is as good as always predicting the mean output value (using the training data).
    #   negative values are for a model that is worse than just predicting the mean.
    print('Coefficient of determination (R2): %.2f'
        % r2_score(y_test, y_pred))

# Imports the data for question 1 and 2
DATA = data = np.genfromtxt('data.csv', delimiter=',', skip_header=1, dtype=float)

In [111]:
print("-" * 32 + " QUESTION 1 (a) " + "-" * 32)
least_squares_regression(data=DATA, train_size=75)
print("-" * 76)

-------------------------------- QUESTION 1 (a) --------------------------------
Coefficients of A, B & C: [ 2.05729568e+12 -1.31857118e+12  2.79130714e+10]
Mean squared error: 0.32
Coefficient of determination (R2): 0.86
----------------------------------------------------------------------------


2. Repeat the above task after carrying out in turn data normalisation, data scaling and
their combination, and evaluate the benefits of each of these 3 types of data preprocessing.

In [112]:
print("-" * 32 + " QUESTION 1 (b)" + "-" * 32)

# 1. Performs Data Normalisation
print("Data Normalisation:")
least_squares_regression(data=DATA, train_size=75, v=1)
print("-" * 76)

# 2. Performs Data Scaling
print("Data scaling")
least_squares_regression(data=DATA, train_size=75, v=2)
print("-" * 76)

# 3. Performs Data Normalisation and Data Scaling
print("Norm -> Scale")
least_squares_regression(data=DATA, train_size=75, v=3)
print("-" * 76)

-------------------------------- QUESTION 1 (b)--------------------------------
Data Normalisation:
Coefficients of A, B & C: [ 2.44529946e+11 -1.12158356e+12  3.31144444e+11]
Mean squared error: 0.00
Coefficient of determination (R2): 0.86
----------------------------------------------------------------------------
Data scaling
Coefficients of A, B & C: [0.31674265 0.31674265 0.31674265]
Mean squared error: 0.14
Coefficient of determination (R2): 0.86
----------------------------------------------------------------------------
Norm -> Scale
Coefficients of A, B & C: [0.31674265 0.31674265 0.31674265]
Mean squared error: 0.14
Coefficient of determination (R2): 0.86
----------------------------------------------------------------------------


https://stats.stackexchange.com/questions/562888/why-not-both-standardize-and-normalize-features-for-machine-learning

Standard -> Norm
That is equivalent to normalizing only X since the standardization step does not change the min/max values. Besides, these transformations are not associated with normality assumption.

Norm -> Standard
If you did the transforms in the opposite order "normalize" then "standardize" you would get the same as just "standardize". Both are location-scale transformations

You should realise that an error measure like MSE is relative. You get a lower MSE after normalising probably because the actual values in y_test and preds are way larger.

3. Try to outperform the best result of the previous step by using regularisation (e.g. L1,
L2 or Elastic Net). Show how any parameter values are tuned and evaluate the benefits of
regularisation.

In [113]:
from sklearn.linear_model import RidgeCV, LassoCV

def regularisation(data, train_size, v):
    if v == 0:
        print("NO PREPROCESSING")
    if v ==1:
        print("NORMALISED")
    elif v == 2:
        print("SCALED")
    match v:
        case 1:
            # axis=0 normalises the data by column (by feature)
            data = normalize(data, axis=0)
        case 2:
            data = scale(data)
        case 3:
            # axis=0 normalises the data by column (by feature)
            data = normalize(data, axis=0)
            data = scale(data)

    # Shuffles the data but always in the same way to valid comparisons of models
    np.random.seed(0)
    np.random.shuffle(data)

    # separates D as the target variable
    X, y= data[:,:-1], data[:,-1]

    # Split the data into training/testing sets
    X_train, X_test = X[:-train_size], X[-train_size:]

    # Split the targets into training/testing sets
    y_train, y_test = y[:-train_size], y[-train_size:]

    for model in RidgeCV, LassoCV:
        print(RidgeCV.__name__ if model == RidgeCV else LassoCV.__name__)
        print("-" * 80)
        print('Trying these complexity parameter values (i.e.alphas): ', np.logspace(-6, 6, 13))
        reg = model(alphas=np.logspace(-6, 6, 13))
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_test)

        print('Using cross-validation got this value for the complexity parameter: ', reg.alpha_)

        for i, name in enumerate(["A", "B", "C"]):
            print('Parameter for {0} is {1}'.format(name,reg.coef_[i]))
        
        print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
        print('Coefficient of determination (R2): %.2f' % r2_score(y_test, y_pred))

        # Test of generalisation (10-fold cross-validation)
        scores = cross_val_score(reg, X, y, cv=10)

        print("\nCross-Validation Accuracy:")
        print("min=" + str(scores.min().round(4)*100) + "%\n" +
                "max=" + str(scores.max().round(4)*100) + "%\n" +
                "mean=" + str(scores.mean().round(4)*100) + "%\n" +
                "std=" + str(scores.std().round(4)*100) + "%")
    print("\n\n")


regularisation(data=DATA, train_size=75, v=0)
regularisation(data=DATA, train_size=75, v=1)
regularisation(data=DATA, train_size=75, v=2)

# its choosing a low value for alpha so showing that it doesnt have an effect really?


NO PREPROCESSING
RidgeCV
--------------------------------------------------------------------------------
Trying these complexity parameter values (i.e.alphas):  [      0.000001       0.00001        0.0001         0.001
       0.01           0.1            1.            10.
     100.          1000.         10000.        100000.
 1000000.      ]
Using cross-validation got this value for the complexity parameter:  1e-06
Parameter for A is 0.09921004087664187
Parameter for B is 0.1571754573378712
Parameter for C is 0.11258667567744851
Mean squared error: 0.32
Coefficient of determination (R2): 0.86

Cross-Validation Accuracy:
min=61.72%
max=96.58%
mean=84.57000000000001%
std=10.12%
LassoCV
--------------------------------------------------------------------------------
Trying these complexity parameter values (i.e.alphas):  [      0.000001       0.00001        0.0001         0.001
       0.01           0.1            1.            10.
     100.          1000.         10000.        100000.

4. Add a set of suitable basis functions to the original data and train a linear regression
with an appropriate type of regularisation to find out which of the new basis functions bring
benefits. Explain briefly (in no more than 4 sentences) your reasoning.

5. Implement an appropriate automated procedure that will train all of the above models
and select the model expected to perform best on unseen data with the same distribution as your
training data. You need to include a code tile at the end of this section of your Jupyter notebook
that attempts to test your final choice of model on a data set stored in a file unseendata.csv
and compute $R^2$ for it. The file will have exactly the same format as file data.csv, including
the header, but possibly a different overall number of rows. This means you can use a renamed
copy of data.csv to debug that part of your code, and to produce the corresponding content
for your PDF file (in order to demonstrate that this part of the code is in working order).


6. Starting with the data in data.csv, find the median value of variable D. Replace all
values up to and including the median value with 0, and all values greater than that with 1. Treat
the resulting values of D as class labels to train and evaluate a classifier based on logistic
regression that takes variables A, B and C as input.

In [107]:
np.set_printoptions(suppress=True)
np.random.seed(2)
train_size = 75

# Loads the dataset (skipping the headers) and shuffles the data.
data = np.genfromtxt('data.csv', delimiter=',', skip_header=1, dtype=float)
np.random.shuffle(data)

# Replaces the last column with 1 if the value is greater than the median, 0 otherwise.
median = np.median(data[:, 3])
data[:, 3] = np.where(data[:, 3] > median, 1, 0)

# separates D as the class variable
X = data[:,:-1]
y = data[:,-1]

# Split the data into training/testing sets
X_train = X[:-train_size]
X_test = X[-train_size:]

# Split the classifications into training/testing sets
y_train = y[:-train_size]
y_test = y[-train_size:]

# Runs logistic regression
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train, y_train)

predictions = logisticRegr.predict(X_test)
probability_predictions = logisticRegr.predict_proba(X_test)

# Shows first 5 predictions
print("First 5 example predictions:")
for i in range(5):
    print("Predicted class:", predictions[i], 
            "(Probability:" + str(probability_predictions[i][int(predictions[i])].round(4)) + ")")

# Use score method to get out of sample accuracy of the model (2 decimal places)

score = logisticRegr.score(X_test, y_test)
print("\nOut of sample accuracy = " + str(score.round(2)*100) + "%")

# Test of generalisation (10-fold cross-validation)
scores = cross_val_score(logisticRegr, X, y, cv=10)

print("\nCross-Validation Accuracy:")
print("min=" + str(scores.min().round(4)*100) + "%\n" +
        "max=" + str(scores.max().round(4)*100) + "%\n" +
        "mean=" + str(scores.mean().round(4)*100) + "%\n" +
        "std=" + str(scores.std().round(4)*100) + "%")

First 5 example predictions:
Predicted class: 0.0 (Probability:0.9988)
Predicted class: 1.0 (Probability:0.9989)
Predicted class: 0.0 (Probability:0.8114)
Predicted class: 0.0 (Probability:0.6595)
Predicted class: 0.0 (Probability:0.9999)

Out of sample accuracy = 91.0%

Cross-Validation Accuracy:
min=66.67%
max=100.0%
mean=91.33%
std=9.65%


# Question 2: Principal Component Analysis (8 marks)
Starting with the same data.csv file from Q1, extend the table with 6 additional columns
consisting of the product of each pair of the original 4 variables A, B, C and D.
Apply principal component analysis (PCA) with a number of principal components (PCs) equal to
the number of original variables, i.e. p = 4. Label the resulting principal components in
decreasing order of variance as PC1. . .PC4 and list the linear equations showing how each of
them is calculated from the 10 input variables. Describe which variables affect most strongly
each of the 4 principal components, highlighting any notable findings and providing plausible
explanations for them.

In [108]:
from sklearn.decomposition import PCA
from itertools import combinations

data = np.loadtxt('data.csv',delimiter=',',skiprows=1)
scaler = StandardScaler()
scaler.fit(data)
scaled_data = scaler.transform(data)

# extend the table with 6 additional column consisting of the product of each pair of the original 4 variables A, B, C and D
for i, j in combinations(range(4), 2):
    scaled_data = np.hstack((scaled_data, np.atleast_2d(np.multiply(scaled_data[:, i], scaled_data[:, j])).T))


pca = PCA(n_components=4)
pca.fit(scaled_data)
newData = pca.fit_transform(scaled_data)

NewTotVar = 0
for i, dim in enumerate(('PC1','PC2','PC3','PC4')):
  Var2 = np.var(newData[:,i])
  NewTotVar += Var2

  print('Here is the sample variance for the {0} dimension'.format(dim))
  
  print('var = {0}\n\n'.format(Var2))


NameError: name 'StandardScaler' is not defined