# Intelligent Systems 3: Probabilistic and Deep Learning

### Question 1: Regression and classification (42 marks)
1. Train and evaluate a least squares linear regression model predicting the value of
variable D from variables A, B and C.

In [1]:
# This line is needed to use matplotlib in Jupyter notebook
%matplotlib inline
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt

In [2]:

def least_squares_regression(data, train_size, pipeline):
    """Fit ``pipeline`` on the leading rows of ``data`` and report hold-out metrics.

    The last column of ``data`` is treated as the target (D) and every other
    column as a feature (A, B, C). Rows are split in their existing order;
    no shuffling is done here.

    Parameters
    ----------
    data : 2-D numpy array
        One sample per row; the last column is the regression target.
    train_size : int
        NOTE: despite its name, this is the number of *trailing* rows held
        out for testing. The model is trained on all the rows before them.
        The parameter name is kept so existing keyword calls keep working.
    pipeline : object with ``fit``, ``predict`` and ``steps``
        Typically an unfitted ``sklearn.pipeline.Pipeline`` whose final step
        is a linear model exposing ``coef_`` and ``intercept_`` once fitted.
        The final step may have any name.

    Returns
    -------
    dict
        ``{'coefficients', 'intercept', 'mse', 'r2'}`` for the fitted model,
        with ``mse`` and ``r2`` computed on the held-out rows. The same four
        values are also printed.

    Raises
    ------
    ValueError
        If ``train_size`` would leave either the training or the test split
        empty (``train_size <= 0`` or ``train_size >= number of rows``).
    """
    n_rows = data.shape[0]
    # Guard the slicing below: X[:-0] is an empty array, and an oversized
    # value leaves no training rows, both of which fail obscurely later on.
    if not 0 < train_size < n_rows:
        raise ValueError(
            "train_size is the number of held-out rows and must satisfy "
            "0 < train_size < %d, got %r" % (n_rows, train_size)
        )

    # separates D (the last column) as the target variable
    X = data[:, :-1]
    y = data[:, -1]

    # Leading rows train the model, the trailing `train_size` rows test it.
    X_train, X_test = X[:-train_size], X[-train_size:]
    y_train, y_test = y[:-train_size], y[-train_size:]

    # Train the model using the training sets
    pipeline.fit(X_train, y_train)

    # Make predictions using the testing set
    y_pred = pipeline.predict(X_test)

    # Read the fitted model from the final step rather than by a fixed step
    # name, so pipelines ending in another linear model work as well.
    model = pipeline.steps[-1][1]

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # The coefficients
    print("Coefficients of A, B & C:", model.coef_)

    # The intercept
    print("Intercept:", model.intercept_)

    # The mean squared error
    print('Mean squared error: %.2f' % mse)

    # The coefficient of determination:
    #   1 is perfect prediction
    #   0 is as good as always predicting the mean of the test targets
    #   negative values are for a model that is worse than predicting that mean.
    print('Coefficient of determination (R2): %.2f' % r2)

    return {
        'coefficients': model.coef_,
        'intercept': model.intercept_,
        'mse': mse,
        'r2': r2,
    }

In [3]:
# Question 1 (a): baseline least-squares fit on the raw, unprocessed features.
banner_rule = "-" * 32
print(banner_rule + " QUESTION 1 (a) " + banner_rule)

# Read the CSV as floats, dropping the single header row.
data = np.genfromtxt('data.csv', dtype=float, delimiter=',', skip_header=1)

# One-step pipeline: the linear model only, no preprocessing in front of it.
pipeline = Pipeline(steps=[('LinearRegression', LinearRegression())])

# train_size=75: the helper holds out the last 75 rows for evaluation.
least_squares_regression(data=data, train_size=75, pipeline=pipeline)

print(76 * "-")


-------------------------------- QUESTION 1 (a) --------------------------------
Coefficients of A, B & C: [-2.94826258e+13  5.23720052e+13 -4.71336538e+13]
Intercept: -524172862739197.44
Mean squared error: 0.43
Coefficient of determination (R2): 0.55
----------------------------------------------------------------------------


2. Repeat the above task after carrying out in turn data normalisation, data scaling and
their combination, and evaluate the benefits of each of these 3 types of data preprocessing.

In [4]:
# Question 1 (b): repeat the regression with normalisation, scaling, and both.
# Imports stay in this cell so it runs on its own; MinMaxScaler is not used
# below but remains imported for any later cell that expects it.
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler

print("-" * 32 + " QUESTION 1 (b)" + "-" * 32)

# Load once and reuse: the helper only slices `data`, and the preprocessing
# steps below copy their input by default, so the array is not modified.
data = np.genfromtxt('data.csv', delimiter=',', skip_header=1, dtype=float)

# 1. Data normalisation: Normalizer rescales each sample (row) to unit norm.
print("Normalizer:")
normalizer_pipeline = Pipeline([('Normalizer', Normalizer()), ('LinearRegression', LinearRegression())])
# Alias kept for compatibility: this pipeline was previously stored under a
# name suggesting MinMaxScaler, although it has always used Normalizer.
minmax_scaled_pipeline = normalizer_pipeline
least_squares_regression(data=data, pipeline=normalizer_pipeline, train_size=75)

# 2. Data scaling: StandardScaler standardises each feature (column).
print("\nScaler:")
std_scaled_pipeline = Pipeline([('Standard Scaler', StandardScaler()), ('LinearRegression', LinearRegression())])
least_squares_regression(data=data, pipeline=std_scaled_pipeline, train_size=75)

# 3. Combination of the two, as the task asks for all three variants.
# Order chosen here: normalise rows first, then standardise columns; swap the
# first two steps to evaluate the opposite ordering.
print("\nNormalizer + Scaler:")
normalized_scaled_pipeline = Pipeline([
    ('Normalizer', Normalizer()),
    ('Standard Scaler', StandardScaler()),
    ('LinearRegression', LinearRegression()),
])
least_squares_regression(data=data, pipeline=normalized_scaled_pipeline, train_size=75)

-------------------------------- QUESTION 1 (b)--------------------------------
Normalizer:
Coefficients of A, B & C: [10.32618043 53.79900775 11.90608955]
Intercept: -53.56647174364234
Mean squared error: 11.98
Coefficient of determination (R2): -11.48

Scaler:
Coefficients of A, B & C: [0.2139024 0.2139024 0.2139024]
Intercept: 1.8055500000000004
Mean squared error: 0.65
Coefficient of determination (R2): 0.32


3. Try to outperform the best result of the previous step by using regularisation (e.g. L1,
L2 or Elastic Net). Show how any parameter values are tuned and evaluate the benefits of
regularisation.

4. Add a set of suitable basis functions to the original data and train a linear regression
with an appropriate type of regularisation to find out which of the new basis functions bring
benefits. Explain briefly (in no more than 4 sentences) your reasoning.

5. Implement an appropriate automated procedure that will train all of the above models
and select the model expected to perform best on unseen data with the same distribution as your
training data. You need to include a code tile at the end of this section of your Jupyter notebook
that attempts to test your final choice of model on a data set stored in a file unseendata.csv
and compute $R^2$ for it. The file will have exactly the same format as file data.csv, including
the header, but possibly a different overall number of rows. This means you can use a renamed
copy of data.csv to debug that part of your code, and to produce the corresponding content
for your PDF file (in order to demonstrate that this part of the code is in working order).


6. Starting with the data in data.csv, find the median value of variable D. Replace all
values up to and including the median value with 0, and all values greater than that with 1. Treat
the resulting values of D as class labels to train and evaluate a classifier based on logistic
regression that takes variables A, B and C as input.

In [11]:
# Question 1 (6): binarise D at its median and fit a logistic-regression classifier.
from sklearn.linear_model import LogisticRegression

# Fixed seed so the shuffle (and therefore the split and the score) is repeatable.
np.random.seed(2)
# NOTE: despite the name, this many trailing rows are held out for testing;
# the classifier is trained on the rows before them.
train_size = 75

# Loads the dataset (skipping the headers) and shuffles the rows in place.
data = np.genfromtxt('data.csv', delimiter=',', skip_header=1, dtype=float)
np.random.shuffle(data)

# Replaces the last column (D) with 1 if the value is greater than the median, 0 otherwise.
# The median is taken before the column is overwritten.
median = np.median(data[:, -1])
data[:, -1] = np.where(data[:, -1] > median, 1, 0)

# separates D as the class variable
X = data[:, :-1]
y = data[:, -1]

# Split the data into training/testing sets
X_train, X_test = X[:-train_size], X[-train_size:]

# Split the classifications into training/testing sets
y_train, y_test = y[:-train_size], y[-train_size:]

# Runs logistic regression
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train, y_train)

# Keep the hold-out predictions instead of discarding them, so they can be
# inspected (e.g. for a confusion matrix) after the cell has run.
y_pred = logisticRegr.predict(X_test)

# Use score method to get out of sample accuracy of the model (2 decimal places).
# Built-in round() works whether score() returns a Python float or a numpy
# scalar; calling .round() on a plain float would raise AttributeError.
score = logisticRegr.score(X_test, y_test)
print("Out of sample accuracy =", round(score, 2))

Out of sample accuracy = 0.91


### Question 2: Principal Component Analysis (8 marks)

In [None]:
# Question 2: PCA on the standardised variables plus their pairwise products.
from sklearn.decomposition import PCA
from itertools import combinations

data = np.loadtxt('data.csv', delimiter=',', skiprows=1)

# Standardise the original columns before forming the products.
scaler = StandardScaler()
scaler.fit(data)
scaled_data = scaler.transform(data)

# Extend the table with 6 additional columns: the product of each pair of the
# original 4 variables A, B, C and D. All products are built from the first
# four (standardised) columns and appended in a single stack.
product_columns = [
    (scaled_data[:, i] * scaled_data[:, j])[:, np.newaxis]
    for i, j in combinations(range(4), 2)
]
scaled_data = np.hstack([scaled_data] + product_columns)

# Fit once and project in the same call; a separate fit() beforehand would
# only repeat the same decomposition.
pca = PCA(n_components=4)
newData = pca.fit_transform(scaled_data)

# Running total of the variance captured by the four components.
NewTotVar = 0
for i, dim in enumerate(('PC1', 'PC2', 'PC3', 'PC4')):
    # ddof=1 gives the sample variance that the message below announces;
    # np.var's default (ddof=0) is the population variance. With ddof=1 the
    # value also agrees with pca.explained_variance_[i].
    Var2 = np.var(newData[:, i], ddof=1)
    NewTotVar += Var2

    print('Here is the sample variance for the {0} dimension'.format(dim))

    print('var = {0}\n\n'.format(Var2))
