# Intelligent Systems 3: Probabilistic and Deep Learning

### Question 1: Regression and classification (42 marks)
1. Train and evaluate a least squares linear regression model predicting the value of
variable D from variables A, B and C.

In [None]:
# This line is needed to use matplotlib in Jupyter notebook
%matplotlib inline

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt

def least_squares_regression(data, train_size, pipeline):
    """Fit ``pipeline`` on the leading rows of ``data`` and print held-out metrics.

    The last column of ``data`` is used as the regression target and every
    preceding column as a feature. Rows are split in their existing order
    (no shuffling): the first ``train_size`` rows are used for fitting and
    all remaining rows for evaluation.

    Parameters
    ----------
    data : 2-D array
        Table whose last column is the target and whose other columns are
        the features.
    train_size : int
        Number of leading rows used for training. Must leave at least one
        row in both the training and the test partition.
    pipeline : object
        Estimator exposing ``fit``, ``predict`` and a ``named_steps`` mapping
        that contains a fitted-regressor step called ``'LinearRegression'``.

    Returns
    -------
    None
        Coefficients, intercept, mean squared error and R2 are printed.

    Raises
    ------
    ValueError
        If ``train_size`` is not strictly between 0 and the number of rows.
    """
    n_samples = len(data)
    if not 0 < train_size < n_samples:
        raise ValueError(
            "train_size must be between 1 and {0} (got {1})".format(n_samples - 1, train_size)
        )

    # Separate the target (last column) from the features (all other columns).
    features = data[:, :-1]
    target = data[:, -1]

    # Positive slice bounds make `train_size` really the number of training
    # rows; everything after it is held out for testing.
    X_train, X_test = features[:train_size], features[train_size:]
    y_train, y_test = target[:train_size], target[train_size:]

    # Train on the training partition, then predict the held-out rows.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Fitted parameters of the final regression step.
    regressor = pipeline.named_steps['LinearRegression']
    print("Coefficients of A, B & C:", regressor.coef_)
    print("Intercept:", regressor.intercept_)

    # Mean squared error on the held-out rows.
    print('Mean squared error: %.2f'
        % mean_squared_error(y_test, y_pred))

    # Coefficient of determination on the held-out rows:
    #   1 is perfect prediction
    #   0 is as good as always predicting the mean of the true test targets
    #   negative values are for a model that is worse than predicting that mean.
    print('Coefficient of determination (R2): %.2f'
        % r2_score(y_test, y_pred))

In [None]:
# Question 1 (a): fit the regression on the raw, unpreprocessed columns.
print("-" * 32 + " QUESTION 1 (a) " + "-" * 32)

# Read the CSV as floats, dropping its single header row.
data = np.genfromtxt(
    'data.csv',
    dtype=float,
    delimiter=',',
    skip_header=1,
)

# A one-step pipeline: linear regression with no preprocessing in front of it.
pipeline = Pipeline(steps=[('LinearRegression', LinearRegression())])
least_squares_regression(data, pipeline=pipeline, train_size=75)

# Closing rule, the same width as the opening banner.
print("-" * 76)


2. Repeat the above task after carrying out in turn data normalisation, data scaling and
their combination, and evaluate the benefits of each of these 3 types of data preprocessing.

In [None]:
print("-" * 32 + " QUESTION 1 (b)" + "-" * 32)

# Preprocessing classes for this cell (MinMaxScaler is imported but not used below).
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Load the dataset once (skipping the header row); both experiments below
# read the same unmodified array, so a second read of the file is unnecessary.
data = np.genfromtxt('data.csv', delimiter=',', skip_header=1, dtype=float)

# 1. Data normalisation: Normalizer rescales each sample (row) to unit norm.
print("Normalizer:")
normalizer_pipeline = Pipeline([('Normalizer', Normalizer()), ('LinearRegression', LinearRegression())])
# The pipeline was previously bound to this name although it never used a
# min-max scaler; the alias is kept so existing references keep working.
minmax_scaled_pipeline = normalizer_pipeline
least_squares_regression(data=data, pipeline=normalizer_pipeline, train_size=75)

# 2. Data scaling: StandardScaler standardises each feature (column).
print("\nScaler:")
std_scaled_pipeline = Pipeline([('Standard Scaler', StandardScaler()), ('LinearRegression', LinearRegression())])
least_squares_regression(data=data, pipeline=std_scaled_pipeline, train_size=75)

### Question 2: Principal Component Analysis (8 marks)

In [None]:
from sklearn.decomposition import PCA
from itertools import combinations

data = np.loadtxt('data.csv',delimiter=',',skiprows=1)

# Extend the table with 6 additional columns: the element-wise product of each
# pair of the first 4 columns (the original variables A, B, C and D).
# The products are computed from the original columns and appended in one go.
pair_products = [data[:, i] * data[:, j] for i, j in combinations(range(4), 2)]
data = np.column_stack([data] + pair_products)

# Project the extended table onto its first 4 principal components.
# fit_transform() fits the model itself, so no separate fit() call is needed.
pca = PCA(n_components=4)
newData = pca.fit_transform(data)

# Report the variance captured along each component and accumulate the total.
NewTotVar = 0
for i, dim in enumerate(('PC1','PC2','PC3','PC4')):
    # ddof=1 gives the unbiased *sample* variance, matching the printed label
    # (np.var defaults to ddof=0, the population variance).
    component_variance = np.var(newData[:, i], ddof=1)
    NewTotVar += component_variance
    print('Here is the sample variance for the {0} dimension'.format(dim))
    print('var = {0}\n\n'.format(component_variance))

