# Intelligent Systems 3: Probabilistic and Deep Learning

### Question 1: Regression and classification (42 marks)
1. Train and evaluate a least squares linear regression model predicting the value of
variable D from variables A, B and C.

In [79]:
# This line is needed to use matplotlib in Jupyter notebook
%matplotlib inline

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

def least_squares_regression(data, train_size, pipeline):
    """Fit ``pipeline`` on the leading rows of ``data`` and print test-set metrics.

    The last column of ``data`` is used as the regression target (D) and all
    preceding columns as the features (A, B and C). Rows are split in their
    existing order; nothing is shuffled.

    Parameters
    ----------
    data : 2-D array-like
        One sample per row; features first, target in the last column.
    train_size : int
        NOTE: despite its name, this is the number of *trailing* rows held
        out for testing. The model is trained on the remaining leading rows.
        The name is kept so that existing ``train_size=...`` keyword calls
        keep working. Must satisfy ``0 < train_size < len(data)``.
    pipeline : object
        Must provide ``fit(X, y)``, ``predict(X)`` and a ``named_steps``
        mapping that contains a ``'LinearRegression'`` step exposing ``coef_``.

    Raises
    ------
    ValueError
        If ``data`` is not 2-D with at least two columns, or if ``train_size``
        would leave the training set or the test set empty.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    data = np.asarray(data)
    if data.ndim != 2 or data.shape[1] < 2:
        raise ValueError(
            'data must be a 2-D array with at least one feature column and '
            'one target column, got shape %s' % (data.shape,))

    n_rows = data.shape[0]
    # Reject hold-out sizes that leave either side of the split empty.
    # 0 in particular must never reach a ``[:-train_size]`` slice: ``X[:-0]``
    # is ``X[:0]`` (an empty training set), not "all rows".
    if not 0 < train_size < n_rows:
        raise ValueError(
            'train_size (number of rows held out for testing) must be '
            'between 1 and %d, got %r' % (n_rows - 1, train_size))

    # separates D (the last column) as the target variable
    X = data[:, :-1]
    y = data[:, -1]

    # Explicit split index: leading rows train the model, trailing rows test it
    split = n_rows - train_size
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]

    # Train the model using the training sets
    pipeline.fit(X_train, y_train)

    # Make predictions using the testing set
    y_pred = pipeline.predict(X_test)

    # The coefficients
    print('Coefficients of A, B & C: \n', pipeline.named_steps['LinearRegression'].coef_)
    # The mean squared error
    print('Mean squared error: %.2f'
        % mean_squared_error(y_test, y_pred))
    # The coefficient of determination:
    #   1 is perfect prediction
    #   0 is as good as always predicting the mean of the observed test targets (y_test)
    #   negative values are for a model that is worse than just predicting that mean.
    print('Coefficient of determination: %.2f'
        % r2_score(y_test, y_pred))

In [81]:
# Read data.csv into a float array, dropping the single header row
data = np.genfromtxt('data.csv', dtype=float, skip_header=1, delimiter=',')

# Baseline run: ordinary least squares on the raw features, no preprocessing step
pipeline = Pipeline(steps=[
    ('LinearRegression', LinearRegression()),
])
least_squares_regression(pipeline=pipeline, data=data, train_size=75)

Coefficients of A, B & C: 
 [-2.94826258e+13  5.23720052e+13 -4.71336538e+13]
Mean squared error: 0.43
Coefficient of determination: 0.55


2. Repeat the above task after carrying out in turn data normalisation, data scaling and
their combination, and evaluate the benefits of each of these 3 types of data preprocessing.

In [None]:
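# 1. Performs Data Normalisation
# TODO: not implemented yet - this cell is empty, so the normalisation part of the task is still missing.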

In [84]:
# 2. Performs Data Scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# --- Standard Scaler -------------------------------------------------------
# StandardScaler with default settings centres each feature on zero mean and
# scales it to unit variance before the regression step is fitted.
print("-" * 30, "Standard Scaler", "-" * 30, sep="\n")
data = np.genfromtxt('data.csv', dtype=float, skip_header=1, delimiter=',')
std_scaled_pipeline = Pipeline(steps=[
    ('Standard Scaler', StandardScaler()),
    ('LinearRegression', LinearRegression()),
])
least_squares_regression(pipeline=std_scaled_pipeline, data=data, train_size=75)

# --- MinMax Scaler ---------------------------------------------------------
# MinMaxScaler with default settings rescales each feature to the [0, 1] range.
print("-" * 30, "MinMax Scaler", "-" * 30, sep="\n")
data = np.genfromtxt('data.csv', dtype=float, skip_header=1, delimiter=',')
minmax_scaled_pipeline = Pipeline(steps=[
    ('MinMax Scaler', MinMaxScaler()),
    ('LinearRegression', LinearRegression()),
])
least_squares_regression(pipeline=minmax_scaled_pipeline, data=data, train_size=75)

Coefficients of A, B & C: 
 [0.2139024 0.2139024 0.2139024]
Mean squared error: 0.65
Coefficient of determination: 0.32
