In [1]:
# for array computations and loading data
import numpy as np

# for building linear regression models and preparing data
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# for building and training neural networks
import tensorflow as tf

# print numpy arrays with two decimal places so array previews stay compact
np.set_printoptions(precision=2)

# quiet TensorFlow: turn AutoGraph verbosity down to 0 and only let
# messages of level ERROR or higher through the TensorFlow logger
tf.autograph.set_verbosity(0)
tf.get_logger().setLevel('ERROR')

In [2]:
# read the comma-separated dataset into a 2-D array
data = np.loadtxt('/content/data.csv', delimiter=',')

# column 0 holds the inputs and column 1 the targets; indexing with
# np.newaxis keeps each result 2-D with a single column, which the
# commands later on require
x = data[:, 0, np.newaxis]
y = data[:, 1, np.newaxis]

print(x.shape, y.shape)

(50, 1) (50, 1)


In [3]:
# 60/20/20 split: keep 60% of the data for training, then divide the
# held-out 40% evenly between the cross validation and test sets
x_train, x_holdout, y_train, y_holdout = train_test_split(
    x, y, test_size=0.40, random_state=1
)
x_cv, x_test, y_cv, y_test = train_test_split(
    x_holdout, y_holdout, test_size=0.50, random_state=1
)

# the intermediate hold-out arrays are not needed past this point
del x_holdout, y_holdout

# one print call, one message per line; the embedded "\n" produces the
# blank line between the groups
print(
    f"the shape of the training set (input) is: {x_train.shape}",
    f"the shape of the training set (target) is: {y_train.shape}\n",
    f"the shape of the cross validation set (input) is: {x_cv.shape}",
    f"the shape of the cross validation set (target) is: {y_cv.shape}\n",
    f"the shape of the test set (input) is: {x_test.shape}",
    f"the shape of the test set (target) is: {y_test.shape}",
    sep="\n",
)

the shape of the training set (input) is: (30, 1)
the shape of the training set (target) is: (30, 1)

the shape of the cross validation set (input) is: (10, 1)
the shape of the cross validation set (target) is: (10, 1)

the shape of the test set (input) is: (10, 1)
the shape of the test set (target) is: (10, 1)


In [4]:
# z-score feature scaling: learn the mean and standard deviation from the
# training inputs, then standardize those same inputs with them
scaler = StandardScaler()
scaler.fit(x_train)
X_train_scaled = scaler.transform(x_train)

# report the statistics the scaler learned
print(f"Computed mean of the training set: {scaler.mean_.squeeze():.2f}")
print(f"Computed standard deviation of the training set: {scaler.scale_.squeeze():.2f}")

Computed mean of the training set: 2504.06
Computed standard deviation of the training set: 574.85


In [5]:
# fit a linear regression model on the scaled training inputs

linear = LinearRegression()

linear.fit(X=X_train_scaled, y=y_train)

In [6]:
# training error of the linear model, reported as MSE divided by 2

yhat = linear.predict(X_train_scaled)

# scikit-learn's mean squared error, halved
sklearn_half_mse = mean_squared_error(y_train, yhat) / 2
print(f"training MSE (using sklearn function): {sklearn_half_mse}")

# the same quantity accumulated by hand, one example at a time, to compare
# against the library value
total_error = 0

for i, prediction in enumerate(yhat):
    total_error += (prediction - y_train[i]) ** 2

mse = total_error / (2 * len(yhat))

print(f"training MSE (for-loop implementation): {mse.squeeze()}")

training MSE (using sklearn function): 406.18142643101237
training MSE (for-loop implementation): 406.1814264310124


In [7]:
# standardize the cross validation inputs with the already-fitted scaler
# (transform only, so the scaler's statistics are not recomputed here)
X_cv_scaled = scaler.transform(x_cv)
print(f"Mean used to scale the CV set: {scaler.mean_.squeeze():.2f}")
print(f"Standard deviation used to scale the CV set: {scaler.scale_.squeeze():.2f}")

# predict on the cross validation set and report MSE divided by 2
yhat_cv = linear.predict(X_cv_scaled)
cv_half_mse = mean_squared_error(y_cv, yhat_cv) / 2
print(f"Cross validation MSE: {cv_half_mse}")

Mean used to scale the CV set: 2504.06
Standard deviation used to scale the CV set: 574.85
Cross validation MSE: 551.7633686764407


In [8]:
# degree-2 polynomial expansion of the training inputs;
# include_bias=False leaves out the constant column
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(x_train)
X_train_mapped = poly.transform(x_train)

# preview the first five mapped rows
print(X_train_mapped[:5])

[[3.32e+03 1.11e+07]
 [2.34e+03 5.50e+06]
 [3.49e+03 1.22e+07]
 [2.63e+03 6.92e+06]
 [2.59e+03 6.71e+06]]


In [9]:
# a separate scaler for the polynomial features: learn the per-column mean
# and standard deviation from the mapped training set, then standardize it
scaler_poly = StandardScaler()
scaler_poly.fit(X_train_mapped)
X_train_mapped_scaled = scaler_poly.transform(X_train_mapped)

# preview the first five rows of the scaled training set
print(X_train_mapped_scaled[:5])

[[ 1.43  1.47]
 [-0.28 -0.36]
 [ 1.71  1.84]
 [ 0.22  0.11]
 [ 0.15  0.04]]


In [10]:
# linear regression trained on the scaled polynomial features
model = LinearRegression().fit(X_train_mapped_scaled, y_train)

# training error, reported as MSE divided by 2
yhat = model.predict(X_train_mapped_scaled)
train_half_mse = mean_squared_error(y_train, yhat) / 2
print(f"training MSE: {train_half_mse}")

# pass the cross validation inputs through the already-fitted polynomial
# mapping and scaler (transform only)
X_cv_mapped_scaled = scaler_poly.transform(X_cv_mapped := poly.transform(x_cv))

# cross validation error, reported as MSE divided by 2
yhat_cv = model.predict(X_cv_mapped_scaled)
cv_half_mse = mean_squared_error(y_cv, yhat_cv) / 2
print(f"Cross validation MSE: {cv_half_mse}")

training MSE: 49.115763074720526
Cross validation MSE: 87.694664384358


You'll notice that the MSEs are significantly better for both the training and cross validation sets when you add the 2nd-order polynomial. You may want to introduce more polynomial terms and see which one gives the best performance. As shown in class, you can have 10 different models like this:

You can create a loop that contains all the steps in the previous code cells. Here is one implementation that adds polynomial features up to degree=10. We'll plot it at the end to make it easier to compare the results for each model.

In [11]:
# neural networks

# polynomial feature mapping: fit the mapping on the training inputs, then
# apply the same fitted transform to the training, CV and test inputs
degree = 1
poly = PolynomialFeatures(degree, include_bias=False)
poly.fit(x_train)
X_train_mapped, X_cv_mapped, X_test_mapped = map(
    poly.transform, (x_train, x_cv, x_test)
)

In [12]:
# z-score feature scaling: fit on the mapped training set only, then apply
# the same fitted transform to the training, CV and test sets
# (note: this rebinds the name `scaler` to a newly fitted StandardScaler)
scaler = StandardScaler().fit(X_train_mapped)
X_train_mapped_scaled, X_cv_mapped_scaled, X_test_mapped_scaled = map(
    scaler.transform, (X_train_mapped, X_cv_mapped, X_test_mapped)
)

In [None]:
# build and train

nn_train_mses = []
nn_cv_mses = []

nn_models =