In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load temperature data

In [2]:
# Read the temperature samples from disk, report the table size, and preview the first rows.
data = pd.read_csv('temp.csv')
print("data size:", data.shape)
data.head()

data size: (366, 2)


Unnamed: 0,time,temp
0,0.002732,0.1
1,0.005464,-4.5
2,0.008197,-6.3
3,0.010929,-9.6
4,0.013661,-9.9


In [3]:
# Pull each column out with a width-1 slice so the result stays 2-D (one column per array).
time = data.iloc[:, :1].values
temp = data.iloc[:, 1:2].values

# Preview the first five rows of each array.
print("time(5 samples): \n", time[:5])
print("\ntemp(5 samples): \n", temp[:5])

time(5 samples): 
 [[0.00273224]
 [0.00546448]
 [0.00819672]
 [0.01092896]
 [0.0136612 ]]

temp(5 samples): 
 [[ 0.1]
 [-4.5]
 [-6.3]
 [-9.6]
 [-9.9]]


In [4]:
# Conventional names for the model input (time) and target (temperature).
X, y = time, temp

# Make a Test and random Train Matrix

In [5]:
def shuffle_data(X, y):
    """Reorder the samples of X and y with one shared random permutation.

    Both arrays are indexed with the same permutation of ``range(X.shape[0])``,
    so row i of the returned X still corresponds to row i of the returned y.
    The permutation is drawn from NumPy's global random state.
    """
    order = np.random.permutation(X.shape[0])
    return X[order], y[order]

In [6]:
def train_test_split(X, y, test_size_ratio=50):
    """Shuffle the samples and split them into a training part and a test part.

    Parameters
    ----------
    X, y : indexable arrays with matching first dimension
        Samples and their targets.
    test_size_ratio : number, default 50
        Share of the samples placed in the test part, given as a
        percentage (it is divided by 100 internally).

    Returns
    -------
    X_train, X_test, y_train, y_test
        The first ``len(y) - int(test_size_ratio / 100 * len(y))`` shuffled
        samples form the training part; the rest form the test part.
    """
    total = len(y)
    test_fraction = test_size_ratio / 100.0
    n_test = int(test_fraction * total)  # truncates toward zero
    cut = total - n_test

    # Shuffle first so that both parts are random draws from the data.
    X_shuffled, y_shuffled = shuffle_data(X, y)

    return (X_shuffled[:cut], X_shuffled[cut:],
            y_shuffled[:cut], y_shuffled[cut:])

In [7]:
# Hold out 40% of all samples for testing; the rest become the training set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size_ratio=40)

# Report the resulting shapes: inputs first, then targets.
print("X_train: ", X_train.shape)
print("X_test:  ", X_test.shape)
print("\n")
print("y_train: ", y_train.shape)
print("y_test:  ", y_test.shape)

X_train:  (220, 1)
X_test:   (146, 1)


y_train:  (220, 1)
y_test:   (146, 1)


In [8]:
# The polynomial degree is taken from the column count of `data`.
# NOTE(review): tying the degree to the number of columns looks coincidental — confirm this is intended.
_, nFeatures = data.shape
poly_degree = nFeatures
print(poly_degree)

2


# Cross Validation
    allows us to compare different machine learning methods and get a sense of how well they will work in practice
1. TRAIN the machine learning method
2. TEST the machine learning methods
    
    A terrible approach would be to use all of the data to estimate the parameters (train the algorithm), because then there wouldn't be any data left to test the method. We need to know how the parameters perform.
    
ideally you want :

75% of data for TRAINING

25% of data for TESTING

10 Folds

In [11]:
def k_fold_cross_validation_sets(X, y, k_folds):
    """Split the data into k_folds sets of training / test data.

    The samples are shuffled with ``shuffle_data`` and cut into ``k_folds``
    equally sized folds. Fold ``i`` is the test part of set ``i``; the other
    folds, concatenated in order, are its training part. When the sample
    count is not a multiple of ``k_folds``, the trailing
    ``len(y) % k_folds`` shuffled samples are kept out of the folds and
    added to the training part of the last set only.

    Parameters
    ----------
    X, y : numpy arrays with matching first dimension
        Samples and their targets.
    k_folds : int
        Number of folds.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(k_folds, 4)``; row ``i`` holds
        ``[X_train, X_test, y_train, y_test]`` for set ``i``. It can be
        indexed as ``sets[i, j]`` or iterated row by row.
    """
    X, y = shuffle_data(X, y)
    n_samples = len(y)
    n_left_overs = n_samples % k_folds

    X_left_over = y_left_over = None
    if n_left_overs != 0:
        # np.split needs an even division, so set the trailing remainder aside.
        X_left_over, y_left_over = X[-n_left_overs:], y[-n_left_overs:]
        X, y = X[:-n_left_overs], y[:-n_left_overs]

    X_split = np.split(X, k_folds)
    y_split = np.split(y, k_folds)

    # Fill an object array cell by cell: the four parts have different
    # lengths, and np.array() on such a ragged nested list is rejected by
    # recent NumPy releases.
    sets = np.empty((k_folds, 4), dtype=object)
    last = k_folds - 1
    for i in range(k_folds):
        X_train = np.concatenate(X_split[:i] + X_split[i + 1:], axis=0)
        y_train = np.concatenate(y_split[:i] + y_split[i + 1:], axis=0)

        # Add left over samples to the last set as training samples. The
        # result must be kept: np.append returns a new array and does not
        # modify its argument.
        if n_left_overs != 0 and i == last:
            X_train = np.append(X_train, X_left_over, axis=0)
            y_train = np.append(y_train, y_left_over, axis=0)

        sets[i, 0] = X_train
        sets[i, 1] = X_split[i]
        sets[i, 2] = y_train
        sets[i, 3] = y_split[i]

    return sets

In [18]:
k_folds = 10
# Each row of cross_validation_sets is [X_train, X_test, y_train, y_test].
cross_validation_sets = k_fold_cross_validation_sets(X, y, k_folds)

# Show the partition shapes of the first fold.
print("X_train total: ", cross_validation_sets[0, 0].shape)
print("X_test total:  ", cross_validation_sets[0, 1].shape)
print("Y_train total: ", cross_validation_sets[0, 2].shape)
print("Y_test total:  ", cross_validation_sets[0, 3].shape)

X_train total:  (324, 1)
X_test total:   (36, 1)
Y_train total:  (324, 1)
Y_test total:   (36, 1)


# Polynomial Ridge Regression
Performs a non-linear transformation of the data before fitting the model
and doing predictions which allows for doing non-linear regression.
Parameters
    -----------
    degree: int
        The degree of the polynomial that the independent variable X will be transformed to.
    n_iterations: int
        The number of training iterations the algorithm will tune the weights for.
    learning_rate: float
        The step length that will be used when updating the weights.

Finding regularization constant using cross validation:


In [None]:
lowest_error = float("inf");
best_reg_factor = None;
print ("Finding regularization constant using cross validation:")
k_folds = 10;
for reg_factor in np.arange(0, 1,1):
    cross_validation_sets = k_fold_cross_validation_sets(X, y, k_folds); # [X_train, X_test, y_train, y_test]
    # print(cross_validation_sets[0,0].shape) # X_train total
    # print(cross_validation_sets[0,1].shape) # X_test total
    mse = 0;
    for _X_train, _X_test, _y_train, _y_test in cross_validation_sets:
        model = PolynomialRidgeRegression(degree=poly_degree, 
                                    reg_factor=reg_factor,
                                    learning_rate=0.001,
                                    n_iterations=10000)
        model.fit(_X_train, _y_train)
        y_pred = model.predict(_X_test)
        _mse = mean_squared_error(_y_test, y_pred)
        mse += _mse
    mse /= k