In [2]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.utils import resample

In [3]:
# Getting R2 and Mean Square Error of model predictions
def R2(y_data, y_model):
    """Return the coefficient of determination of ``y_model`` against ``y_data``.

    Computed as one minus the ratio between the residual sum of squares and
    the total sum of squares of ``y_data`` around its mean.
    """
    residual_ss = np.sum((y_data - y_model) ** 2)
    total_ss = np.sum((y_data - np.mean(y_data)) ** 2)
    return 1 - residual_ss / total_ss

def MSE(y_data, y_model):
    """Return the mean squared error between ``y_data`` and ``y_model``.

    The summed squared differences are divided by the number of elements
    in ``y_model``.
    """
    squared_errors = (y_data - y_model) ** 2
    return np.sum(squared_errors) / np.size(y_model)

In [4]:
# Build the feature (design) matrix for a 2D polynomial whose degree (complexity) is given by n
def create_X(x, y, n):
    """Build the design matrix of a two-variable polynomial of degree ``n``.

    Multi-dimensional ``x`` and ``y`` are flattened first. The result has one
    row per sample and ``(n+1)(n+2)/2`` columns: a leading column of ones,
    followed, for each degree ``d = 1..n``, by the terms
    ``x**(d-k) * y**k`` for ``k = 0..d``.
    """
    if len(x.shape) > 1:
        x, y = np.ravel(x), np.ravel(y)

    n_terms = int((n + 1) * (n + 2) / 2)
    design = np.ones((len(x), n_terms))

    for degree in range(1, n + 1):
        # Index of the first column holding terms of this total degree.
        offset = int(degree * (degree + 1) / 2)
        for y_power in range(degree + 1):
            design[:, offset + y_power] = (x ** (degree - y_power)) * (y ** y_power)

    return design

In [5]:
#OLS 
def find_beta(X, z):
    """Return the ordinary-least-squares coefficients for ``X @ beta ~ z``.

    Solves the normal equations using the pseudo-inverse of ``X.T @ X``.
    """
    gram_pinv = np.linalg.pinv(X.T @ X)
    return gram_pinv @ (X.T @ z)

In [None]:
#Scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

def standardscaler(X_train, X_test, z_train, z_test):
    """Standardize features and response using training-set statistics.

    The features are transformed with a scikit-learn ``StandardScaler``
    fitted on ``X_train``. The response is shifted by the mean and divided
    by the standard deviation of ``z_train``; the same two values are
    applied to ``z_test``.

    Returns ``(X_train_scaled, X_test_scaled, z_train_scaled, z_test_scaled)``.
    """
    feature_scaler = StandardScaler().fit(X_train)

    # Response statistics come from the training split only.
    z_mean = np.mean(z_train)
    z_std = np.std(z_train)

    return (
        feature_scaler.transform(X_train),
        feature_scaler.transform(X_test),
        (z_train - z_mean) / z_std,
        (z_test - z_mean) / z_std,
    )


def meanscaler(X_train, X_test, z_train, z_test):
    """Center features and response on the training-set means.

    The column means of ``X_train`` are subtracted from both feature
    matrices, and the mean of ``z_train`` from both response arrays.

    Returns ``(X_train_scaled, X_test_scaled, z_train_scaled, z_test_scaled)``.
    """
    feature_means = np.mean(X_train, axis=0)
    response_mean = np.mean(z_train)

    centered = (
        X_train - feature_means,
        X_test - feature_means,
        z_train - response_mean,
        z_test - response_mean,
    )
    return centered

    
def scalerMinMax(X_train, X_test, z_train, z_test):
    """Rescale features and response with scikit-learn's ``MinMaxScaler``.

    One scaler (feature range 0 to 1) is fitted on ``X_train`` and applied to
    both feature matrices; a second one is fitted on ``z_train`` and applied
    to both response arrays. The response arrays are reshaped to a single
    column for the scaler and flattened back to 1-D before being returned.

    Returns ``(X_train_scaled, X_test_scaled, z_train_scaled, z_test_scaled)``.
    """
    # Scale the independent variables.
    feature_scaler = MinMaxScaler(feature_range=(0,1)).fit(X_train)
    X_train_scaled = feature_scaler.transform(X_train)
    X_test_scaled = feature_scaler.transform(X_test)

    # The scaler is fitted on and applied to a single-column 2-D array.
    z_train = z_train.reshape((-1,1))
    z_test = z_test.reshape((-1,1))

    # Scale the response; a single fit on the training response serves both splits.
    response_scaler = MinMaxScaler(feature_range=(0,1)).fit(z_train)
    # flatten() returns a new array, so its result has to be kept.
    z_train_scaled = response_scaler.transform(z_train).flatten()
    z_test_scaled = response_scaler.transform(z_test).flatten()

    return X_train_scaled, X_test_scaled, z_train_scaled, z_test_scaled


def robustscaler(X_train, X_test, z_train, z_test):
    """Rescale features and response with scikit-learn's ``RobustScaler``.

    One scaler is fitted on ``X_train`` and applied to both feature
    matrices; another is fitted on ``z_train`` (reshaped to a single column)
    and applied to both response arrays, which are flattened to 1-D again
    before being returned.

    Returns ``(X_train_scaled, X_test_scaled, z_train_scaled, z_test_scaled)``.
    """
    feature_scaler = RobustScaler().fit(X_train)

    # The response is handed to the scaler as a single column.
    z_train_col = z_train.reshape((-1, 1))
    z_test_col = z_test.reshape((-1, 1))
    response_scaler = RobustScaler().fit(z_train_col)

    return (
        feature_scaler.transform(X_train),
        feature_scaler.transform(X_test),
        response_scaler.transform(z_train_col).flatten(),
        response_scaler.transform(z_test_col).flatten(),
    )


def nonscaler(X_train, X_test, z_train, z_test):
    """Return the four inputs unchanged, as a tuple (a no-op "scaler")."""
    unscaled = (X_train, X_test, z_train, z_test)
    return unscaled
    

In [None]:
# Bootstrapping
def bootstrap(x, z, x_test, z_test, iterations = 100):
    """Evaluate an OLS fit with bootstrap resampling of the training data.

    In every iteration the training pair ``(x, z)`` is resampled with
    ``resample``, coefficients are fitted with ``find_beta`` and predictions
    are made for ``x_test``.

    Parameters
    ----------
    x, z : training design matrix and training response.
    x_test, z_test : test design matrix and test response.
    iterations : number of bootstrap rounds (default 100).

    Returns
    -------
    boot_MSE : mean of the per-iteration ``MSE`` on the test data.
    boot_R2 : mean of the per-iteration ``R2`` on the test data.
    bt_err : squared test error averaged over test points and iterations.
    bt_bias : mean squared difference between ``z_test`` and the prediction
        averaged over iterations.
    bt_var : variance of the predictions across iterations, averaged over
        the test points.
    """
    MSEs = np.zeros(iterations)
    R2s = np.zeros(iterations)

    # Compare 1-D arrays only: a column vector against a 1-D prediction would
    # otherwise broadcast to an (m, m) matrix inside MSE and R2.
    z_test = np.ravel(z_test)
    z_preds = np.zeros((len(z_test), iterations))

    for i in range(iterations):
        bt_x, bt_z = resample(x, z)
        beta = find_beta(bt_x, bt_z)  # coefficients from the resampled training set
        z_pred = np.ravel(x_test @ beta)  # prediction on the fixed test set
        z_preds[:, i] = z_pred
        # Statistics of the prediction in the current bootstrap round.
        MSEs[i] = MSE(z_test, z_pred)
        R2s[i] = R2(z_test, z_pred)

    # Column shape so that z_test broadcasts against every column of z_preds.
    z_test = z_test.reshape((-1, 1))
    bt_err = np.mean( np.mean((z_test - z_preds)**2, axis=1, keepdims=True))
    bt_bias = np.mean((z_test - np.mean(z_preds, axis=1, keepdims=True))**2)
    bt_var = np.mean( np.var(z_preds, axis=1, keepdims=True) )
    boot_MSE = np.mean(MSEs)
    boot_R2 = np.mean(R2s)

    return boot_MSE, boot_R2, bt_err, bt_bias, bt_var

In [None]:
#Cross validation

def cross_validation(feature, data, kfolds):
    """Score an OLS fit with k-fold cross-validation.

    The sample indices are shuffled once, without replacement. In each of the
    ``kfolds`` rounds the model is fitted with ``find_beta`` on the first
    ``(kfolds - 1) * fold_len`` shuffled indices and scored with ``MSE`` on
    the remaining ones. The indices are then rotated by one fold length so
    that a different part is held out in the next round.

    Parameters
    ----------
    feature : design matrix, indexed along its first axis by an index array.
    data : response values, one per row of ``feature``.
    kfolds : number of folds; must satisfy ``2 <= kfolds <= len(data)``.

    Returns
    -------
    scores : array with the held-out MSE of every round.
    data_best : prediction on the held-out part of the best (lowest MSE) round.
    feature_best : held-out features of the best round.
    data_test_best : held-out response of the best round.

    Raises
    ------
    ValueError
        If ``kfolds`` is smaller than 2 or larger than the number of samples,
        which would leave the training or the held-out part empty.

    Notes
    -----
    ``fold_len`` is ``len(data) // kfolds``. When the sample count is not a
    multiple of ``kfolds`` the held-out part also contains the leftover
    samples, so the held-out parts of consecutive rounds overlap by that many
    samples.
    """
    n = len(data)
    if not 2 <= kfolds <= n:
        raise ValueError(
            "kfolds must be between 2 and the number of samples (%d), got %r"
            % (n, kfolds)
        )

    # Random ordering of all sample indices, drawn without replacement.
    indices = np.random.choice(n, n, replace=False)
    scores = np.zeros(kfolds)

    fold_len = n // kfolds
    # Position separating the training part from the held-out part.
    split = (kfolds - 1) * fold_len

    feature_best = None
    data_best = None
    data_test_best = None

    for i in range(kfolds):
        # All folds but the last one are used for training.
        train_indices = indices[:split]
        # The last fold is held out for testing.
        test_indices = indices[split:]

        feature_test = feature[test_indices]
        data_test = data[test_indices]

        # Fit on the training folds and score on the held-out fold.
        beta = find_beta(feature[train_indices], data[train_indices])
        data_pred = feature_test @ beta
        scores[i] = MSE(data_test, data_pred)

        # The first round is the best so far by definition; later rounds
        # replace it only when their score is strictly lower.
        if i == 0 or scores[i] < scores[:i].min():
            feature_best = feature_test
            data_best = data_pred
            data_test_best = data_test

        # Rotate the indices to the right so that the fold just held out
        # comes first and a new fold ends up last.
        indices = np.roll(indices, fold_len)

    return scores, data_best, feature_best, data_test_best