### CS-C3240 - Machine Learning
## ML Project
#### Student number: 918697
#### Name: Alex Herrero Pons

This notebook is used for building a machine learning problem that predicts the wave height in function of the weed speed, the wave period and the temperature of the water.

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
%matplotlib inline

We use the data obtained from [windguru](https://www.windguru.cz/ "Windguru home page") for the past years in Jaws, one of the most famous spots in the surfing world.

In [2]:
filename = "JawsData.csv"
hours = np.arange(2, 24, 3).tolist()
hs = len(hours)
features = ['Wind speed', 'Wave height', 'Wave period', 'Temperature']
headers = [f+str(h) for f in features for h in hours]
df = pd.read_csv("./data/" + filename,sep='\t',skiprows=(0,1), names = headers)
df = df.dropna()
df.reset_index(drop=True, inplace=True)

feature = []
for i,f in enumerate(features):
    feature.append(df[[f+str(h) for h in hours]])

N = len(df)
data = []

for i in range(N):
    for j,h in enumerate(hours):
        aux = []
        for k,f in enumerate(features):
            spot = f+str(h)
            aux.append(feature[k][spot][i])
        data.append(aux)

DATA = pd.DataFrame(data, columns = features)
print(DATA.head())
print(len(DATA))

   Wind speed  Wave height  Wave period  Temperature
0          12          2.9           16           20
1           9          2.7           16           20
2           7          2.6           15           21
3           7          2.5           15           23
4           6          2.4           15           23
2928


In [5]:
y = DATA['Wave height'].values
X = DATA.drop('Wave height', axis=1).values

X_train_val,X_test,y_train_val,y_test = train_test_split(X, y, test_size=0.2)

kf = KFold(n_splits = 10)

def polyreg(degree):
    return make_pipeline(PolynomialFeatures(degree),LinearRegression())

models = [LinearRegression(), Lasso(alpha=0.1), svm.SVR(), polyreg(2), polyreg(3), polyreg(4), polyreg(5), polyreg(6)] #LogisticRegression
model_names = ['LinearRegression()', 'Lasso(alpha=0.1)', 'svm.SVR()', 'polyreg(2)', 'polyreg(3)', 'polyreg(4)', 'polyreg(5)', 'polyreg(6)']

N = 20
max_degree = 10
min_error = np.inf

for k in range(1,max_degree+1):
    list_training_errors = []
    list_validation_errors = []
    list_test_errors = []
    for i in range(N):
        poly = PolynomialFeatures(degree = k)    
        training_errors = []
        validation_errors = []
        for train_index, test_index in kf.split(X_train_val):
            X_train, X_val = X_train_val[train_index], X_train_val[test_index]
            y_train, y_val = y_train_val[train_index], y_train_val[test_index]
            
            X_poly_train = poly.fit_transform(X_train)
            X_poly_val = poly.fit_transform(X_val)
            
            regressor = LinearRegression()
            regressor.fit(X_poly_train,y_train)
            
            y_train_pred = regressor.predict(X_poly_train)
            y_val_pred = regressor.predict(X_poly_val)
            
            training_errors.append(mean_squared_error(y_train,y_train_pred))
            validation_errors.append(mean_squared_error(y_val,y_val_pred))
            
        list_training_errors.append(np.mean(training_errors))
        list_validation_errors.append(np.mean(validation_errors))

        X_poly_test = poly.fit_transform(X_test)
        y_test_pred = regressor.predict(X_poly_test)
        list_test_errors.append(mean_squared_error(y_test,y_test_pred))

    training_error = np.mean(list_training_errors)
    validation_error = np.mean(list_validation_errors)
    test_error = np.mean(list_test_errors)
    avg_error = (training_error+validation_error+test_error)/3
    print('dregree =',k)
    print('training_error =', training_error)
    print('validation_error =', validation_error)
    print('test_error =', test_error)
    print('avg_error = ', avg_error)
    print()

    if min_error > avg_error:
        min_error = avg_error
        min_degree = k

print('----------------------------------------------')
print('Best model occurs with degree =', min_degree)
print('Where validation_error =', min_error)
print('----------------------------------------------')
        

dregree = 1
training_error = 0.18900759266645203
validation_error = 0.18996618829856
test_error = 0.18242928240872297
avg_error =  0.18713435445791168

dregree = 2
training_error = 0.16827217916600207
validation_error = 0.17000193747439785
test_error = 0.162815503113814
avg_error =  0.16702987325140461

dregree = 3
training_error = 0.16290557135899336
validation_error = 0.16663701228177913
test_error = 0.16196129290440603
avg_error =  0.1638346255150595

dregree = 4
training_error = 0.15084856986997833
validation_error = 0.17225263989137174
test_error = 0.15332443242920538
avg_error =  0.1588085473968518

dregree = 5
training_error = 0.14442428207016184
validation_error = 0.1592559930251744
test_error = 0.15085325317194986
avg_error =  0.15151117608909537

dregree = 6
training_error = 0.13970129515220325
validation_error = 0.48980479857371434
test_error = 0.16240691597488172
avg_error =  0.2639710032335998

dregree = 7
training_error = 0.1313336340463312
validation_error = 2.4401244473