In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv('korkeasaari_dataset.csv')

In [3]:
print(df.to_markdown())

|      |   month | weekday   |   visitors |   tempature | rain   |
|-----:|--------:|:----------|-----------:|------------:|:-------|
|    0 |       1 | tue       |        233 |         2.6 | True   |
|    1 |       1 | wed       |        315 |         2   | False  |
|    2 |       1 | thu       |        696 |         0.6 | False  |
|    3 |       1 | fri       |        643 |         1.1 | False  |
|    4 |       1 | sat       |       1027 |        -2.8 | False  |
|    5 |       1 | sun       |        732 |        -5   | False  |
|    6 |       1 | mon       |        177 |        -5   | False  |
|    7 |       1 | tue       |         88 |        -0.4 | False  |
|    8 |       1 | wed       |         70 |        -0.8 | False  |
|    9 |       1 | thu       |         58 |        -3.7 | False  |
|   10 |       1 | fri       |         62 |        -7.2 | False  |
|   11 |       1 | sat       |        209 |       -10.8 | False  |
|   12 |       1 | sun       |        268 |        -7.8 | Fals

In [4]:
X = df[['month', 'weekday', 'tempature', 'rain']]
y = df['visitors']

In [5]:
X = pd.get_dummies(X, columns=['month', 'weekday'])

In [6]:
print(X.to_markdown())

|      |   tempature | rain   | month_1   | month_2   | month_3   | month_4   | month_5   | month_6   | month_7   | month_8   | month_9   | month_10   | month_11   | month_12   | weekday_fri   | weekday_mon   | weekday_sat   | weekday_sun   | weekday_thu   | weekday_tue   | weekday_wed   |
|-----:|------------:|:-------|:----------|:----------|:----------|:----------|:----------|:----------|:----------|:----------|:----------|:-----------|:-----------|:-----------|:--------------|:--------------|:--------------|:--------------|:--------------|:--------------|:--------------|
|    0 |         2.6 | True   | True      | False     | False     | False     | False     | False     | False     | False     | False     | False      | False      | False      | False         | False         | False         | False         | False         | True          | False         |
|    1 |         2   | False  | True      | False     | False     | False     | False     | False     | False     | False     |

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [8]:
model_1 = LinearRegression()
model_2 = MLPRegressor(max_iter=10000, random_state=42) # Random state 42 for reproducability!

In [9]:
kfold = KFold(n_splits=6, shuffle=True, random_state=42) # Random state 42 for reproducability!

In [10]:
train_errors_1 = []
train_errors_2 = []

val_errors_1 = []
val_errors_2 = []

for train_idx, val_idx in kfold.split(X_train):
    X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    model_1.fit(X_fold_train, y_fold_train)
    model_2.fit(X_fold_train, y_fold_train)

    y_train_pred_1 = model_1.predict(X_fold_train)
    y_train_pred_2 = model_2.predict(X_fold_train)

    y_val_pred_1 = model_1.predict(X_fold_val)
    y_val_pred_2 = model_2.predict(X_fold_val)

    train_err_1 = mean_squared_error(y_fold_train, y_train_pred_1)
    train_err_2 = mean_squared_error(y_fold_train, y_train_pred_2)

    val_err_1 = mean_squared_error(y_fold_val, y_val_pred_1)
    val_err_2 = mean_squared_error(y_fold_val, y_val_pred_2)
    
    train_errors_1.append(train_err_1)
    train_errors_2.append(train_err_2)
    val_errors_1.append(val_err_1)
    val_errors_2.append(val_err_2)

In [11]:
mean_train_1 = sum(train_errors_1) / len(train_errors_1)
mean_train_2 = sum(train_errors_2) / len(train_errors_2)
mean_val_1 = sum(val_errors_1) / len(val_errors_1)
mean_val_2 = sum(val_errors_2) / len(val_errors_2)

print(f"Mean Training error of Linear regression: {mean_train_1}")
print(f"Mean Training error of Multi-layer perceptron: {mean_train_2}")
print(f"Mean Validation error of Linear regression: {mean_val_1}")
print(f"Mean Validation error of Multi-layer perceptron: {mean_val_2}")

Mean Training error of Linear regression: 1142720.5632966806
Mean Training error of Multi-layer perceptron: 953908.3999712206
Mean Validation error of Linear regression: 1159454.6135108806
Mean Validation error of Multi-layer perceptron: 1037753.0262075225


In [12]:
y_test_pred_1 = model_1.predict(X_test)
y_test_pred_2 = model_2.predict(X_test)
test_error_1 = mean_squared_error(y_test, y_test_pred_1)
test_error_2 = mean_squared_error(y_test, y_test_pred_2)

print(f"Test error of Linear regression: {test_error_1}")
print(f"Test error of Multi-layer perceptron: {test_error_2}")

Test error of Linear regression: 1037029.7171598804
Test error of Multi-layer perceptron: 984353.835893942
