# Regression on given dataset

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, RobustScaler
from sklearn.svm import SVR

from xgboost import XGBRegressor, XGBRFRegressor


In [2]:
def check_score(y_true, y_pred):
    print(f"Testing model:\n", f'\t-r2: {r2_score(y_true, y_pred)}\n', f'\t-MSE: {mean_squared_error(y_true, y_pred)}\n', f"\t-ABSE: {mean_absolute_error(y_true, y_pred)}\n")


In [3]:
X = pd.read_csv("./data/Xtrain.csv")
X.drop(["dteday"], axis=1, inplace=True)
y = pd.read_csv("./data/ytrain.csv")
y_1 = y.casual.copy()
y_2 = y.registered.copy()
X1_train, X1_test, y1_train, y1_test = train_test_split(X.values, y_1.values, test_size=0.2, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X.values, y_2.values, test_size=0.2, random_state=42)

In [4]:
print(X.shape)
X.head(2)

(13034, 13)


Unnamed: 0,instant,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
0,1,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0
1,2,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0


In [5]:
y.head(2)

Unnamed: 0,casual,registered
0,3,13
1,8,32


In [6]:
#sns.pairplot(X, corner=True)

## <span style="color:green"> Lets try boosted trees regressors

In [7]:
model = XGBRegressor()
model.fit(X1_train, y1_train)
y_pred = model.predict(X1_test)
check_score(y1_test, y_pred)

Testing model:
 	-r2: 0.9327579140663147
 	-MSE: 142.69940185546875
 	-ABSE: 7.346523284912109



In [8]:
model = XGBRegressor()
model.fit(X2_train, y2_train)
y_pred = model.predict(X2_test)
check_score(y2_test, y_pred)

Testing model:
 	-r2: 0.9579663276672363
 	-MSE: 695.7962036132812
 	-ABSE: 16.672039031982422



## <span style="color:green"> Now a little param tuning

In [9]:
model = XGBRegressor(n_estimators=1000, learning_rate=0.1)
model.fit(X1_train, y1_train)
y_pred = model.predict(X1_test)
check_score(y1_test, y_pred)

Testing model:
 	-r2: 0.9372549653053284
 	-MSE: 133.15577697753906
 	-ABSE: 7.128956317901611



In [10]:
model = XGBRegressor(n_estimators=1000, learning_rate=0.1)
model.fit(X2_train, y2_train)
y_pred = model.predict(X2_test)
check_score(y2_test, y_pred)

Testing model:
 	-r2: 0.9608710408210754
 	-MSE: 647.71337890625
 	-ABSE: 15.734640121459961



> sligtly better

# Random Forest regressor

In [11]:
model = XGBRFRegressor(n_estimators=2000)
model.fit(X1_train, y1_train)
y_pred = model.predict(X1_test)
check_score(y1_test, y_pred)

Testing model:
 	-r2: 0.8202438354492188
 	-MSE: 381.4737548828125
 	-ABSE: 11.14776611328125



In [12]:
model = XGBRFRegressor(n_estimators=1000)
model.fit(X2_train, y2_train)
y_pred = model.predict(X2_test)
check_score(y2_test, y_pred)

Testing model:
 	-r2: 0.763891339302063
 	-MSE: 3908.3759765625
 	-ABSE: 41.3068733215332



# Maybe some neural networks??

In [13]:
model = MLPRegressor(hidden_layer_sizes=(80,80,80), activation="tanh", solver='adam', max_iter=50, batch_size=5)
model.fit(X1_train, y1_train)
y_pred = model.predict(X1_test)
check_score(y1_test, y_pred)

Testing model:
 	-r2: -6.354360933458203e-05
 	-MSE: 2122.307900608883
 	-ABSE: 31.75054389614632



## Lets scale the data and use trees again

In [14]:
def scale_data_row(X):

    pipe = Pipeline([
        ('scaler', StandardScaler()),
    ])
    x_out = pipe.fit_transform(X.T).T
    return x_out

In [15]:
X1_train_s = scale_data_row(X1_train)
X1_test_s = scale_data_row(X1_test)
X2_train_s = scale_data_row(X2_train)
X2_test_s = scale_data_row(X2_test)

In [16]:
model = XGBRegressor(n_estimators=1000, learning_rate=0.1)
model.fit(X1_train_s, y1_train)
y_pred = model.predict(X1_test_s)
check_score(y1_test, y_pred)

Testing model:
 	-r2: 0.853745698928833
 	-MSE: 310.3769226074219
 	-ABSE: 10.469403266906738



In [17]:
model.fit(X2_train_s, y2_train)
y_pred = model.predict(X2_test_s)
check_score(y2_test, y_pred)

Testing model:
 	-r2: 0.8185337781906128
 	-MSE: 3003.86376953125
 	-ABSE: 36.280235290527344



> nope, bad idea, i think scaling with trees dont go along(if i remember correctly, trees are not vurnable for scaling in data)

# Maybe simple linear regression will work?

In [18]:
model = LinearRegression()
model.fit(X1_train, y1_train)
y_pred = model.predict(X1_test)
check_score(y1_test, y_pred)

Testing model:
 	-r2: 0.4591027157842643
 	-MSE: 1147.8776394206611
 	-ABSE: 22.925436026206864



In [19]:
degrees = 5

for deg in range(1,degrees):
    pipeline = Pipeline([
        ('poly', PolynomialFeatures(degree=deg)),
        ('regressor', LinearRegression())
    ])

    pipeline.fit(X1_train, y1_train)

    y_pred = pipeline.predict(X1_test)
    print(f"Degree:{deg}")
    check_score(y1_test, y_pred)

Degree:1
Testing model:
 	-r2: 0.4591027157842634
 	-MSE: 1147.877639420663
 	-ABSE: 22.92543602620693

Degree:2
Testing model:
 	-r2: 0.6396706217017906
 	-MSE: 764.6812957742528
 	-ABSE: 18.41852643124198

Degree:3
Testing model:
 	-r2: 0.7136988712721845
 	-MSE: 607.5805395918368
 	-ABSE: 16.505627588420165

Degree:4
Testing model:
 	-r2: 0.6824320391076881
 	-MSE: 673.9341681725068
 	-ABSE: 17.260104906130508



> Not bad, but we can see overtraining later, but still 0.71 r2, and nice Absolute error, seems ok, neural network should work fine then but its for sure overtrain

In [20]:
degrees = 5

for deg in range(1,degrees):
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ('poly', PolynomialFeatures(degree=deg)),
        ('regressor', LinearRegression())
    ])

    pipeline.fit(X1_train, y1_train)

    y_pred = pipeline.predict(X1_test)
    print(f"Degree:{deg}")
    check_score(y1_test, y_pred)

Degree:1
Testing model:
 	-r2: 0.4591027157842632
 	-MSE: 1147.8776394206634
 	-ABSE: 22.92543602620697

Degree:2
Testing model:
 	-r2: 0.6396706203056758
 	-MSE: 764.6812987370499
 	-ABSE: 18.418526449626658

Degree:3
Testing model:
 	-r2: 0.7896455144635877
 	-MSE: 446.4086201674814
 	-ABSE: 14.319722865546298

Degree:4
Testing model:
 	-r2: -22242.55880631237
 	-MSE: 47204681.03648449
 	-ABSE: 249.45104834107022



> with scaler even better results

# SVR??

In [21]:
model = SVR(kernel='rbf', degree=4,epsilon=0.01) # rbf kernel, eps = 0.1
model.fit(X1_train, y1_train)
y_pred = model.predict(X1_test)
check_score(y1_test, y_pred)

Testing model:
 	-r2: -0.01555793397622085
 	-MSE: 2155.1896782728095
 	-ABSE: 25.902865798879013



> but we know SVM doesnt like non scaled data, so lets try scale a little bit and play with different kind of transformations

In [22]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', SVR(kernel='rbf'))
])

pipeline.fit(X1_train, y1_train)

y_pred = pipeline.predict(X1_test)
print("SVR-rbf, standarded scaled:")
check_score(y1_test, y_pred)

pipeline = Pipeline([
    ('scaler', RobustScaler()),
    ('regressor', SVR(kernel='rbf'))
])

pipeline.fit(X1_train, y1_train)

y_pred = pipeline.predict(X1_test)
print("SVR-rbf, robust scaled:")
check_score(y1_test, y_pred)

degrees = 6
for deg in range(1,degrees):
    pipeline = Pipeline([
        ('scaler', RobustScaler()),
        ('regressor', SVR(kernel='poly',degree=deg,epsilon=0.1))
    ])

    pipeline.fit(X1_train, y1_train)

    y_pred = pipeline.predict(X1_test)
    print(f"SVR-poly{deg}, standarded scaled:")
    check_score(y1_test, y_pred)


SVR-rbf, standarded scaled:
Testing model:
 	-r2: 0.5276974161081092
 	-MSE: 1002.3078150155193
 	-ABSE: 16.137221281004557

SVR-rbf, robust scaled:
Testing model:
 	-r2: 0.5384129719211853
 	-MSE: 979.5675512524491
 	-ABSE: 15.98917765010838

SVR-poly1, standarded scaled:
Testing model:
 	-r2: 0.32526879676975873
 	-MSE: 1431.896375538984
 	-ABSE: 20.701716838883893

SVR-poly2, standarded scaled:
Testing model:
 	-r2: 0.5208646529025067
 	-MSE: 1016.8081209479839
 	-ABSE: 17.476607934771867

SVR-poly3, standarded scaled:
Testing model:
 	-r2: 0.507665485310903
 	-MSE: 1044.8190386942906
 	-ABSE: 17.69653626137258

SVR-poly4, standarded scaled:
Testing model:
 	-r2: 0.49332670217662977
 	-MSE: 1075.2484178326958
 	-ABSE: 17.839760493486427

SVR-poly5, standarded scaled:
Testing model:
 	-r2: 0.44613954200306305
 	-MSE: 1175.3877374625479
 	-ABSE: 18.76410091576881



> scaling is all you need somethimes(expecially in SVM, where you base on vectors reliationship). So the final best model is xgboost trees regressors working very well for this kind of data.

# <span style="color:lightgreen;">First tree tuning

In [33]:
model = XGBRegressor(n_estimators=1000, learning_rate=0.09)
model.fit(X1_train, y1_train)
y_pred = model.predict(X1_test)
check_score(y1_test, y_pred)

Testing model:
 	-r2: 0.9381145238876343
 	-MSE: 131.33175659179688
 	-ABSE: 7.101688385009766

