In [1]:
import pandas as pd
import numpy as np

from sklearn import linear_model
from sklearn.model_selection import train_test_split, KFold
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error

import catboost as cb
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Seed NumPy's global random number generator so NumPy-driven randomness repeats across runs.
# NOTE(review): libraries that keep their own seed parameter may not be covered by this — confirm.
np.random.seed(42)

In [2]:
# Read the training features, training targets and test features from disk.
xtrain, ytrain, xtest = map(
    pd.read_csv,
    ("data/XTrain.csv", "data/yTrain.csv", "data/XTest.csv"),
)

In [3]:
# Preview the first rows of the freshly loaded training features.
xtrain.head()

Unnamed: 0,Index,Date,Hour,Temperature(�C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(�C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,0,26/04/2018,5,10.2,73,0.8,1137,5.5,0.0,0.0,0.0,Spring,No Holiday,Yes
1,1,2/4/2018,7,15.1,80,1.0,623,11.6,0.01,0.0,0.0,Spring,No Holiday,Yes
2,2,25/05/2018,8,17.1,57,1.7,792,8.5,1.21,0.0,0.0,Spring,No Holiday,Yes
3,3,8/4/2018,22,2.7,88,2.0,621,0.9,0.0,0.0,0.0,Spring,No Holiday,Yes
4,4,6/4/2018,17,6.3,50,4.4,707,-3.3,0.64,0.0,0.0,Spring,No Holiday,Yes


# data preprocessing

In [4]:
def date_mapper(d, i):
    """Return one component of a '/'-separated date string as a zero-based int.

    Parameters
    ----------
    d : str
        Date text whose parts are separated by '/', e.g. '26/04/2018'.
    i : int
        Position of the wanted part after splitting on '/'.

    Returns
    -------
    int
        The selected part converted to an integer, minus one.
    """
    return int(d.split('/')[i]) - 1

def season_mapper(s):
    """Translate a season name into its integer code.

    Codes follow the order Spring=0, Summer=1, Autumn=2, Winter=3.
    Any other value raises KeyError.
    """
    season_codes = dict(zip(('Spring', 'Summer', 'Autumn', 'Winter'), range(4)))
    return season_codes[s]

def holiday_mapper(h):
    """Encode the holiday flag: 0 for the exact text "No Holiday", 1 for anything else."""
    if h == "No Holiday":
        return 0
    return 1

def functioningDay_mapper(f):
    """Encode the functioning-day flag: 0 for the exact text "No", 1 for anything else."""
    if f == "No":
        return 0
    return 1

# Split the 'Date' text (day/month/year) into three integer columns, applying
# the same transformation to the training and the test frame.
for part_column, part_position in (('DateDay', 0), ('DateMonth', 1), ('DateYear', 2)):
    xtrain[part_column] = xtrain.Date.apply(date_mapper, args=(part_position,))
    xtest[part_column] = xtest.Date.apply(date_mapper, args=(part_position,))

# The original text column is no longer needed once its parts are extracted.
xtrain, xtest = (frame.drop(columns=['Date']) for frame in (xtrain, xtest))

# Replace the remaining text columns with their integer codes.
for frame in (xtrain, xtest):
    frame['Seasons'] = frame['Seasons'].apply(season_mapper)
    frame['Holiday'] = frame['Holiday'].apply(holiday_mapper)
    frame['Functioning Day'] = frame['Functioning Day'].apply(functioningDay_mapper)

In [5]:
# Columns that are rescaled; every other column is left untouched.
numerical_columns = [
    'Temperature(�C)',
    'Humidity(%)',
    'Wind speed (m/s)',
    'Visibility (10m)',
    'Dew point temperature(�C)',
    'Solar Radiation (MJ/m2)',
    'Rainfall(mm)',
    'Snowfall (cm)',
]

# The scaler is fitted on the training frame only and then applied, unchanged,
# to both frames. (MinMaxScaler is the imported alternative.)
scaler = RobustScaler()
scaler.fit(xtrain[numerical_columns])
scaled_train = scaler.transform(xtrain[numerical_columns])
scaled_test = scaler.transform(xtest[numerical_columns])

# Write each scaled column back over its unscaled counterpart.
for column, train_values, test_values in zip(numerical_columns, scaled_train.T, scaled_test.T):
    xtrain[column] = train_values
    xtest[column] = test_values

In [6]:
# Discard the leading column of every frame, keeping all remaining columns.
# NOTE(review): this cell is position-based, so running it a second time removes
# one more column — re-run the notebook from the top instead of repeating it.
# NOTE(review): for xtrain only 'Unnamed: 0' is removed here; 'Index' stays in
# the frame and is therefore fed to the models as a feature — confirm intended.
xtrain = xtrain.iloc[:, 1:].copy()
xtest = xtest.iloc[:, 1:].copy()
ytrain = ytrain.iloc[:, 1:].copy()

In [7]:
# xtrain, xval, ytrain, yval = train_test_split(xtrain[xtrain.columns[1:]], ytrain[ytrain.columns[1]], test_size = 0.2)

In [8]:
# Number of rows in the training feature frame.
len(xtrain)

6132

# polynomial regression

In [None]:
# 5-fold cross-validation of a degree-3 polynomial linear regression.
# PolynomialFeatures and linear_model are imported in the notebook's import cell.
# Per fold:
#   * rows with 'Functioning Day' == 0 are removed from the training part,
#   * 'Dew point temperature(�C)' and 'Functioning Day' are not used as features,
#   * predictions for validation rows with 'Functioning Day' == 0 are forced to 0.
model = linear_model.LinearRegression()

excluded_columns = ['Dew point temperature(�C)', 'Functioning Day']

kf = KFold(n_splits=5)
r2_scores = []

for train_index, val_index in kf.split(xtrain):
    xtrainfold = xtrain.iloc[train_index]
    ytrainfold = ytrain.iloc[train_index]
    xvalfold = xtrain.iloc[val_index]
    yvalfold = ytrain.iloc[val_index]

    # Keep only training rows from functioning days.
    train_functioning = (xtrainfold['Functioning Day'] != 0).to_numpy()
    xtrainfold = xtrainfold.loc[train_functioning].drop(columns=excluded_columns)
    ytrainfold = ytrainfold.loc[train_functioning]

    # Remember which validation rows must be predicted as zero before the
    # flag column is dropped from the features.
    val_not_functioning = (xvalfold['Functioning Day'] == 0).to_numpy()
    xvalfold = xvalfold.drop(columns=excluded_columns)

    # The polynomial expansion is fitted on the training part of the fold only.
    poly = PolynomialFeatures(degree=3)
    xtrainfold = poly.fit_transform(xtrainfold)
    xvalfold = poly.transform(xvalfold)

    model.fit(xtrainfold, ytrainfold)

    pred = model.predict(xvalfold)
    pred[val_not_functioning] = 0

    rmse = np.sqrt(mean_squared_error(yvalfold.to_numpy(), pred))
    r2 = r2_score(yvalfold.to_numpy(), pred)

    r2_scores.append(r2)
    print('Testing performance')
    print('RMSE: {:.2f}'.format(rmse))
    print('R2: {:.2f}'.format(r2))

np.mean(r2_scores), np.std(r2_scores)

# gradient boosting regressor

In [None]:
# Hyperparameters for the scikit-learn gradient boosting regressor.
params = dict(
    learning_rate=0.01,
    loss='squared_error',
    max_depth=6,
    n_estimators=1000,
)

model = GradientBoostingRegressor(**params)

In [None]:
# 5-fold cross-validation of `model` on all columns of xtrain.
# Array conversions are done once up front; the squeezed targets are used for
# fitting and the un-squeezed ones for scoring.
feature_matrix = xtrain.to_numpy()
target_matrix = ytrain.to_numpy()
target_vector = target_matrix.squeeze()

kf = KFold(n_splits=5)
r2_scores = []

for train_index, val_index in kf.split(xtrain):
    model.fit(feature_matrix[train_index], target_vector[train_index])

    pred = model.predict(feature_matrix[val_index])
    actual = target_matrix[val_index]
    rmse = np.sqrt(mean_squared_error(actual, pred))
    r2 = r2_score(actual, pred)
    r2_scores.append(r2)

    print(
        'Testing performance',
        'RMSE: {:.2f}'.format(rmse),
        'R2: {:.2f}'.format(r2),
        sep='\n',
    )

np.mean(r2_scores), np.std(r2_scores)

# lightgbm

In [None]:
# LightGBM regressor constructed without arguments, i.e. with the library's defaults.
model = LGBMRegressor()

In [None]:
# 5-fold cross-validation of `model` on all columns of xtrain.
# Array conversions are done once up front; the squeezed targets are used for
# fitting and the un-squeezed ones for scoring.
feature_matrix = xtrain.to_numpy()
target_matrix = ytrain.to_numpy()
target_vector = target_matrix.squeeze()

kf = KFold(n_splits=5)
r2_scores = []

for train_index, val_index in kf.split(xtrain):
    model.fit(feature_matrix[train_index], target_vector[train_index])

    pred = model.predict(feature_matrix[val_index])
    actual = target_matrix[val_index]
    rmse = np.sqrt(mean_squared_error(actual, pred))
    r2 = r2_score(actual, pred)
    r2_scores.append(r2)

    print(
        'Testing performance',
        'RMSE: {:.2f}'.format(rmse),
        'R2: {:.2f}'.format(r2),
        sep='\n',
    )

np.mean(r2_scores), np.std(r2_scores)

# catboost

In [9]:
# Hyperparameters for the CatBoost regressor, trained with the RMSE loss.
params = dict(depth=12, l2_leaf_reg=1, iterations=1000, learning_rate=0.1)

model = cb.CatBoostRegressor(loss_function='RMSE', **params)

# grid = {'iterations': [100, 150, 200, 500, 1000],
#         'learning_rate': [0.03, 0.1],
#         'depth': [2, 4, 6, 8, 12],
#         'l2_leaf_reg': [0.2, 0.5, 1, 3]}
# model.grid_search(grid, train_dataset)

In [10]:
# 5-fold cross-validation of the CatBoost `model`.
# Per fold:
#   * rows with 'Functioning Day' == 0 are removed from the training part,
#   * 'Dew point temperature(�C)' and 'Functioning Day' are not used as features,
#   * predictions for validation rows with 'Functioning Day' == 0 are forced to 0.
excluded_columns = ['Dew point temperature(�C)', 'Functioning Day']

kf = KFold(n_splits=5)
r2_scores = []

for train_index, val_index in kf.split(xtrain):
    fold_train_x, fold_train_y = xtrain.iloc[train_index], ytrain.iloc[train_index]
    fold_val_x, fold_val_y = xtrain.iloc[val_index], ytrain.iloc[val_index]

    # Train only on rows from functioning days.
    keep_rows = (fold_train_x['Functioning Day'] != 0).to_numpy()
    fit_x = fold_train_x.loc[keep_rows].drop(columns=excluded_columns)
    fit_y = fold_train_y.loc[keep_rows]

    # Capture the zero-prediction rows before the flag column is dropped.
    force_zero = (fold_val_x['Functioning Day'] == 0).to_numpy()
    score_x = fold_val_x.drop(columns=excluded_columns)

    # All features are numeric at this point, so plain arrays are passed on.
    model.fit(cb.Pool(fit_x.to_numpy(), fit_y.to_numpy()))

    pred = model.predict(score_x.to_numpy())
    pred[force_zero] = 0

    actual = fold_val_y.to_numpy()
    rmse = np.sqrt(mean_squared_error(actual, pred))
    r2 = r2_score(actual, pred)
    r2_scores.append(r2)

    print(
        'Testing performance',
        'RMSE: {:.2f}'.format(rmse),
        'R2: {:.2f}'.format(r2),
        sep='\n',
    )

np.mean(r2_scores), np.std(r2_scores)

0:	learn: 599.4944550	total: 350ms	remaining: 5m 49s
1:	learn: 560.3095503	total: 520ms	remaining: 4m 19s
2:	learn: 524.2504744	total: 680ms	remaining: 3m 45s
3:	learn: 493.5368595	total: 839ms	remaining: 3m 28s
4:	learn: 465.2495127	total: 1.03s	remaining: 3m 24s
5:	learn: 439.4882588	total: 1.11s	remaining: 3m 3s
6:	learn: 414.2452657	total: 1.26s	remaining: 2m 58s
7:	learn: 390.5585191	total: 1.41s	remaining: 2m 54s
8:	learn: 371.8477299	total: 1.56s	remaining: 2m 51s
9:	learn: 353.7745870	total: 1.7s	remaining: 2m 48s
10:	learn: 338.9470630	total: 1.84s	remaining: 2m 45s
11:	learn: 325.0358874	total: 1.99s	remaining: 2m 43s
12:	learn: 315.2009925	total: 2.01s	remaining: 2m 32s
13:	learn: 302.9201486	total: 2.15s	remaining: 2m 31s
14:	learn: 293.2470482	total: 2.3s	remaining: 2m 31s
15:	learn: 282.9043535	total: 2.44s	remaining: 2m 30s
16:	learn: 273.1696321	total: 2.61s	remaining: 2m 31s
17:	learn: 265.8297480	total: 2.75s	remaining: 2m 30s
18:	learn: 259.2339331	total: 2.9s	remain

(0.9003759587753096, 0.0052143568699641605)

In [11]:
# Train CatBoost on the full training set and write test-set predictions.
# The same rules as in cross-validation apply: rows with 'Functioning Day' == 0
# are left out of training and are predicted as 0, and the two columns below
# are not used as features.
excluded_columns = ['Dew point temperature(�C)', 'Functioning Day']

train_functioning = (xtrain['Functioning Day'] != 0).to_numpy()
final_train_x = xtrain.loc[train_functioning].drop(columns=excluded_columns)
final_train_y = ytrain.loc[train_functioning]

# A fresh model, so nothing fitted during cross-validation is carried over.
params = dict(depth=12, l2_leaf_reg=1, iterations=1000, learning_rate=0.1)
model = cb.CatBoostRegressor(loss_function='RMSE', **params)
model.fit(cb.Pool(final_train_x.to_numpy(), final_train_y.to_numpy()))

test_not_functioning = (xtest['Functioning Day'] == 0).to_numpy()
pred = model.predict(xtest.drop(columns=excluded_columns).to_numpy())
pred[test_not_functioning] = 0

# write predictions to file
# NOTE(review): 'Index' is regenerated as 0..n-1 instead of being read from
# xtest — confirm that this matches the identifiers expected in the submission.
submission = pd.DataFrame({'Index': range(len(pred)), 'Rented Bike Count': pred})
submission.to_csv('data/submission.csv', index=False)

0:	learn: 599.9285311	total: 140ms	remaining: 2m 20s
1:	learn: 560.1519656	total: 309ms	remaining: 2m 34s
2:	learn: 524.4195489	total: 450ms	remaining: 2m 29s
3:	learn: 494.1013646	total: 610ms	remaining: 2m 31s
4:	learn: 464.7610461	total: 780ms	remaining: 2m 35s
5:	learn: 439.1317367	total: 880ms	remaining: 2m 25s
6:	learn: 414.1435319	total: 1.03s	remaining: 2m 26s
7:	learn: 390.8014140	total: 1.18s	remaining: 2m 26s
8:	learn: 372.4522401	total: 1.33s	remaining: 2m 26s
9:	learn: 354.8103238	total: 1.49s	remaining: 2m 27s
10:	learn: 339.4221345	total: 1.63s	remaining: 2m 26s
11:	learn: 325.8294845	total: 1.79s	remaining: 2m 27s
12:	learn: 312.9469103	total: 1.94s	remaining: 2m 27s
13:	learn: 303.1315464	total: 2.11s	remaining: 2m 28s
14:	learn: 293.8217985	total: 2.27s	remaining: 2m 29s
15:	learn: 284.3099604	total: 2.42s	remaining: 2m 28s
16:	learn: 276.2622301	total: 2.57s	remaining: 2m 28s
17:	learn: 269.4100267	total: 2.71s	remaining: 2m 27s
18:	learn: 263.3180453	total: 2.88s	re

# neural network

In [None]:
# One-hot encoder for the categorical inputs of the neural-network experiment.
# OneHotEncoder itself is imported in the notebook's import cell.
enc = OneHotEncoder()

In [None]:
# 5-fold cross-validation of an MLP regressor on one-hot encoded categorical
# columns concatenated with the scaled numerical columns.
# MLPRegressor is imported in the notebook's import cell.

# The raw 'Date' column is split into DateDay/DateMonth/DateYear and dropped
# during preprocessing, so the derived date parts are encoded in its place.
categorical_columns = ['DateDay', 'DateMonth', 'DateYear', 'Hour', 'Seasons']

# The design matrix does not depend on the fold, so it is built once.
# .toarray() yields a plain ndarray; .todense() would yield np.matrix, which
# scikit-learn estimators do not accept.
xencodedtrain = np.concatenate(
    [
        enc.fit_transform(xtrain[categorical_columns]).toarray(),
        xtrain[numerical_columns].to_numpy(),
    ],
    axis=1,
)
targets = ytrain.to_numpy().squeeze()

kf = KFold(n_splits=5)
r2_scores = []

for train_index, val_index in kf.split(xtrain):
    regr = MLPRegressor(
        hidden_layer_sizes=2048,
        max_iter=2000,
        learning_rate_init=0.01,
        learning_rate='adaptive',
        early_stopping=True,
    ).fit(xencodedtrain[train_index], targets[train_index])

    pred = regr.predict(xencodedtrain[val_index])
    rmse = np.sqrt(mean_squared_error(targets[val_index], pred))
    r2 = r2_score(targets[val_index], pred)

    r2_scores.append(r2)
    print('Testing performance')
    print('RMSE: {:.2f}'.format(rmse))
    print('R2: {:.2f}'.format(r2))

In [None]:
np.mean(r2_scores), np.std(r2_scores)