In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.metrics import r2_score, mean_squared_error

import catboost as cb
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor

np.random.seed(42)

In [2]:
xtrain = pd.read_csv("data/XTrain.csv")
ytrain = pd.read_csv("data/yTrain.csv")
xtest = pd.read_csv("data/XTest.csv")

In [3]:
xtrain.head()

Unnamed: 0,Index,Date,Hour,Temperature(�C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(�C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,0,26/04/2018,5,10.2,73,0.8,1137,5.5,0.0,0.0,0.0,Spring,No Holiday,Yes
1,1,2/4/2018,7,15.1,80,1.0,623,11.6,0.01,0.0,0.0,Spring,No Holiday,Yes
2,2,25/05/2018,8,17.1,57,1.7,792,8.5,1.21,0.0,0.0,Spring,No Holiday,Yes
3,3,8/4/2018,22,2.7,88,2.0,621,0.9,0.0,0.0,0.0,Spring,No Holiday,Yes
4,4,6/4/2018,17,6.3,50,4.4,707,-3.3,0.64,0.0,0.0,Spring,No Holiday,Yes


# data preprocessing

In [4]:
def date_mapper(d, i):
    d = d.split('/')
    # return d[1]
    return int(d[i])-1

def season_mapper(s):
    return {season: i for i, season in enumerate(['Spring', 'Summer', 'Autumn', 'Winter'])}[s]

def holiday_mapper(h):
    return 0 if h=="No Holiday" else 1

def functioningDay_mapper(f):
    return 0 if f=="No" else 1

xtrain['DateDay'] = xtrain.Date.apply(lambda d: date_mapper(d, 0))
xtest['DateDay'] = xtest.Date.apply(lambda d: date_mapper(d, 0))

xtrain['DateMonth'] = xtrain.Date.apply(lambda d: date_mapper(d, 1))
xtest['DateMonth'] = xtest.Date.apply(lambda d: date_mapper(d, 1))

xtrain['DateYear'] = xtrain.Date.apply(lambda d: date_mapper(d, 2))
xtest['DateYear'] = xtest.Date.apply(lambda d: date_mapper(d, 2))

xtrain = xtrain.drop(columns=['Date'])
xtest = xtest.drop(columns=['Date'])

xtrain.Seasons = xtrain.Seasons.apply(season_mapper)
xtest.Seasons = xtest.Seasons.apply(season_mapper)

xtrain.Holiday = xtrain.Holiday.apply(holiday_mapper)
xtest.Holiday = xtest.Holiday.apply(holiday_mapper)

xtrain['Functioning Day'] = xtrain['Functioning Day'].apply(functioningDay_mapper)
xtest['Functioning Day'] = xtest['Functioning Day'].apply(functioningDay_mapper)

In [5]:
# scaler = MinMaxScaler()
scaler = RobustScaler()

numerical_columns = [
    'Temperature(�C)', 'Humidity(%)', 'Wind speed (m/s)', 'Visibility (10m)', 
    'Dew point temperature(�C)', 'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)'
]

scaler.fit(xtrain[numerical_columns])
transformed_xtrain = scaler.transform(xtrain[numerical_columns])
transformed_xtest = scaler.transform(xtest[numerical_columns])

for i, c in enumerate(numerical_columns):
    xtrain[c] = transformed_xtrain[:, i]
    xtest[c] = transformed_xtest[:, i]

In [6]:
xtrain = xtrain[xtrain.columns[1:]]
xtest = xtest[xtest.columns[1:]]
ytrain = ytrain[ytrain.columns[1:]]

In [7]:
# xtrain, xval, ytrain, yval = train_test_split(xtrain[xtrain.columns[1:]], ytrain[ytrain.columns[1]], test_size = 0.2)

In [8]:
len(xtrain)

6132

# polynomial regression

# gradient boosting regressor

In [39]:
params = {'learning_rate': 0.01, 'loss': 'squared_error', 'max_depth': 6, 'n_estimators': 1000}

model = GradientBoostingRegressor(**params)

In [40]:
kf = KFold(n_splits=5)
r2_scores = []

for train_index, val_index in kf.split(xtrain):
    model.fit(xtrain.to_numpy()[train_index], ytrain.to_numpy().squeeze()[train_index])

    pred = model.predict(xtrain.to_numpy()[val_index])
    rmse = (np.sqrt(mean_squared_error(ytrain.to_numpy()[val_index], pred)))
    r2 = r2_score(ytrain.to_numpy()[val_index], pred)

    r2_scores.append(r2)
    print('Testing performance')
    print('RMSE: {:.2f}'.format(rmse))
    print('R2: {:.2f}'.format(r2))

np.mean(r2_scores), np.std(r2_scores)



Testing performance
RMSE: 220.77
R2: 0.88




Testing performance
RMSE: 221.18
R2: 0.88




Testing performance
RMSE: 216.65
R2: 0.89




Testing performance
RMSE: 227.12
R2: 0.88




Testing performance
RMSE: 212.98
R2: 0.89


In [None]:
# make predictions on test set
train_dataset = cb.Pool(xtrain.to_numpy(), ytrain.to_numpy())

params= {'depth': 12,
  'l2_leaf_reg': 1,
  'iterations': 1000,
  'learning_rate': 0.1}
model = cb.CatBoostRegressor(loss_function='RMSE', **params)

model.fit(train_dataset)
pred = model.predict(xtest.to_numpy())

# write predictions to file
pd.DataFrame(list(zip(range(len(pred)), pred)), columns=['Index', 'Rented Bike Count']).to_csv('data/submission.csv', index=False)

0:	learn: 598.5695119	total: 30.6ms	remaining: 30.5s
1:	learn: 558.1442624	total: 67.1ms	remaining: 33.5s
2:	learn: 521.4617772	total: 104ms	remaining: 34.7s
3:	learn: 488.2960875	total: 136ms	remaining: 33.8s
4:	learn: 458.6508028	total: 169ms	remaining: 33.7s
5:	learn: 433.5303648	total: 201ms	remaining: 33.3s
6:	learn: 409.9317004	total: 233ms	remaining: 33.1s
7:	learn: 389.2159545	total: 266ms	remaining: 33s
8:	learn: 370.9577296	total: 300ms	remaining: 33.1s
9:	learn: 353.7020595	total: 337ms	remaining: 33.4s
10:	learn: 338.8229057	total: 373ms	remaining: 33.6s
11:	learn: 325.9392489	total: 406ms	remaining: 33.4s
12:	learn: 314.3631887	total: 438ms	remaining: 33.2s
13:	learn: 302.6096595	total: 471ms	remaining: 33.2s
14:	learn: 293.1630934	total: 503ms	remaining: 33.1s
15:	learn: 284.4931573	total: 534ms	remaining: 32.8s
16:	learn: 276.0798781	total: 566ms	remaining: 32.7s
17:	learn: 268.1087630	total: 595ms	remaining: 32.5s
18:	learn: 262.1422858	total: 630ms	remaining: 32.5s
19:

# lightgbm

In [35]:
model = LGBMRegressor()

In [36]:
kf = KFold(n_splits=5)
r2_scores = []

for train_index, val_index in kf.split(xtrain):
    model.fit(xtrain.to_numpy()[train_index], ytrain.to_numpy().squeeze()[train_index])
    
    pred = model.predict(xtrain.to_numpy()[val_index])
    rmse = (np.sqrt(mean_squared_error(ytrain.to_numpy()[val_index], pred)))
    r2 = r2_score(ytrain.to_numpy()[val_index], pred)

    r2_scores.append(r2)
    print('Testing performance')
    print('RMSE: {:.2f}'.format(rmse))
    print('R2: {:.2f}'.format(r2))

np.mean(r2_scores), np.std(r2_scores)

Testing performance
RMSE: 225.08
R2: 0.88
Testing performance
RMSE: 220.58
R2: 0.89
Testing performance
RMSE: 213.79
R2: 0.89
Testing performance
RMSE: 224.35
R2: 0.88
Testing performance
RMSE: 210.26
R2: 0.89


In [None]:
# make predictions on test set
train_dataset = cb.Pool(xtrain.to_numpy(), ytrain.to_numpy())

params= {'depth': 12,
  'l2_leaf_reg': 1,
  'iterations': 1000,
  'learning_rate': 0.1}
model = cb.CatBoostRegressor(loss_function='RMSE', **params)

model.fit(train_dataset)
pred = model.predict(xtest.to_numpy())

# write predictions to file
pd.DataFrame(list(zip(range(len(pred)), pred)), columns=['Index', 'Rented Bike Count']).to_csv('data/submission.csv', index=False)

0:	learn: 598.5695119	total: 30.6ms	remaining: 30.5s
1:	learn: 558.1442624	total: 67.1ms	remaining: 33.5s
2:	learn: 521.4617772	total: 104ms	remaining: 34.7s
3:	learn: 488.2960875	total: 136ms	remaining: 33.8s
4:	learn: 458.6508028	total: 169ms	remaining: 33.7s
5:	learn: 433.5303648	total: 201ms	remaining: 33.3s
6:	learn: 409.9317004	total: 233ms	remaining: 33.1s
7:	learn: 389.2159545	total: 266ms	remaining: 33s
8:	learn: 370.9577296	total: 300ms	remaining: 33.1s
9:	learn: 353.7020595	total: 337ms	remaining: 33.4s
10:	learn: 338.8229057	total: 373ms	remaining: 33.6s
11:	learn: 325.9392489	total: 406ms	remaining: 33.4s
12:	learn: 314.3631887	total: 438ms	remaining: 33.2s
13:	learn: 302.6096595	total: 471ms	remaining: 33.2s
14:	learn: 293.1630934	total: 503ms	remaining: 33.1s
15:	learn: 284.4931573	total: 534ms	remaining: 32.8s
16:	learn: 276.0798781	total: 566ms	remaining: 32.7s
17:	learn: 268.1087630	total: 595ms	remaining: 32.5s
18:	learn: 262.1422858	total: 630ms	remaining: 32.5s
19:

# catboost

In [109]:
params= {'depth': 12,
  'l2_leaf_reg': 1,
  'iterations': 1000,
  'learning_rate': 0.1}

model = cb.CatBoostRegressor(loss_function='RMSE', **params)

# grid = {'iterations': [100, 150, 200],
#         'learning_rate': [0.03, 0.1],
#         'depth': [2, 4, 6, 8],
#         'l2_leaf_reg': [0.2, 0.5, 1, 3]}
# model.grid_search(grid, train_dataset)

In [110]:
kf = KFold(n_splits=5)
r2_scores = []

for train_index, val_index in kf.split(xtrain):
    xtrainfold = xtrain.iloc[train_index]
    ytrainfold = ytrain.iloc[train_index]
    xvalfold = xtrain.iloc[val_index]
    yvalfold = ytrain.iloc[val_index]

    rows_to_delete = xtrainfold[xtrainfold['Functioning Day']==0].index
    xtrainfold = xtrainfold.drop(rows_to_delete)
    ytrainfold = ytrainfold.drop(rows_to_delete)
    xtrainfold = xtrainfold.drop(columns=['Dew point temperature(�C)', 'Functioning Day'])
    
    rows_to_set_zero = [i for i, b in enumerate(xvalfold['Functioning Day']==0) if b]
    xvalfold = xvalfold.drop(columns=['Dew point temperature(�C)', 'Functioning Day'])

    # if data has all numerical fields
    train_dataset = cb.Pool(xtrainfold.to_numpy(), ytrainfold.to_numpy())

    model.fit(train_dataset)

    pred = model.predict(xvalfold.to_numpy())
    pred[rows_to_set_zero] = 0
    
    rmse = (np.sqrt(mean_squared_error(yvalfold.to_numpy(), pred)))
    r2 = r2_score(yvalfold.to_numpy(), pred)

    r2_scores.append(r2)
    print('Testing performance')
    print('RMSE: {:.2f}'.format(rmse))
    print('R2: {:.2f}'.format(r2))

np.mean(r2_scores), np.std(r2_scores)

0:	learn: 599.4944550	total: 32.2ms	remaining: 32.2s
1:	learn: 560.3095503	total: 68.8ms	remaining: 34.3s
2:	learn: 524.2504744	total: 102ms	remaining: 33.8s
3:	learn: 493.5368595	total: 135ms	remaining: 33.6s
4:	learn: 465.2495127	total: 167ms	remaining: 33.2s
5:	learn: 439.4882588	total: 185ms	remaining: 30.7s
6:	learn: 414.2452657	total: 216ms	remaining: 30.6s
7:	learn: 390.5585191	total: 251ms	remaining: 31.1s
8:	learn: 371.8477299	total: 287ms	remaining: 31.5s
9:	learn: 353.7745870	total: 319ms	remaining: 31.6s
10:	learn: 338.9470630	total: 350ms	remaining: 31.4s
11:	learn: 325.0358874	total: 380ms	remaining: 31.3s
12:	learn: 315.2009925	total: 384ms	remaining: 29.1s
13:	learn: 302.9201486	total: 420ms	remaining: 29.6s
14:	learn: 293.2470482	total: 461ms	remaining: 30.3s
15:	learn: 282.9043535	total: 501ms	remaining: 30.8s
16:	learn: 273.1696321	total: 533ms	remaining: 30.8s
17:	learn: 265.8297480	total: 571ms	remaining: 31.1s
18:	learn: 259.2339331	total: 603ms	remaining: 31.1s
1

(0.9003759587753096, 0.0052143568699641605)

In [111]:
# make predictions on test set
xtrainfold = xtrain
ytrainfold = ytrain
xtestfold = xtest

rows_to_delete = xtrainfold[xtrainfold['Functioning Day']==0].index
xtrainfold = xtrainfold.drop(rows_to_delete)
ytrainfold = ytrainfold.drop(rows_to_delete)
xtrainfold = xtrainfold.drop(columns=['Dew point temperature(�C)', 'Functioning Day'])
train_dataset = cb.Pool(xtrainfold.to_numpy(), ytrainfold.to_numpy())

params= {'depth': 12,
  'l2_leaf_reg': 1,
  'iterations': 1000,
  'learning_rate': 0.1}
model = cb.CatBoostRegressor(loss_function='RMSE', **params)

model.fit(train_dataset)

rows_to_set_zero = [i for i, b in enumerate(xtestfold['Functioning Day']==0) if b]
xtestfold = xtestfold.drop(columns=['Dew point temperature(�C)', 'Functioning Day'])
pred = model.predict(xtestfold.to_numpy())
pred[rows_to_set_zero] = 0

# write predictions to file
pd.DataFrame(list(zip(range(len(pred)), pred)), columns=['Index', 'Rented Bike Count']).to_csv('data/submission.csv', index=False)

0:	learn: 599.9285311	total: 30.6ms	remaining: 30.5s
1:	learn: 560.1519656	total: 63.6ms	remaining: 31.7s
2:	learn: 524.4195489	total: 100ms	remaining: 33.3s
3:	learn: 494.1013646	total: 138ms	remaining: 34.2s
4:	learn: 464.7610461	total: 169ms	remaining: 33.7s
5:	learn: 439.1317367	total: 189ms	remaining: 31.3s
6:	learn: 414.1435319	total: 222ms	remaining: 31.4s
7:	learn: 390.8014140	total: 260ms	remaining: 32.2s
8:	learn: 372.4522401	total: 299ms	remaining: 32.9s
9:	learn: 354.8103238	total: 332ms	remaining: 32.9s
10:	learn: 339.4221345	total: 366ms	remaining: 32.9s
11:	learn: 325.8294845	total: 396ms	remaining: 32.6s
12:	learn: 312.9469103	total: 426ms	remaining: 32.4s
13:	learn: 303.1315464	total: 458ms	remaining: 32.2s
14:	learn: 293.8217985	total: 492ms	remaining: 32.3s
15:	learn: 284.3099604	total: 524ms	remaining: 32.3s
16:	learn: 276.2622301	total: 555ms	remaining: 32.1s
17:	learn: 269.4100267	total: 587ms	remaining: 32s
18:	learn: 263.3180453	total: 617ms	remaining: 31.8s
19:

# neural network

In [28]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()

xencodedtrain = np.concatenate([
    enc.fit_transform(xtrain[['DateDay', 'DateMonth', 'DateYear', 'Hour', 'Seasons']]).todense(), 
    xtrain[numerical_columns].to_numpy(),
    xtrain[['Holiday', 'Dew point temperature(�C)', 'Functioning Day']]], 
    axis=1)
xencodedtest = np.concatenate([
    enc.transform(xtrain[['DateDay', 'DateMonth', 'DateYear', 'Hour', 'Seasons']]).todense(), 
    xtrain[numerical_columns].to_numpy(),
    xtrain[['Holiday', 'Dew point temperature(�C)', 'Functioning Day']]], 
    axis=1)

In [29]:
xencodedtrain.shape

(6132, 84)

In [30]:
import torch
import torch.nn as nn

torch.manual_seed(42)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [71]:
kf = KFold(n_splits=5)
r2_scores = []

for train_index, val_index in kf.split(xtrain):
    xtrainfold = xencodedtrain[train_index]
    ytrainfold = ytrain.iloc[train_index].to_numpy()
    xvalfold = xencodedtrain[val_index]
    yvalfold = ytrain.iloc[val_index].to_numpy()

    rows_to_delete = [i for i, b in enumerate(xtrainfold[:, -1]) if b==0]
    xtrainfold = np.delete(xtrainfold, rows_to_delete, 0)
    ytrainfold = np.delete(ytrainfold, rows_to_delete, 0)
    xtrainfold = np.delete(xtrainfold, -1, 1) # drop functioning day column
    xtrainfold = np.delete(xtrainfold, -2, 1) # drop dew point temp column
    
    rows_to_set_zero = [i for i, b in enumerate(xvalfold[:, -1]) if b==0]
    xvalfold = np.delete(xvalfold, -1, 1) # drop functioning day column
    xvalfold = np.delete(xvalfold, -2, 1) # drop dew point temp column

    # print(xtrainfold.shape)

    model = nn.Sequential(
        nn.Dropout(.2),
        nn.Linear(82, 4096),
        nn.ReLU(),
        nn.Dropout(.2),
        nn.Linear(4096, 2048),
        nn.ReLU(),
        nn.Dropout(.2),
        nn.Linear(2048, 1)
    ).to(device)

    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)

    train_mse = []
    eval_mse = []
    eval_r2 = []

    for epoch in range(5000):
        y_pred = model(torch.Tensor(xtrainfold).to(device))

        loss = criterion(y_pred, torch.Tensor(ytrainfold).to(device))
        
        optimizer.zero_grad()
        
        loss.backward()
        
        optimizer.step()
        train_mse.append(loss.item())
        
        with torch.no_grad():
            y_pred = model(torch.Tensor(xvalfold).to(device))
            loss = criterion(y_pred, torch.Tensor(yvalfold).to(device))
            eval_mse.append(loss.item())
            r2 = r2_score(yvalfold, np.array(y_pred.cpu()))
            eval_r2.append(r2)

        print(f"epoch: {epoch}, train_mse: {train_mse[-1]}, eval_mse: {eval_mse[-1]}, eval_r2: {r2}", end='\r')

    print("Fold Results:", min(train_mse), min(eval_mse), max(eval_r2))

np.mean(r2_scores), np.std(r2_scores)

Sequential(
  (0): Dropout(p=0.2, inplace=False)
  (1): Linear(in_features=82, out_features=4096, bias=True)
  (2): ReLU()
  (3): Dropout(p=0.2, inplace=False)
  (4): Linear(in_features=4096, out_features=2048, bias=True)
  (5): ReLU()
  (6): Dropout(p=0.2, inplace=False)
  (7): Linear(in_features=2048, out_features=1, bias=True)
)
Sequential(, train_mse: 23462.6875, eval_mse: 100675.0, eval_r2: 0.759611572733347795314296255
  (0): Dropout(p=0.2, inplace=False)
  (1): Linear(in_features=82, out_features=4096, bias=True)
  (2): ReLU()
  (3): Dropout(p=0.2, inplace=False)
  (4): Linear(in_features=4096, out_features=2048, bias=True)
  (5): ReLU()
  (6): Dropout(p=0.2, inplace=False)
  (7): Linear(in_features=2048, out_features=1, bias=True)
)
epoch: 2763, train_mse: 26961.076171875, eval_mse: 96526.015625, eval_r2: 0.772551608415573214

KeyboardInterrupt: 