In [1]:
'''
Consider the dataset at the link https://www.kaggle.com/competitions/tabular-playground-series-mar-2022  Try the following models by method of log(response):
Linear Regression
ElasticNet Regression
XGBoost
CatBoost
LightGBM

Mention leaderboard scores for each of the five.
'''

'\nConsider the dataset at the link https://www.kaggle.com/competitions/tabular-playground-series-mar-2022  Try the following models by method of log(response):\nLinear Regression\nElasticNet Regression\nXGBoost\nCatBoost\nLightGBM\n\nMention leaderboard scores for each of the five.\n'

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, log_loss, r2_score
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.svm import SVC, SVR
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaseEnsemble, VotingClassifier, GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor, StackingClassifier, StackingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import os

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostRegressor, CatBoostClassifier

import warnings
warnings.filterwarnings('ignore')

In [30]:
os.chdir(r'E:\CDAC\PML\tabular-playground-series-mar-2022')

## Loading DataSet

In [31]:
train = pd.read_csv('train.csv', parse_dates=['time'])
test = pd.read_csv('test.csv', parse_dates=['time'])

print(train.isnull().sum().sum())
print(test.isnull().sum().sum())

0
0


## Parsing the Datetime on Train and Test

In [32]:
train['year']=train['time'].dt.year
train['month']=train['time'].dt.month
train['day']=train['time'].dt.day
train['hour']=train['time'].dt.hour
train["weekday"] = train["time"].dt.weekday

train.head()

Unnamed: 0,row_id,time,x,y,direction,congestion,year,month,day,hour,weekday
0,0,1991-04-01,0,0,EB,70,1991,4,1,0,0
1,1,1991-04-01,0,0,NB,49,1991,4,1,0,0
2,2,1991-04-01,0,0,SB,24,1991,4,1,0,0
3,3,1991-04-01,0,1,EB,18,1991,4,1,0,0
4,4,1991-04-01,0,1,NB,60,1991,4,1,0,0


In [33]:
test['year']=test['time'].dt.year
test['month']=test['time'].dt.month
test['day']=test['time'].dt.day
test['hour']=test['time'].dt.hour
test["weekday"] = test["time"].dt.weekday

test.head()

Unnamed: 0,row_id,time,x,y,direction,year,month,day,hour,weekday
0,848835,1991-09-30 12:00:00,0,0,EB,1991,9,30,12,0
1,848836,1991-09-30 12:00:00,0,0,NB,1991,9,30,12,0
2,848837,1991-09-30 12:00:00,0,0,SB,1991,9,30,12,0
3,848838,1991-09-30 12:00:00,0,1,EB,1991,9,30,12,0
4,848839,1991-09-30 12:00:00,0,1,NB,1991,9,30,12,0


Label Encoder for 'direction' column on train and test

In [34]:
le = LabelEncoder()

In [35]:
direction1 = le.fit_transform(train['direction'])
direction1 = pd.DataFrame(direction1, columns=['direction'])

df1 = pd.concat([train, direction1], axis=1)
df1.head()

Unnamed: 0,row_id,time,x,y,direction,congestion,year,month,day,hour,weekday,direction.1
0,0,1991-04-01,0,0,EB,70,1991,4,1,0,0,0
1,1,1991-04-01,0,0,NB,49,1991,4,1,0,0,1
2,2,1991-04-01,0,0,SB,24,1991,4,1,0,0,4
3,3,1991-04-01,0,1,EB,18,1991,4,1,0,0,0
4,4,1991-04-01,0,1,NB,60,1991,4,1,0,0,1


In [36]:
direction2 = le.fit_transform(test['direction'])
direction2 = pd.DataFrame(direction2, columns=['direction'])

df2 = pd.concat([test, direction2], axis=1)
df2.head()

Unnamed: 0,row_id,time,x,y,direction,year,month,day,hour,weekday,direction.1
0,848835,1991-09-30 12:00:00,0,0,EB,1991,9,30,12,0,0
1,848836,1991-09-30 12:00:00,0,0,NB,1991,9,30,12,0,1
2,848837,1991-09-30 12:00:00,0,0,SB,1991,9,30,12,0,4
3,848838,1991-09-30 12:00:00,0,1,EB,1991,9,30,12,0,0
4,848839,1991-09-30 12:00:00,0,1,NB,1991,9,30,12,0,1


In [37]:
X_train = df1.drop(columns=['row_id', 'congestion', 'time','direction'])
y_train = df1['congestion']
X_test = df2.drop(columns=['row_id','time','direction'])

- Transforming the target variable as log response variable
- As normal log function is giving -infinity at some places, hence we will use log1p as it calculates log(1 + x).


In [38]:
y_train_log = np.log1p(y_train)
y_train_log.unique()

array([4.26267988, 3.91202301, 3.21887582, 2.94443898, 4.11087386,
       4.07753744, 3.29583687, 3.4657359 , 3.8501476 , 3.40119738,
       2.83321334, 3.09104245, 3.87120101, 3.95124372, 4.31748811,
       3.78418963, 3.80666249, 3.97029191, 3.49650756, 4.35670883,
       3.68887945, 4.15888308, 3.98898405, 3.36729583, 4.00733319,
       3.73766962, 3.63758616, 2.77258872, 4.12713439, 4.18965474,
       3.13549422, 3.55534806, 3.8286414 , 3.76120012, 3.93182563,
       3.61091791, 4.17438727, 3.4339872 , 4.02535169, 4.06044301,
       3.71357207, 3.52636052, 2.89037176, 3.8918203 , 4.27666612,
       4.24849524, 2.39789527, 4.21950771, 4.39444915, 4.52178858,
       3.58351894, 4.04305127, 4.09434456, 3.66356165, 4.36944785,
       3.33220451, 4.38202663, 3.04452244, 4.4308168 , 4.14313473,
       3.25809654, 2.99573227, 2.07944154, 4.41884061, 4.33073334,
       4.34380542, 4.4543473 , 4.61512052, 1.09861229, 4.48863637,
       2.56494936, 4.49980967, 4.30406509, 2.48490665, 4.40671

## LinearRegression

In [39]:
lr = LinearRegression()
lr.fit(X_train, y_train_log)

In [40]:
y_pred_lr = np.expm1(lr.predict(X_test))

submit_lr = pd.DataFrame({'row_id':test['row_id'], 'congestion':y_pred_lr})

In [41]:
submit_lr.to_csv("Submit_LR.csv", index=False)

## ElasticNet

In [42]:
eln = ElasticNet()
eln.fit(X_train, y_train_log)

In [43]:
y_pred_eln = np.expm1(eln.predict(X_test))

submit_eln = pd.DataFrame({'row_id':test['row_id'], 'congestion':y_pred_eln})


In [44]:
submit_eln.to_csv("Submit_ElasticNet.csv", index=False)

## XGBoost

In [45]:
xgb = XGBRegressor(random_state=24)
xgb.fit(X_train, y_train_log)

In [46]:
y_pred_xgb = np.expm1(xgb.predict(X_test))

submit_xgb = pd.DataFrame({'row_id':test['row_id'], 'congestion':y_pred_xgb})

In [47]:
submit_xgb.to_csv("Submit_XGBoost.csv", index=False)

## CatBoost

In [48]:
cat = CatBoostRegressor(random_state=24)
cat.fit(X_train, y_train_log)

Learning rate set to 0.118832
0:	learn: 0.4068418	total: 357ms	remaining: 5m 56s
1:	learn: 0.4012297	total: 462ms	remaining: 3m 50s
2:	learn: 0.3968173	total: 568ms	remaining: 3m 8s
3:	learn: 0.3933555	total: 686ms	remaining: 2m 50s
4:	learn: 0.3914293	total: 773ms	remaining: 2m 33s
5:	learn: 0.3897746	total: 869ms	remaining: 2m 23s
6:	learn: 0.3876127	total: 940ms	remaining: 2m 13s
7:	learn: 0.3864197	total: 1.01s	remaining: 2m 5s
8:	learn: 0.3847905	total: 1.1s	remaining: 2m 1s
9:	learn: 0.3834394	total: 1.19s	remaining: 1m 57s
10:	learn: 0.3827186	total: 1.28s	remaining: 1m 55s
11:	learn: 0.3820488	total: 1.38s	remaining: 1m 53s
12:	learn: 0.3812655	total: 1.46s	remaining: 1m 50s
13:	learn: 0.3807760	total: 1.52s	remaining: 1m 47s
14:	learn: 0.3799953	total: 1.6s	remaining: 1m 45s
15:	learn: 0.3794378	total: 1.68s	remaining: 1m 43s
16:	learn: 0.3791155	total: 1.75s	remaining: 1m 41s
17:	learn: 0.3786508	total: 1.83s	remaining: 1m 40s
18:	learn: 0.3783991	total: 1.94s	remaining: 1m 4

<catboost.core.CatBoostRegressor at 0x1fed5c49970>

In [49]:
y_pred_cat = np.expm1(cat.predict(X_test))

submit_cat = pd.DataFrame({'row_id':test['row_id'], 'congestion':y_pred_cat})

In [50]:
submit_cat.to_csv("Submit_CatBoost.csv", index=False)

## LightGBM

In [51]:
lgbm = LGBMRegressor(random_state=24)
lgbm.fit(X_train, y_train_log)

In [52]:
y_pred_lgbm = np.expm1(lgbm.predict(X_test))

submit_lgbm = pd.DataFrame({'row_id':test['row_id'], 'congestion':y_pred_lgbm})


In [53]:
submit_lgbm.to_csv("Submit_LGBM.csv", index=False)