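<p style="font-family:newtimeroman;font-size:150%;">Let's keep the prediction simple: a few time and direction features, per-slot congestion statistics, and an equal-weight blend of LightGBM and CatBoost.</p>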


# <p style="background-color:#f0f7f8;font-family:newtimeroman;color:#18cccc;font-size:150%;text-align:center;border-radius:30px;">Load Libraries and Data</p>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

from sklearn.linear_model import LinearRegression as lr
from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.metrics import mean_absolute_error

from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb

from tqdm import tqdm
import warnings
warnings.simplefilter('ignore')

sns.set_style("darkgrid")

%matplotlib inline

In [2]:
# Training data: indexed by `row_id`, with `time` parsed as datetimes.
train = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/train.csv',
    index_col='row_id',
    parse_dates=['time'],
)
train['time'] = pd.to_datetime(train['time'])

# Test data keeps `row_id` as a regular column (it is dropped before prediction).
test = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/test.csv',
    parse_dates=['time'],
)
test['time'] = pd.to_datetime(test['time'])

# Submission template whose `congestion` column is filled in at the end.
submission = pd.read_csv('../input/tabular-playground-series-mar-2022/sample_submission.csv')

# Quick look at the first rows of the training frame.
train.head(3)

In [27]:
# Row counts of the training and test frames, one per line.
print(len(train), len(test), sep='\n')

In [5]:
# Distribution of the target variable.
sns.displot(data=train['congestion'])

# <p style="background-color:#f0f7f8;font-family:newtimeroman;color:#18cccc;font-size:150%;text-align:center;border-radius:30px;">Feature Engineering</p>

In [3]:
# Calendar dates to exclude from training (US Memorial Day, Independence Day
# and Labor Day 1991).
holiday = ['1991-05-27','1991-07-04','1991-09-02']

# `time` holds intraday timestamps (hour and minute are extracted from it
# later), so a direct membership test against date-only strings can match at
# most the midnight rows. Compare on the calendar date instead so that every
# row of a holiday is removed.
holiday_dates = pd.to_datetime(holiday)
is_holiday = train['time'].dt.normalize().isin(holiday_dates)

# Build a fresh, re-indexed frame: later cells rely on a 0..n-1 index.
train = train.loc[~is_holiday].reset_index(drop=True)
train.tail()

In [5]:
# Each heading as a pair of signed unit steps:
# first component is east (+1) / west (-1), second is north (+1) / south (-1).
dir_map = {
    'EB': [1, 0],
    'NB': [0, 1],
    'SB': [0, -1],
    'WB': [-1, 0],
    'NE': [1, 1],
    'SW': [-1, -1],
    'NW': [-1, 1],
    'SE': [1, -1],
}

# Add calendar and direction features to both frames in place, then discard
# the raw timestamp column.
for df in (train, test):
    timestamp = df['time']
    heading = df['direction']

    df['hour'] = timestamp.dt.hour
    df['minute'] = timestamp.dt.minute
    df['weekday'] = timestamp.dt.weekday

    # An unknown heading raises KeyError rather than silently producing NaN.
    df['direction0'] = heading.map(lambda code: dir_map[code][0])
    df['direction1'] = heading.map(lambda code: dir_map[code][1])

    df.drop(columns=['time'], inplace=True)

In [6]:
# Historical congestion statistics per (time-of-day, location, direction) slot,
# computed on the training data and truncated to integers.
group_keys = ['hour', 'minute', 'x', 'y', 'direction']
merge_keys = ['x', 'y', 'direction', 'hour', 'minute']
congestion_by_slot = train.groupby(group_keys)['congestion']

medians = congestion_by_slot.median().astype(int).rename('median').reset_index()
stds = congestion_by_slot.std().astype(int).rename('std').reset_index()

# Attach both statistics to train and test; left joins keep every original row.
for slot_stat in (medians, stds):
    train = train.merge(slot_stat, how='left', on=merge_keys)
    test = test.merge(slot_stat, how='left', on=merge_keys)

In [7]:
# Rescale the slot statistics with min/max learned from the training frame
# only, and apply the same transformation to the test frame.
stat_cols = ['median', 'std']
mm = MinMaxScaler().fit(train[stat_cols])
train[stat_cols] = mm.transform(train[stat_cols])
test[stat_cols] = mm.transform(test[stat_cols])

In [10]:
# Split off the target: `pop` returns the column and removes it from `train`
# in place, leaving only feature columns behind.
train_y = train.pop('congestion')

# <p style="background-color:#f0f7f8;font-family:newtimeroman;color:#18cccc;font-size:150%;text-align:center;border-radius:30px;">Modeling</p>

In [11]:

folds = KFold(n_splits = 10, shuffle = True, random_state = 123)
rms1,rms2 = [],[]
models1,models2 = [], []
for n_fold, (trn_idx, val_idx) in tqdm(enumerate(folds.split(train))):
    x_train, y_train = train.iloc[trn_idx], train_y[trn_idx] 
    x_val, y_val = train.iloc[val_idx], train_y[val_idx]
    
    lgb_param = {'boosting_type':'gbdt',
            'learning_rate': 0.1, 
            'bagging_fraction' : 0.85,
            'bagging_freq': 20,
            'colsample_bytree': 0.85,
             'metric': 'rmse',
            'min_child_weight': 0.01,
             'zero_as_missing': True,
            'objective': 'regression',
            'device' : 'gpu',
            'random_state': 0
            }
    
    param_cat = {
        'loss_function' : 'MAPE',
        'task_type' : 'GPU', 
        'grow_policy' : 'SymmetricTree',
        'learning_rate': 0.1,
        'l2_leaf_reg' : 0.2,
        'random_state': 0
     }
    train_set1 = lgb.Dataset(x_train, y_train, silent=False, params={'verbose': -1})
    valid_set1 = lgb.Dataset(x_val, y_val, silent=False, params={'verbose': -1})
    lgb_model = lgb.train(params = lgb_param, train_set = train_set1 , num_boost_round=5000, early_stopping_rounds=100,verbose_eval=500, valid_sets=valid_set1)
    ct = CatBoostRegressor(verbose=False, eval_metric='MAPE', **param_cat)
    ct.fit(x_train, y_train, eval_set=(x_val,y_val), cat_features= ['direction'])
    print(n_fold)
    models1.append(lgb_model)
    models2.append(ct)

In [13]:
preds = []
for model in models1:
    pred = model.predict(test.drop('row_id',axis=1))
    preds.append(pred)
model1_pred = np.mean(preds, axis=0)
preds = []
for model in models2:
    pred = model.predict(test.drop('row_id',axis=1))
    preds.append(pred)
model2_pred = np.mean(preds, axis=0)

In [14]:
# Equal-weight blend of the LightGBM and CatBoost fold averages, written out
# in the sample-submission layout.
blend_weights = (0.5, 0.5)
submission['congestion'] = blend_weights[0] * model1_pred + blend_weights[1] * model2_pred
submission.to_csv('submission.csv', index=False)