In [149]:
import numpy    as np
from numpy.testing._private.utils import decorate_methods
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl
import time

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config

from sklearn.tree          import DecisionTreeRegressor
from sklearn.ensemble      import RandomForestRegressor
from sklearn.ensemble      import ExtraTreesRegressor
from sklearn.ensemble      import AdaBoostRegressor
from sklearn.ensemble      import GradientBoostingRegressor
from xgboost               import XGBRegressor
from lightgbm              import LGBMRegressor
from catboost              import CatBoostRegressor

In [150]:
data = pd.read_csv(r"data/london_merged.csv")
np.random.seed(0)
data.head()

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0


In [151]:
data['year'] = data['timestamp'].apply(lambda row: row[:4])
data['month'] = data['timestamp'].apply(lambda row: row.split('-')[2][:2])
data['hour'] = data['timestamp'].apply(lambda row: row.split(':')[0][-2:])

In [152]:
# print(target.isnull().sum())
# print(data.isnull().sum())

In [153]:
data.drop('timestamp', axis=1, inplace=True)


In [154]:
def data_enhancement(data):
    gen_data = data

    for season in data['season'].unique():

        seasonal_data = gen_data[gen_data['season'] == season]
        hum_std = seasonal_data['hum'].std()
        wind_speed_std = seasonal_data['wind_speed'].std()
        t1_std = seasonal_data['t1'].std()
        t2_std = seasonal_data['t2'].std()
        
        
        for i in gen_data[gen_data['season']==season].index:
            if np.random.randint(2) == 1:
                gen_data['hum'].values[i] += hum_std/10
            else:
                gen_data['hum'].values[i] -= hum_std/10
            if np.random.randint(2) == 1:
                gen_data['wind_speed'].values[i] += wind_speed_std/10
            else:
                gen_data['wind_speed'].values[i] -= wind_speed_std/10

            if np.random.randint(2) == 1:
                gen_data['t1'].values[i] += t1_std/10
            else:
                gen_data['t1'].values[i] -= t1_std/10
            if np.random.randint(2) == 1:
                gen_data['t2'].values[i] += t2_std/10
            else:
                gen_data['t2'].values[i] -= t2_std/10     

    return gen_data

print(data.head(4))
gen = data_enhancement(data)
print(gen.head(5))
print(gen.shape)



   cnt   t1   t2    hum  wind_speed  weather_code  is_holiday  is_weekend  \
0  182  3.0  2.0   93.0         6.0           3.0         0.0         1.0   
1  138  3.0  2.5   93.0         5.0           1.0         0.0         1.0   
2  134  2.5  2.5   96.5         0.0           1.0         0.0         1.0   
3   72  2.0  2.0  100.0         0.0           1.0         0.0         1.0   

   season  year month hour  
0     3.0  2015    04   00  
1     3.0  2015    04   01  
2     3.0  2015    04   02  
3     3.0  2015    04   03  
   cnt        t1       t2        hum  wind_speed  weather_code  is_holiday  \
0  182  3.379372  1.51169  91.910483    6.890895           3.0         0.0   
1  138  3.379372  2.98831  94.089517    5.890895           1.0         0.0   
2  134  2.879372  2.01169  97.589517    0.890895           1.0         0.0   
3   72  1.620628  1.51169  98.910483    0.890895           1.0         0.0   
4   47  1.620628  0.48831  91.910483    5.609105           1.0         0.0   



In [155]:
y = data['cnt']
x = data.drop(['cnt'], axis=1)

In [156]:
cat_vars = ['season','is_weekend','is_holiday','year','month','weather_code']
num_vars = ['t1','t2','hum','wind_speed']

In [157]:
x_train, x_val, y_train, y_val = model_selection.train_test_split(x,y, test_size=0.2, random_state=0)

In [158]:
extra_sample = gen.sample(gen.shape[0] // 3)
x_train = pd.concat([x_train, extra_sample.drop(['cnt'], axis=1 ) ])
y_train = pd.concat([y_train, extra_sample['cnt'] ])

In [159]:
x_train

Unnamed: 0,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season,year,month,hour
4493,25.638042,25.362627,23.487086,18.673385,1.0,0.0,0.0,1.0,2015,10,16
14407,16.861958,16.862627,67.512914,5.673385,1.0,0.0,0.0,1.0,2016,30,02
7862,11.429300,11.506144,70.778727,26.235845,7.0,0.0,1.0,2.0,2015,29,06
2627,11.901738,10.989901,67.977094,17.788771,1.0,0.0,0.0,0.0,2015,23,19
1764,8.598262,6.510099,67.522906,17.211229,2.0,0.0,0.0,0.0,2015,18,14
...,...,...,...,...,...,...,...,...,...,...,...
11029,10.098262,9.510099,52.022906,22.211229,1.0,0.0,1.0,0.0,2016,10,10
7647,9.070700,6.993856,70.221273,16.764155,2.0,0.0,0.0,2.0,2015,20,07
3182,14.401738,14.510099,47.977094,16.788771,1.0,0.0,1.0,0.0,2015,16,22
11044,9.598262,8.510099,80.477094,16.788771,2.0,0.0,0.0,0.0,2016,11,01
