In [49]:
import numpy    as np
from numpy.testing._private.utils import decorate_methods
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl
import time

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config

from sklearn.tree          import DecisionTreeRegressor
from sklearn.ensemble      import RandomForestRegressor
from sklearn.ensemble      import ExtraTreesRegressor
from sklearn.ensemble      import AdaBoostRegressor
from sklearn.ensemble      import GradientBoostingRegressor
from xgboost               import XGBRegressor
from lightgbm              import LGBMRegressor
from catboost              import CatBoostRegressor
from sklearn.linear_model import LogisticRegression

In [50]:
data = pd.read_csv(r"data/london_merged.csv")
np.random.seed(0)
data.head()

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0


In [51]:
data['year'] = data['timestamp'].apply(lambda row: row[:4])
data['month'] = data['timestamp'].apply(lambda row: row.split('-')[2][:2])
data['hour'] = data['timestamp'].apply(lambda row: row.split(':')[0][-2:])

In [52]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17414 entries, 0 to 17413
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   timestamp     17414 non-null  object 
 1   cnt           17414 non-null  int64  
 2   t1            17414 non-null  float64
 3   t2            17414 non-null  float64
 4   hum           17414 non-null  float64
 5   wind_speed    17414 non-null  float64
 6   weather_code  17414 non-null  float64
 7   is_holiday    17414 non-null  float64
 8   is_weekend    17414 non-null  float64
 9   season        17414 non-null  float64
 10  year          17414 non-null  object 
 11  month         17414 non-null  object 
 12  hour          17414 non-null  object 
dtypes: float64(8), int64(1), object(4)
memory usage: 1.7+ MB


In [53]:
# print(target.isnull().sum())
# print(data.isnull().sum())

In [54]:
data.drop('timestamp', axis=1, inplace=True)


In [55]:
def data_enhancement(data):
    gen_data = data

    for season in data['season'].unique():

        seasonal_data = gen_data[gen_data['season'] == season]
        hum_std = seasonal_data['hum'].std()
        wind_speed_std = seasonal_data['wind_speed'].std()
        t1_std = seasonal_data['t1'].std()
        t2_std = seasonal_data['t2'].std()
        
        
        for i in gen_data[gen_data['season']==season].index:
            if np.random.randint(2) == 1:
                gen_data['hum'].values[i] += hum_std/10
            else:
                gen_data['hum'].values[i] -= hum_std/10
            if np.random.randint(2) == 1:
                gen_data['wind_speed'].values[i] += wind_speed_std/10
            else:
                gen_data['wind_speed'].values[i] -= wind_speed_std/10

            if np.random.randint(2) == 1:
                gen_data['t1'].values[i] += t1_std/10
            else:
                gen_data['t1'].values[i] -= t1_std/10
            if np.random.randint(2) == 1:
                gen_data['t2'].values[i] += t2_std/10
            else:
                gen_data['t2'].values[i] -= t2_std/10     

    return gen_data

print(data.head(4))
gen = data_enhancement(data)
print(gen.head(5))
print(gen.shape)



   cnt   t1   t2    hum  wind_speed  weather_code  is_holiday  is_weekend  \
0  182  3.0  2.0   93.0         6.0           3.0         0.0         1.0   
1  138  3.0  2.5   93.0         5.0           1.0         0.0         1.0   
2  134  2.5  2.5   96.5         0.0           1.0         0.0         1.0   
3   72  2.0  2.0  100.0         0.0           1.0         0.0         1.0   

   season  year month hour  
0     3.0  2015    04   00  
1     3.0  2015    04   01  
2     3.0  2015    04   02  
3     3.0  2015    04   03  
   cnt        t1       t2        hum  wind_speed  weather_code  is_holiday  \
0  182  3.379372  1.51169  91.910483    6.890895           3.0         0.0   
1  138  3.379372  2.98831  94.089517    5.890895           1.0         0.0   
2  134  2.879372  2.01169  97.589517    0.890895           1.0         0.0   
3   72  1.620628  1.51169  98.910483    0.890895           1.0         0.0   
4   47  1.620628  0.48831  91.910483    5.609105           1.0         0.0   



In [56]:
y = data['cnt']
x = data.drop(['cnt'], axis=1)

In [57]:
cat_vars = ['season','is_weekend','is_holiday','year','month','weather_code']
num_vars = ['t1','t2','hum','wind_speed']

In [58]:
x_train, x_val, y_train, y_val = model_selection.train_test_split(x,y, test_size=0.2, random_state=0)

In [59]:
extra_sample = gen.sample(gen.shape[0] // 3)
x_train = pd.concat([x_train, extra_sample.drop(['cnt'], axis=1 ) ])
y_train = pd.concat([y_train, extra_sample['cnt'] ])

In [60]:
x_train.shape, y_train.shape

((19735, 11), (19735,))

In [61]:
#preproceesing
transformer = preprocessing.PowerTransformer()
y_train = transformer.fit_transform(y_train.values.reshape(-1,1))
y_val = transformer.transform(y_val.values.reshape(-1,1))

In [62]:
#impute values
cat_4_treeModel = pipeline.Pipeline(steps=[('impute', impute.SimpleImputer(strategy='constant', fill_value='missing')),
                                           ('ordinal', preprocessing.OrdinalEncoder())])

num_4_treeModel = pipeline.Pipeline(steps=[('impute', impute.SimpleImputer(strategy='constant', fill_value=-9999))])                                            

In [63]:
tree_prepo = compose.ColumnTransformer(transformers=[('num', num_4_treeModel, num_vars),
                                        ('cat', cat_4_treeModel, cat_vars)], remainder='drop')

In [64]:
tree_classifiers = {  "Decision Tree": DecisionTreeRegressor(),
                    "Extra Trees":   ExtraTreesRegressor(n_estimators=100),
                    "Random Forest": RandomForestRegressor(n_estimators=100),
                    "AdaBoost":      AdaBoostRegressor(n_estimators=100),
                    "Skl GBM":       GradientBoostingRegressor(n_estimators=100),
                    "XGBoost":       XGBRegressor(n_estimators=100),
                    "LightGBM":      LGBMRegressor(n_estimators=100),
                    "CatBoost":      CatBoostRegressor(n_estimators=100),
                   
                    }

In [65]:
tree_classifiers = {name: pipeline.make_pipeline(tree_prepo, model) for name, model in classifiers.items()}

In [70]:
rang = abs(y_train.max())+ abs(y_train.min())
results = pd.DataFrame({'Model': [], 'AC':[], 'MSE': [], 'MAB': [], " % error": [], 'Time': []})

for model_name, model in tree_classifiers.items():
    start_time = time.time()
    model.fit(x_train,y_train)
    total_time = time.time()-start_time
    pred = model.predict(x_val)
    results = results.append({"Model":    model_name,
                            # "AC": metrics.accuracy_score(y_val, pred)*100,
                            "MSE": metrics.mean_squared_error(y_val, pred),
                            "MAB": metrics.mean_absolute_error(y_val, pred),
                            " % error": metrics.mean_squared_error(y_val, pred) / rang,
                            "Time":     total_time},
                            ignore_index=True)

results_ord = results.sort_values(by=['MSE'], ascending=True, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['MSE', 'MAB'], vmin=0, vmax=100, color='#5fba7d')                            

  results = results.append({"Model":    model_name,
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  results = results.append({"Model":    model_name,
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  results = results.append({"Model":    model_name,
  return f(*args, **kwargs)
  results = results.append({"Model":    model_name,
  return f(*args, **kwargs)
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  return f(*args, **kwargs)
  results = results.append({"Model":    model_name,


Learning rate set to 0.381903
0:	learn: 0.8981769	total: 4.57ms	remaining: 452ms
1:	learn: 0.8524330	total: 9.82ms	remaining: 481ms
2:	learn: 0.8285188	total: 13.7ms	remaining: 444ms
3:	learn: 0.8130659	total: 17.6ms	remaining: 423ms
4:	learn: 0.8025043	total: 22.2ms	remaining: 421ms
5:	learn: 0.7954566	total: 28.5ms	remaining: 447ms
6:	learn: 0.7882785	total: 35.5ms	remaining: 471ms
7:	learn: 0.7864546	total: 41.3ms	remaining: 474ms
8:	learn: 0.7827960	total: 46.9ms	remaining: 474ms
9:	learn: 0.7810627	total: 50.8ms	remaining: 457ms
10:	learn: 0.7778876	total: 55.2ms	remaining: 446ms
11:	learn: 0.7758583	total: 62.1ms	remaining: 456ms
12:	learn: 0.7742224	total: 66.3ms	remaining: 444ms
13:	learn: 0.7728942	total: 70.6ms	remaining: 434ms
14:	learn: 0.7720692	total: 78.2ms	remaining: 443ms
15:	learn: 0.7707991	total: 82.6ms	remaining: 434ms
16:	learn: 0.7687112	total: 87.3ms	remaining: 426ms
17:	learn: 0.7681782	total: 92.6ms	remaining: 422ms
18:	learn: 0.7671501	total: 96.6ms	remaining

  results = results.append({"Model":    model_name,


Unnamed: 0,Model,AC,MSE,MAB,% error,Time
1,Extra Trees,,0.383524,0.371588,0.069469,4.202806
2,Random Forest,,0.425495,0.45448,0.077072,6.306145
3,XGBoost,,0.532979,0.554602,0.09654,0.652257
4,LightGBM,,0.563719,0.577875,0.102109,0.247339
5,CatBoost,,0.577734,0.583242,0.104647,0.950517
6,Skl GBM,,0.625755,0.614863,0.113345,2.001641
7,AdaBoost,,0.695454,0.664843,0.12597,0.524561
8,Decision Tree,,0.747092,0.519064,0.135324,0.203455


In [77]:
best_model = tree_classifiers['Extra Trees']
best_model.fit(x_train,y_train)
predic = best_model.predict(x_val)


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [78]:
import joblib
joblib.dump(best_model, 'model_joblib')

['model_joblib']

In [85]:
mp = joblib.load('model_joblib')
mp.predict(x_val)


array([-1.39971744,  0.78062771, -0.4544557 , ..., -0.83900744,
        0.37616379, -0.49983833])