In [1]:
import pandas as pd

path = '../data/processed_data.csv'
data = pd.read_csv(path)
data.head()

Unnamed: 0,traffic_volume,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,time,date
0,1462,No,243.39,No rain,No snow,Clear,Haze,haze,08:00,18-12-2016
1,1037,No,243.62,No rain,No snow,Clear,Haze,haze,07:00,18-12-2016
2,800,No,244.22,No rain,No snow,Clear,Clear,sky is clear,06:00,18-12-2016
3,354,No,244.82,No rain,No snow,Party Cloudy,Clouds,few clouds,03:00,02-02-2013
4,417,No,244.82,No rain,No snow,Party Cloudy,Clouds,few clouds,04:00,02-02-2013


In [2]:
have_snow = data[data['snow_1h'] == 'Have snow']
no_snow = data[data['snow_1h'] != 'Have snow']

snow_train = have_snow.sample(frac=0.9, random_state=42) 
snow_test = have_snow.drop(snow_train.index)

print(snow_test.count())
print(snow_train.count())

traffic_volume         6
holiday                6
temp                   6
rain_1h                6
snow_1h                6
clouds_all             6
weather_main           6
weather_description    6
time                   6
date                   6
dtype: int64
traffic_volume         57
holiday                57
temp                   57
rain_1h                57
snow_1h                57
clouds_all             57
weather_main           57
weather_description    57
time                   57
date                   57
dtype: int64


### split data by condition

In [3]:
def split_dataset(data):
  """
    split data based on snow_1h and rain_1h columns.
    
    Steps:
    1. 90% of rows where 'snow_1h' == 'Have snow' go to train, 10% to test.
    2. from the remaining training data, 90% of rows where 'rain_1h' == 'Have rain' stay in train, 10% move to test.
    3. split continue in train dataset
    
  """
  # define snow for handle imbalance
  have_snow = data[data['snow_1h'] == 'Have snow']
  no_snow = data[data['snow_1h'] != 'Have snow']
  
  # split by snow
  snow_train = have_snow.sample(frac=0.9, random_state=42) 
  snow_test = have_snow.drop(snow_train.index)
  
  train_data = pd.concat([no_snow, snow_train])
  test_data = snow_test
  
  # define value in rain for handle imbalance
  have_rain = data[data['rain_1h'] == 'Have rain']
  no_rain = data[data['rain_1h'] == 'No rain']
  
  # continue slit with rain
  rain_train = have_rain.sample(frac=0.9, random_state=42) 
  rain_test = have_rain.drop(rain_train.index)  
  
  # train & test after split by columns
  train_data = pd.concat([no_rain, rain_train])
  test_data = pd.concat([test_data, rain_test])
  
  # Split in train set to get enough value for testing
  final_train = train_data.sample(frac=0.8, random_state=42)  
  validation = train_data.drop(final_train.index)  
  test_data = pd.concat([test_data, validation])

  return final_train, test_data

In [4]:
train_data, test_data = split_dataset(data)
print(f"train dataset length: ", len(train_data))
print(f"test data length: ", len(test_data))

train dataset length:  38277
test data length:  9922


### Prepare model

In [10]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

def gradient_boosting_train(train_data, target_column, feature_columns, n_splits=5, scoring='neg_root_mean_squared_error'):
    param_grid = {
      'learning_rate': [0.05, 0.2, 0.1],
      'min_samples_split':[3, 5, 6],
      'max_depth': [4, 5, 6],
      'min_samples_leaf': [1, 3, 5, 7],
      'subsample': [0.6, 0.8, 1.0],
      'n_estimators': [100, 200, 300]
    }
    X_train = train_data[feature_columns]
    y_train = train_data[target_column]
    gbr = GradientBoostingRegressor(random_state=42)
    tscv = TimeSeriesSplit(n_splits=n_splits)
    
    grid_search = GridSearchCV(estimator=gbr, param_grid=param_grid, cv=tscv, scoring=scoring, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    
    # get best params + estimators
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    
    # calculate r2
    y_train_pred = best_model.predict(X_train)
    r2_train = r2_score(y_train, y_train_pred)
    
    return best_model, best_params, r2_train

### evaluation

In [11]:
from sklearn.metrics import mean_squared_error

def evaluate_model(model, test_data, target_column, feature_columns):
    X_test = test_data[feature_columns]
    y_test = test_data[target_column]
    y_pred = model.predict(X_test)

    rmse = mean_squared_error(y_test, y_pred, squared=False)

    return rmse, f"The RMSE of the {model} is {rmse}"

### model dumping function

In [7]:
import joblib
def save_file(scaler, filename):
    joblib.dump(scaler, filename)
    return f"Saving {filename} successfully executed"

### train model

In [8]:
feature_columns = [col for col in train_data.columns if col != "traffic_volume"]

In [12]:
best_model, best_params, r2_train = gradient_boosting_train(train_data, target_column="traffic_volume", feature_columns=feature_columns)
# evaluation
rmse = evaluate_model(best_model, test_data, target_column="traffic_volume", feature_columns=feature_columns)

print(f"\n Best Hyperparameters: {best_params}"),
print(f"\n R2 on Train-set: {r2_train}"),
print(f"\n Root Mean Squared Error (RMSE) on Test Set: {rmse}")

Fitting 5 folds for each of 972 candidates, totalling 4860 fits


ValueError: 
All the 4860 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4860 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Probability\envs\deployment\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Probability\envs\deployment\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Probability\envs\deployment\Lib\site-packages\sklearn\ensemble\_gb.py", line 659, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "d:\Probability\envs\deployment\Lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Probability\envs\deployment\Lib\site-packages\sklearn\utils\validation.py", line 1263, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "d:\Probability\envs\deployment\Lib\site-packages\sklearn\utils\validation.py", line 997, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Probability\envs\deployment\Lib\site-packages\sklearn\utils\_array_api.py", line 521, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Probability\envs\deployment\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'No'
