In [1]:
import pandas as pd

path = '../data/processed_data.csv'
data = pd.read_csv(path)
data = data.sort_values(by=["temp", "rain_1h", "snow_1h", 'clouds_all'])
data.head()

Unnamed: 0,traffic_volume,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,year,month,day,hour,day_in_week,is_weekend
0,1462,No,243.39,No rain,No snow,Clear,Haze,haze,2016,12,18,8,6,0
1,1037,No,243.62,No rain,No snow,Clear,Haze,haze,2016,12,18,7,6,0
2,800,No,244.22,No rain,No snow,Clear,Clear,sky is clear,2016,12,18,6,6,0
3,354,No,244.82,No rain,No snow,Party Cloudy,Clouds,few clouds,2013,2,2,3,5,0
4,417,No,244.82,No rain,No snow,Party Cloudy,Clouds,few clouds,2013,2,2,4,5,0


In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48193 entries, 0 to 48192
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   traffic_volume       48193 non-null  int64  
 1   holiday              48193 non-null  object 
 2   temp                 48193 non-null  float64
 3   rain_1h              48193 non-null  object 
 4   snow_1h              48193 non-null  object 
 5   clouds_all           48193 non-null  object 
 6   weather_main         48193 non-null  object 
 7   weather_description  48193 non-null  object 
 8   year                 48193 non-null  int64  
 9   month                48193 non-null  int64  
 10  day                  48193 non-null  int64  
 11  hour                 48193 non-null  int64  
 12  day_in_week          48193 non-null  int64  
 13  is_weekend           48193 non-null  int64  
dtypes: float64(1), int64(7), object(6)
memory usage: 5.5+ MB


### split data by condition

In [2]:
def split_dataset(data):
  """
    split data based on snow_1h and rain_1h columns.
    
    Steps:
    1. 90% of rows where 'snow_1h' == 'Have snow' go to train, 10% to test.
    2. from the remaining training data, 90% of rows where 'rain_1h' == 'Have rain' stay in train, 10% move to test.
    3. split continue in train dataset
    
  """
  # define snow for handle imbalance
  have_snow = data[data['snow_1h'] == 'Have snow']
  no_snow = data[data['snow_1h'] != 'Have snow']
  
  # split by snow
  snow_train = have_snow.sample(frac=0.9, random_state=42) 
  snow_test = have_snow.drop(snow_train.index)
  
  train_data = pd.concat([no_snow, snow_train])
  test_data = snow_test
  
  # define value in rain for handle imbalance
  have_rain = data[data['rain_1h'] == 'Have rain']
  no_rain = data[data['rain_1h'] == 'No rain']
  
  # continue slit with rain
  rain_train = have_rain.sample(frac=0.9, random_state=42) 
  rain_test = have_rain.drop(rain_train.index)  
  
  # train & test after split by columns
  train_data = pd.concat([no_rain, rain_train])
  test_data = pd.concat([test_data, rain_test])
  
  # Split in train set to get enough value for testing
  final_train = train_data.sample(frac=0.8, random_state=42)  
  validation = train_data.drop(final_train.index)  
  test_data = pd.concat([test_data, validation])

  return final_train, test_data

In [3]:
train_data, test_data = split_dataset(data)
print(f"train dataset length: ", len(train_data))
print(f"test data length: ", len(test_data))

train dataset length:  38277
test data length:  9922


### encoding

In [4]:
from sklearn.preprocessing import LabelEncoder

def label_encoding(df):
  # include only category columns
  category_column = df.select_dtypes(include=['object']).columns.tolist()
  
  encoded_df = df.copy()
  for col in category_column:
    encoded_df[col] = encoded_df[col].astype(str)
    
    label_encode = LabelEncoder()
    encoded_df[col] = label_encode.fit_transform(encoded_df[col])

  return encoded_df

In [5]:
train_data = label_encoding(train_data)
test_data = label_encoding(test_data)

### Prepare model

In [14]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

def gradient_boosting_train(train_data, target_column, feature_columns, n_splits=10, scoring='neg_root_mean_squared_error'):
    param_grid = {
      'learning_rate': [0.05, 0.01, 0.1],
      'min_samples_split':[6, 10, 15],
      'max_depth': [2, 3, 4],
      'min_samples_leaf': [5, 10, 20],
      'subsample': [0.6, 0.8, 1.0],
      'n_estimators': [100, 200, 300]
    }
    X_train = train_data[feature_columns]
    y_train = train_data[target_column]
    gbr = GradientBoostingRegressor(random_state=42, n_iter_no_change=10, validation_fraction=0.2)
    tscv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # print log data for debug
    def print_progress(cv_results, n_iter):
        print(f"Iteration {n_iter}/{len(cv_results['mean_test_score'])}:")
        print(f"Params: {cv_results['params'][n_iter - 1]}")
        print(f"Mean Test Score: {cv_results['mean_test_score'][n_iter - 1]}")
        print(f"Std Test Score: {cv_results['std_test_score'][n_iter - 1]}")
        print("-" * 40)
    
    grid_search = GridSearchCV(estimator=gbr, param_grid=param_grid, cv=tscv, scoring=scoring, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    
    # Log parameter results during fitting
    for i in range(len(grid_search.cv_results_['params'])):
        print_progress(grid_search.cv_results_, i + 1)
    
    # get best params + estimators
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    
    # calculate r2
    y_train_pred = best_model.predict(X_train)
    r2_train = r2_score(y_train, y_train_pred)
    
    return best_model, best_params, r2_train

### evaluation

In [8]:
from sklearn.metrics import mean_squared_error

def evaluate_model(model, test_data, target_column, feature_columns):
    X_test = test_data[feature_columns]
    y_test = test_data[target_column]
    y_pred = model.predict(X_test)

    rmse = mean_squared_error(y_test, y_pred, squared=False)

    return rmse, f"The RMSE of the {model} is {rmse}"

### model dumping function

In [9]:
import joblib
def save_file(scaler, filename):
    joblib.dump(scaler, filename)
    return f"Saving {filename} successfully executed"

### train model

In [11]:
feature_columns = [col for col in train_data.columns if col != "traffic_volume"]
feature_columns

['holiday',
 'temp',
 'rain_1h',
 'snow_1h',
 'clouds_all',
 'weather_main',
 'weather_description',
 'year',
 'month',
 'day',
 'hour',
 'day_in_week',
 'is_weekend']

In [12]:
best_model, best_params, r2_train = gradient_boosting_train(train_data, target_column="traffic_volume", feature_columns=feature_columns)
# evaluation
rmse, evaluation_message = evaluate_model(best_model, test_data, target_column="traffic_volume", feature_columns=feature_columns)

print(f"\n Best Hyperparameters: {best_params}"),
print(f"\n R2 on Train-set: {r2_train}"),
print(f"\n Root Mean Squared Error (RMSE) on Test Set: {rmse}")
print(evaluation_message)

Fitting 5 folds for each of 972 candidates, totalling 4860 fits

 Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 6, 'min_samples_leaf': 7, 'min_samples_split': 3, 'n_estimators': 300, 'subsample': 0.8}

 R2 on Train-set: 0.9858321072918674

 Root Mean Squared Error (RMSE) on Test Set: (398.9927449378183, 'The RMSE of the GradientBoostingRegressor(learning_rate=0.2, max_depth=6, min_samples_leaf=7,\n                          min_samples_split=3, n_estimators=300,\n                          random_state=42, subsample=0.8) is 398.9927449378183')




In [13]:
path = './saved_feature/gbr.joblib'
save_file(best_model, path)

'Saving ./saved_feature/gbr.joblib successfully executed'