In [1]:
import pandas as pd

path = '../data/processed_data.csv'
data = pd.read_csv(path)
data = data.sort_values(by=["temp", "rain_1h", "snow_1h", 'clouds_all'])
data.head()

Unnamed: 0,traffic_volume,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,year,month,day,hour,day_in_week,is_weekend
0,1462,No,243.39,No rain,No snow,Clear,Mist,2016,12,18,8,6,0
1,1037,No,243.62,No rain,No snow,Clear,Mist,2016,12,18,7,6,0
2,800,No,244.22,No rain,No snow,Clear,Clear,2016,12,18,6,6,0
3,354,No,244.82,No rain,No snow,Party Cloudy,Clouds,2013,2,2,3,5,0
4,417,No,244.82,No rain,No snow,Party Cloudy,Clouds,2013,2,2,4,5,0


In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48193 entries, 0 to 48192
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   traffic_volume  48193 non-null  int64  
 1   holiday         48193 non-null  object 
 2   temp            48193 non-null  float64
 3   rain_1h         48193 non-null  object 
 4   snow_1h         48193 non-null  object 
 5   clouds_all      48193 non-null  object 
 6   weather_main    48193 non-null  object 
 7   year            48193 non-null  int64  
 8   month           48193 non-null  int64  
 9   day             48193 non-null  int64  
 10  hour            48193 non-null  int64  
 11  day_in_week     48193 non-null  int64  
 12  is_weekend      48193 non-null  int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 5.1+ MB


In [16]:
sorted(data['weather_main'].unique())

['Clear', 'Clouds', 'Mist', 'Rain', 'Snow']

### split data by condition

In [2]:
def split_dataset(data):
  """
    split data based on snow_1h and rain_1h columns.
    
    Steps:
    1. 90% of rows where 'snow_1h' == 'Have snow' go to train, 10% to test.
    2. from the remaining training data, 90% of rows where 'rain_1h' == 'Have rain' stay in train, 10% move to test.
    3. split continue in train dataset
    
  """
  # define snow for handle imbalance
  have_snow = data[data['snow_1h'] == 'Have snow']
  no_snow = data[data['snow_1h'] != 'Have snow']
  
  # split by snow
  snow_train = have_snow.sample(frac=0.9, random_state=42) 
  snow_test = have_snow.drop(snow_train.index)
  
  train_data = pd.concat([no_snow, snow_train])
  test_data = snow_test
  
  # define value in rain for handle imbalance
  have_rain = data[data['rain_1h'] == 'Have rain']
  no_rain = data[data['rain_1h'] == 'No rain']
  
  # continue slit with rain
  rain_train = have_rain.sample(frac=0.9, random_state=42) 
  rain_test = have_rain.drop(rain_train.index)  
  
  # train & test after split by columns
  train_data = pd.concat([no_rain, rain_train])
  test_data = pd.concat([test_data, rain_test])
  
  # Split in train set to get enough value for testing
  final_train = train_data.sample(frac=0.8, random_state=42)  
  validation = train_data.drop(final_train.index)  
  test_data = pd.concat([test_data, validation])

  return final_train, test_data

In [3]:
train_data, test_data = split_dataset(data)
print(f"train dataset length: ", len(train_data))
print(f"test data length: ", len(test_data))

train dataset length:  38277
test data length:  9922


### encoding

In [4]:
from sklearn.preprocessing import LabelEncoder

def label_encoding(df):
  # include only category columns
  category_column = df.select_dtypes(include=['object']).columns.tolist()
  
  encoded_df = df.copy()
  for col in category_column:
    encoded_df[col] = encoded_df[col].astype(str)
    
    label_encode = LabelEncoder()
    encoded_df[col] = label_encode.fit_transform(encoded_df[col])

  return encoded_df

In [5]:
train_data = label_encoding(train_data)
test_data = label_encoding(test_data)

In [6]:
train_data.head()

Unnamed: 0,traffic_volume,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,year,month,day,hour,day_in_week,is_weekend
27923,2647,7,286.22,1,1,0,0,2017,8,5,8,5,0
17337,5252,7,275.63,0,1,3,3,2013,12,2,14,0,0
25868,4283,7,284.38,1,1,1,3,2018,9,30,16,6,0
27700,6424,7,286.01,1,1,1,1,2017,10,25,17,2,0
29477,3209,7,287.41,1,1,0,0,2017,4,22,21,5,0


### Prepare model

In [11]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

def gbr_train(train_data, target_column, feature_columns, scoring='neg_root_mean_squared_error', early_stopping_rounds=20):

    X = train_data[feature_columns]
    y = train_data[target_column]

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    param_grid = {
        'n_estimators': [100, 200, 500],  
        'learning_rate': [0.01, 0.05, 0.1],  
        'max_depth': [3, 5],               
        'min_samples_split': [2, 5],     
        'min_samples_leaf': [1, 3],     
        'subsample': [0.7, 0.8],           
        'max_features': ['sqrt', 'log2'],  
    }

    gbr = GradientBoostingRegressor(random_state=42)

    # Perform GridSearchCV
    grid_search = GridSearchCV(
        estimator=gbr,
        param_grid=param_grid,
        scoring=scoring,
        cv=3,
        n_jobs=-1,
        verbose=2
    )
    grid_search.fit(X_train, y_train)

    # Retrieve the best model and parameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Implement manual early stopping
    min_val_error = float('inf')
    no_improve_count = 0

    for n in range(1, best_model.n_estimators + 1):
        # Set the number of estimators for the current iteration
        best_model.set_params(n_estimators=n)
        best_model.fit(X_train, y_train)

        # Evaluate on validation set
        y_val_pred = best_model.predict(X_val)
        val_error = mean_squared_error(y_val, y_val_pred)

        # Check for improvement
        if val_error < min_val_error:
            min_val_error = val_error
            no_improve_count = 0
        else:
            no_improve_count += 1

        if no_improve_count >= early_stopping_rounds:
            print(f"Early stopping at iteration {n}")
            break

    # Final evaluation on the training set
    y_train_pred = best_model.predict(X_train)
    r2_train = r2_score(y_train, y_train_pred)

    return best_model, best_params, r2_train


In [13]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

def xgb_train(train_data, target_column, feature_columns, scoring='neg_root_mean_squared_error', early_stopping_rounds=20):
    param_grid = {
        'max_depth': [3, 4],          
        'min_child_weight': [5, 10],       
        'learning_rate': [0.01, 0.05],  
        'n_estimators': [100, 200, 500],   
        'subsample': [0.7, 0.8],           
        'colsample_bytree': [0.7, 0.8],    
        'reg_alpha': [0, 0.1],              
        'reg_lambda': [1, 2],              
        'scale_pos_weight': [1, 10]        
    }
    
    X_train = train_data[feature_columns]
    y_train = train_data[target_column]
    
    # try early stopping
    X_train_sub, X_val, y_train_sub, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    
    gbr = XGBRegressor(random_state=42)
    
    grid_search = GridSearchCV(estimator=gbr, param_grid=param_grid, cv=3, scoring=scoring, n_jobs=-1, verbose=2)
    grid_search.fit(X_train_sub, y_train_sub, eval_set=[(X_val, y_val)], early_stopping_rounds=early_stopping_rounds, verbose=True)
    
    
    # get best params + estimators
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    
    # calculate r2
    y_train_pred = best_model.predict(X_train)
    r2_train = r2_score(y_train, y_train_pred)
    
    return best_model, best_params, r2_train

### evaluation

In [7]:
from sklearn.metrics import mean_squared_error

def evaluate_model(model, test_data, target_column, feature_columns):
    X_test = test_data[feature_columns]
    y_test = test_data[target_column]
    y_pred = model.predict(X_test)

    rmse = mean_squared_error(y_test, y_pred, squared=False)

    return rmse, f"The RMSE of the {model} is {rmse}"

### model dumping function

In [8]:
import joblib
def save_file(scaler, filename):
    joblib.dump(scaler, filename)
    return f"Saving {filename} successfully executed"

### train model

In [9]:
feature_columns = [col for col in train_data.columns if col != "traffic_volume"]
feature_columns

['holiday',
 'temp',
 'rain_1h',
 'snow_1h',
 'clouds_all',
 'weather_main',
 'year',
 'month',
 'day',
 'hour',
 'day_in_week',
 'is_weekend']

#### gradient boosting

In [12]:
best_model, best_params, r2_train = gbr_train(train_data, target_column="traffic_volume", feature_columns=feature_columns)
# evaluation
rmse, evaluation_message = evaluate_model(best_model, test_data, target_column="traffic_volume", feature_columns=feature_columns)

print(f"\n Best Hyperparameters: {best_params}"),
print(f"\n R2 on Train-set: {r2_train}"),
print(f"\n Root Mean Squared Error (RMSE) on Test Set: {rmse}")
print(evaluation_message)

Fitting 3 folds for each of 288 candidates, totalling 864 fits

 Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500, 'subsample': 0.7}

 R2 on Train-set: 0.9697219274130239

 Root Mean Squared Error (RMSE) on Test Set: 552.5216367756693
The RMSE of the GradientBoostingRegressor(max_depth=5, max_features='sqrt', min_samples_split=5,
                          n_estimators=500, random_state=42, subsample=0.7) is 552.5216367756693




In [13]:
path = './saved_feature/gbr.joblib'
save_file(best_model, path)

'Saving ./saved_feature/gbr.joblib successfully executed'

#### xgb

In [14]:
best_model, best_params, r2_train = xgb_train(train_data, target_column="traffic_volume", feature_columns=feature_columns)
# evaluation
rmse = evaluate_model(best_model, test_data, target_column="traffic_volume", feature_columns=feature_columns)

print(f"\n Best Hyperparameters: {best_params}"),
print(f"\n R2 on Train-set: {r2_train}"),
print(f"\n Root Mean Squared Error (RMSE) on Test Set: {rmse}")

Fitting 3 folds for each of 768 candidates, totalling 2304 fits
[0]	validation_0-rmse:1889.50924
[1]	validation_0-rmse:1818.48703
[2]	validation_0-rmse:1739.71016
[3]	validation_0-rmse:1665.31921
[4]	validation_0-rmse:1595.26270
[5]	validation_0-rmse:1529.53663
[6]	validation_0-rmse:1523.13924
[7]	validation_0-rmse:1469.88905
[8]	validation_0-rmse:1411.13858
[9]	validation_0-rmse:1364.36551
[10]	validation_0-rmse:1311.64316
[11]	validation_0-rmse:1262.08499
[12]	validation_0-rmse:1223.07177
[13]	validation_0-rmse:1178.81803
[14]	validation_0-rmse:1173.14680
[15]	validation_0-rmse:1132.09271
[16]	validation_0-rmse:1093.24905
[17]	validation_0-rmse:1056.74596
[18]	validation_0-rmse:1022.69476
[19]	validation_0-rmse:995.92028




[20]	validation_0-rmse:971.14277
[21]	validation_0-rmse:947.92027
[22]	validation_0-rmse:920.49348
[23]	validation_0-rmse:916.20202
[24]	validation_0-rmse:890.83299
[25]	validation_0-rmse:866.11382
[26]	validation_0-rmse:862.78393
[27]	validation_0-rmse:840.99688
[28]	validation_0-rmse:837.58697
[29]	validation_0-rmse:817.23024
[30]	validation_0-rmse:801.81209
[31]	validation_0-rmse:787.70626
[32]	validation_0-rmse:770.12103
[33]	validation_0-rmse:753.70053
[34]	validation_0-rmse:739.03573
[35]	validation_0-rmse:724.95924
[36]	validation_0-rmse:722.38143
[37]	validation_0-rmse:712.74488
[38]	validation_0-rmse:703.71118
[39]	validation_0-rmse:701.37289
[40]	validation_0-rmse:689.84989
[41]	validation_0-rmse:682.06821
[42]	validation_0-rmse:675.02305
[43]	validation_0-rmse:665.87840
[44]	validation_0-rmse:656.94393
[45]	validation_0-rmse:648.97431
[46]	validation_0-rmse:641.66626
[47]	validation_0-rmse:634.96172
[48]	validation_0-rmse:628.08291
[49]	validation_0-rmse:623.45370
[50]	valid



In [15]:
path = './saved_feature/xgb_2.joblib'
save_file(best_model, path)

'Saving ./saved_feature/xgb_2.joblib successfully executed'