In [10]:
import pandas as pd

In [11]:
path = "C:/Users/Admin/OneDrive/VNU/4.1/3. Các hệ thống thông tin toàn cầu/Final/processed_data.csv"
data = pd.read_csv(path)
data.head()

Unnamed: 0,traffic_volume,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,year,month,day,hour,day_in_week,is_weekend
0,1462,No,243.39,No rain,No snow,Clear,Haze,haze,2016,12,18,8,6,0
1,1037,No,243.62,No rain,No snow,Clear,Haze,haze,2016,12,18,7,6,0
2,800,No,244.22,No rain,No snow,Clear,Clear,sky is clear,2016,12,18,6,6,0
3,354,No,244.82,No rain,No snow,Party Cloudy,Clouds,few clouds,2013,2,2,3,5,0
4,417,No,244.82,No rain,No snow,Party Cloudy,Clouds,few clouds,2013,2,2,4,5,0


In [12]:
have_snow = data[data['snow_1h'] == 'Have snow']
no_snow = data[data['snow_1h'] != 'Have snow']

snow_train = have_snow.sample(frac=0.9, random_state=42) 
snow_test = have_snow.drop(snow_train.index)

print(snow_test.count())
print(snow_train.count())

traffic_volume         6
holiday                6
temp                   6
rain_1h                6
snow_1h                6
clouds_all             6
weather_main           6
weather_description    6
year                   6
month                  6
day                    6
hour                   6
day_in_week            6
is_weekend             6
dtype: int64
traffic_volume         57
holiday                57
temp                   57
rain_1h                57
snow_1h                57
clouds_all             57
weather_main           57
weather_description    57
year                   57
month                  57
day                    57
hour                   57
day_in_week            57
is_weekend             57
dtype: int64


# split data by condition

In [13]:
def split_dataset(data):
  """
    split data based on snow_1h and rain_1h columns.
    
    Steps:
    1. 90% of rows where 'snow_1h' == 'Have snow' go to train, 10% to test.
    2. from the remaining training data, 90% of rows where 'rain_1h' == 'Have rain' stay in train, 10% move to test.
    3. split continue in train dataset
    
  """
  # define snow for handle imbalance
  have_snow = data[data['snow_1h'] == 'Have snow']
  no_snow = data[data['snow_1h'] != 'Have snow']
  
  # split by snow
  snow_train = have_snow.sample(frac=0.9, random_state=42) 
  snow_test = have_snow.drop(snow_train.index)
  
  train_data = pd.concat([no_snow, snow_train])
  test_data = snow_test
  
  # define value in rain for handle imbalance
  have_rain = data[data['rain_1h'] == 'Have rain']
  no_rain = data[data['rain_1h'] == 'No rain']
  
  # continue slit with rain
  rain_train = have_rain.sample(frac=0.9, random_state=42) 
  rain_test = have_rain.drop(rain_train.index)  
  
  # train & test after split by columns
  train_data = pd.concat([no_rain, rain_train])
  test_data = pd.concat([test_data, rain_test])
  
  # Split in train set to get enough value for testing
  final_train = train_data.sample(frac=0.8, random_state=42)  
  validation = train_data.drop(final_train.index)  
  test_data = pd.concat([test_data, validation])

  return final_train, test_data

In [14]:
train_data, test_data = split_dataset(data)
print(f"train dataset length: ", len(train_data))
print(f"test data length: ", len(test_data))

train dataset length:  38277
test data length:  9922


In [15]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

def one_hot_encoding(df):
    # Include only category columns
    category_columns = df.select_dtypes(include=['object']).columns.tolist()
    
    # Create a OneHotEncoder instance
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    
    # Fit and transform the categorical columns
    encoded_df = df.copy()
    onehot_encoded = onehot_encoder.fit_transform(df[category_columns])
    
    # Create a DataFrame with the one-hot encoded data
    onehot_encoded_df = pd.DataFrame(onehot_encoded, columns=onehot_encoder.get_feature_names_out(category_columns))
    
    # Drop the original categorical columns and concatenate the one-hot encoded columns
    encoded_df = encoded_df.drop(columns=category_columns).reset_index(drop=True)
    encoded_df = pd.concat([encoded_df, onehot_encoded_df], axis=1)
    
    return encoded_df

## Prepare model

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

def random_forest_train(train_data, target_column, feature_columns, hyperparameters):
    # Identify categorical columns
    categorical_columns = train_data.select_dtypes(include=['object']).columns.tolist()

    # Define the column transformer with OneHotEncoder for categorical features
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
        ],
        remainder='passthrough'
    )
    
    X_train = train_data[feature_columns]
    y_train = train_data[target_column]
    
    # Set up the RandomForestRegressor with specified hyperparameters
    rf = RandomForestRegressor(**hyperparameters, random_state=42)
    
    # Create a pipeline that first applies the preprocessor and then the RandomForestRegressor
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('rf', rf)])
    
    # Fit the model
    pipeline.fit(X_train, y_train)
    
    return pipeline, hyperparameters

## evaluation

In [17]:
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

def evaluate_model(model, train_data, test_data, target_column, feature_columns):
    X_train = train_data[feature_columns]
    y_train = train_data[target_column]
    X_test = test_data[feature_columns]
    y_test = test_data[target_column]
    
    # Calculate R2 on training data
    y_train_pred = model.predict(X_train)
    r2_train = r2_score(y_train, y_train_pred)
    
    # Calculate RMSE on test data
    y_test_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    
    return r2_train, rmse

### model dumping function

In [24]:
import joblib
def save_file(scaler, filename):
    joblib.dump(scaler, filename)
    return f"Saving {filename} successfully executed"

save_file(model,'random_forest_model.joblib')

'Saving random_forest_model.joblib successfully executed'

## train model

In [19]:
feature_columns = train_data.columns.drop('traffic_volume').tolist()
hyperparameters = {
    'n_estimators': 200,
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'max_features': 'sqrt'
}

In [20]:
# Train the model
model, best_params = random_forest_train(train_data, target_column="traffic_volume", feature_columns=feature_columns, hyperparameters=hyperparameters)

# Evaluate the model
r2_train, rmse = evaluate_model(model, train_data, test_data, target_column="traffic_volume", feature_columns=feature_columns)

# Print the results
print(f"\nBest Hyperparameters: {best_params}")
print(f"\nR2 on Train-set: {r2_train}")
print(f"\nRoot Mean Squared Error (RMSE) on Test Set: {rmse}")


Best Hyperparameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt'}

R2 on Train-set: 0.686393924554298

Root Mean Squared Error (RMSE) on Test Set: 1124.0897916449273
