In [24]:
import pandas as pd
import numpy as np
import plotly.express as px

from tqdm import tqdm

from data.city.load_cities import CITY

df_dataset: pd.DataFrame = CITY.df_hours.copy()
station_name: str = df_dataset.columns[0]

In [216]:
class MainTest():
    def __init__(self) -> None:
        print(self.__class__.__name__)

class Test(MainTest):
    def __init__(self) -> None:
        super().__init__()

Test()

Test


<__main__.Test at 0x2284fc400e0>

In [209]:
def create_features_from_date(station_name: str, df: pd.DataFrame) -> pd.DataFrame:
    df_X = pd.DataFrame()
    df_X['hour'] = df['date'].dt.hour.astype('uint8')
    df_X['day_of_week'] = df['date'].dt.dayofweek.astype('uint8')
    df_X['day_of_month'] = df['date'].dt.day.astype('uint8')
    df_X['is_weekend'] = (df['date'].dt.dayofweek >= 5).astype('uint8')
    df_X['is_sunday'] = (df['date'].dt.dayofweek == 6).astype('uint8')

    # for hour in [1, 6, 12, 24]:
    #     df_X[f'lag_{hour:>02}h'] = df[station_name].shift(hour)
    
    return df_X.bfill()

In [210]:
df_X = create_features_from_date(station_name, df_dataset)

df_y = df_dataset[station_name]

In [222]:
pd.date_range('2023', periods=10, freq='1h', inclusive='right')

DatetimeIndex(['2023-01-01 01:00:00', '2023-01-01 02:00:00',
               '2023-01-01 03:00:00', '2023-01-01 04:00:00',
               '2023-01-01 05:00:00', '2023-01-01 06:00:00',
               '2023-01-01 07:00:00', '2023-01-01 08:00:00',
               '2023-01-01 09:00:00'],
              dtype='datetime64[ns]', freq='h')

In [211]:
df_X

Unnamed: 0,hour,day_of_week,day_of_month,is_weekend,is_sunday
1,0,4,1,0,0
2,1,4,1,0,0
3,2,4,1,0,0
4,3,4,1,0,0
5,4,4,1,0,0
...,...,...,...,...,...
4316,19,1,27,0,0
4317,20,1,27,0,0
4318,21,1,27,0,0
4319,22,1,27,0,0


In [206]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.3, random_state=48, shuffle=False)

In [207]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(
    n_estimators=12,
    random_state=48,
    max_depth=12,
    n_jobs=-1,
    verbose=0
)

rf_model.fit(X_train, y_train)

from sklearn.metrics import mean_squared_error, mean_absolute_error
print('MSE:', mean_squared_error(y_test, rf_model.predict(X_test)))
print('MAE:', mean_absolute_error(y_test, rf_model.predict(X_test)))

MSE: 0.032065216361566506
MAE: 0.12001042755260201


In [208]:
maj = len(y_test)
data = X_test[:maj].reset_index(drop=True).copy()
predicted = []
for index in tqdm(range(maj)):
    predicted_value = rf_model.predict(data.iloc[index].to_frame().T)
    predicted.append(predicted_value)

    for hour in [1, 6, 12, 24]:
        if index + hour < maj:
            data.loc[index + hour, f'lag_{hour:>02}h'] = predicted_value

print('MSE:', mean_squared_error(y_test[:maj], predicted))
print('MAE:', mean_absolute_error(y_test[:maj], predicted))
px.line(pd.DataFrame({'reality': y_test[:maj], 'predicted': np.array(predicted).flatten()}))

100%|██████████| 1296/1296 [00:26<00:00, 49.40it/s]

MSE: 0.09147017382242172
MAE: 0.2118458288233093





In [218]:
try:
    joblib.load('model_path')
except FileNotFoundError:
    print('cc')

cc


In [157]:
import joblib
joblib.dump(rf_model, 'cool.pkl', compress=3)

px.line(pd.DataFrame({'reality': y_test.to_numpy(), 'predicted': rf_model.predict(X_test)}, index=df_dataset.loc[y_test.index, 'date']))

In [None]:
df = pd.DataFrame()
df['hour'] = df_dataset['date'].dt.hour.astype('uint8')
df['day_of_week'] = df_dataset['date'].dt.dayofweek.astype('uint8')
df['day_of_month'] = df_dataset['date'].dt.day.astype('uint8')
df['is_weekend'] = (df_dataset['date'].dt.dayofweek >= 5).astype('uint8')
df['is_sunday'] = (df_dataset['date'].dt.dayofweek == 6).astype('uint8')

station_encoder = {station: code for station, code in zip(df_dataset.columns, range(len(df_dataset)))}
df['station_id'] = pd.Series(-1, index=df.index)

columns_of_df = []
for station_name in tqdm(df_dataset.columns[:-1]):
    for hour in [1, 6, 12, 24]:
        columns_of_df.append(df_dataset[station_name].shift(hour).rename(f'{station_name}_{hour}h_lag').astype('float32'))
        if hour == 1:
            continue

        columns_of_df.append(df_dataset[station_name].rolling(hour).mean().rename(f'{station_name}_{hour}h_mean').astype('float32'))
        columns_of_df.append(df_dataset[station_name].rolling(hour).std().rename(f'{station_name}_{hour}h_std').astype('float32'))
        columns_of_df.append(df_dataset[station_name].rolling(hour).max().rename(f'{station_name}_{hour}h_max').astype('float32'))
        columns_of_df.append(df_dataset[station_name].rolling(hour).min().rename(f'{station_name}_{hour}h_min').astype('float32'))
df = pd.concat([df] + columns_of_df, axis='columns').bfill()

rows_of_df = []
for station_name in tqdm(df_dataset.columns[:-1]):
    df['station_id'] = station_encoder[station_name]
    rows_of_df.append(df.copy())
df_X = pd.concat(rows_of_df, axis='index')
df_X['station_id'] = df_X['station_id'].astype('uint8')

In [33]:
rows_of_df = []
dates_of_df = []
for station_name in tqdm(df_dataset.columns[:-1]):
    rows_of_df.append(df_dataset[station_name].copy())
    dates_of_df.append(df_dataset['date'].copy())
df_y = pd.concat(rows_of_df, axis='index', ignore_index=True)
df_date = pd.concat(dates_of_df, axis='index', ignore_index=True)
df_y = df_y.astype('float32').to_frame()
df_y['date'] = df_date

100%|██████████| 185/185 [00:00<00:00, 15087.72it/s]


In [37]:
pd.date_range(df_y.loc[1416, 'date'], periods=168, freq='1h')

DatetimeIndex(['2016-05-30 00:00:00', '2016-05-30 01:00:00',
               '2016-05-30 02:00:00', '2016-05-30 03:00:00',
               '2016-05-30 04:00:00', '2016-05-30 05:00:00',
               '2016-05-30 06:00:00', '2016-05-30 07:00:00',
               '2016-05-30 08:00:00', '2016-05-30 09:00:00',
               ...
               '2016-06-05 14:00:00', '2016-06-05 15:00:00',
               '2016-06-05 16:00:00', '2016-06-05 17:00:00',
               '2016-06-05 18:00:00', '2016-06-05 19:00:00',
               '2016-06-05 20:00:00', '2016-06-05 21:00:00',
               '2016-06-05 22:00:00', '2016-06-05 23:00:00'],
              dtype='datetime64[ns]', length=168, freq='h')

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.3, random_state=48)

In [9]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(
    n_estimators=12,
    random_state=48,
    max_depth=5,
    n_jobs=-1,
    verbose=2
)

rf_model.fit(X_train, y_train)

rf_model.score(X_test, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.


building tree 1 of 12building tree 2 of 12
building tree 3 of 12
building tree 4 of 12

building tree 5 of 12
building tree 6 of 12
building tree 7 of 12
building tree 8 of 12
building tree 9 of 12
building tree 10 of 12
building tree 11 of 12
building tree 12 of 12


[Parallel(n_jobs=-1)]: Done   8 out of  12 | elapsed:  7.9min remaining:  3.9min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  7.9min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   3 out of  12 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=12)]: Done  10 out of  12 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    0.0s finished


0.2076986763026334

In [16]:
first_test = X_test[(X_test['station_id'] == 0) & (X_test['hour'] == 0) & (X_test['day_of_week'] == 0)].iloc[0].to_frame().T


rf_model.predict(first_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   3 out of  12 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=12)]: Done  10 out of  12 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    0.0s finished


array([0.37560713])

In [19]:
y_test.loc[1416]

0.2958477

In [21]:
# Importing the libraries
import numpy as np # for array operations
import pandas as pd # for working with DataFrames
import requests, io # for HTTP requests and I/O commands
import matplotlib.pyplot as plt # for data visualization
%matplotlib inline

# scikit-learn modules
from sklearn.model_selection import train_test_split # for splitting the data
from sklearn.metrics import mean_squared_error # for calculating the cost function
from sklearn.ensemble import RandomForestRegressor # for building the model

In [22]:
# Importing the dataset from the url of the data set
url = "https://drive.google.com/u/0/uc?id=1mVmGNx6cbfvRHC_DvF12ZL3wGLSHD9f_&export=download"
data = requests.get(url).content
# Reading the data
dataset = pd.read_csv(io.StringIO(data.decode('utf-8')))
dataset.head()

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption
0,9.0,3571,1976,0.525,541
1,9.0,4092,1250,0.572,524
2,9.0,3865,1586,0.58,561
3,7.5,4870,2351,0.529,414
4,8.0,4399,431,0.544,410


In [23]:
x = dataset.drop('Petrol_Consumption', axis = 1) # Features
y = dataset['Petrol_Consumption']  # Target

In [24]:
# Splitting the dataset into training and testing set (80/20)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 28)

In [25]:
# Initializing the Random Forest Regression model with 10 decision trees
model = RandomForestRegressor(n_estimators = 10, random_state = 0)

# Fitting the Random Forest Regression model to the data
model.fit(x_train, y_train)

In [26]:
# Predicting the target values of the test set
y_pred = model.predict(x_test)

# RMSE (Root Mean Square Error)
rmse = float(format(np.sqrt(mean_squared_error(y_test, y_pred)), '.3f'))
print("\nRMSE: ", rmse)


RMSE:  96.389
