**Przygotowywanie danych do modelu**



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


In [3]:
url = "https://raw.githubusercontent.com/JanP5563/Projekt/main/models/SeoulBikeData.csv"
df = pd.read_csv(url, encoding = "unicode_escape")
df.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


Czyścimy dane

In [4]:
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
df['Day'] = df['Date'].dt.day
df['Month'] = df['Date'].dt.month
# Usuwam kolumnę Year, XBoost nie porwadzi sobie z nią

df = df.drop('Date', axis=1)

In [5]:
df['Holiday'] = df['Holiday'].map({'No Holiday': 0, 'Holiday': 1})
df['Functioning Day'] = df['Functioning Day'].map({'No': 0, 'Yes': 1})
df = pd.get_dummies(df, columns=['Seasons'], dtype = int, drop_first=True)
df = pd.get_dummies(df, columns=['Day'], dtype = int, drop_first=True)
df = pd.get_dummies(df, columns=['Month'], dtype = int, drop_first=True)
df.head()

Unnamed: 0,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),...,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12
0,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
3,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
4,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1


Tworzymy kolumny pomocnicze

In [6]:
def is_rush_hour(hour):
  if(hour >= 8 and hour <= 9) or (hour >= 17 and hour <= 19):
    return 1
  else:
    return 0
df['IsRushHour'] = df['Hour'].apply(is_rush_hour)
df.head()



Unnamed: 0,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),...,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12,IsRushHour
0,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
2,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
3,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
4,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0


Dzielimy dane

In [7]:
y = df['Rented Bike Count']
X = df.drop('Rented Bike Count', axis=1)

X_rest, X_test, y_rest, y_test = train_test_split(X, y, test_size=0.15, shuffle=False)
X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, test_size=0.18, shuffle=False)

print(f"Trening: {len(X_train)} wierszy")
print(f"Walidacja: {len(X_val)} wierszy")
print(f"Test: {len(X_test)} wierszy")

Trening: 6105 wierszy
Walidacja: 1341 wierszy
Test: 1314 wierszy


Zaszumiamy dane

In [8]:
def add_noise_df(df, noise_level=0.01):
    noise = noise_level * np.random.randn(*df.shape)
    return pd.DataFrame(df.values + noise, columns=df.columns, index=df.index)

X_train_noisy = add_noise_df(X_train, noise_level=0.005)

X_train_augmented = pd.concat([X_train, X_train_noisy], axis=0)
y_train_augmented = pd.concat([y_train, y_train], axis=0)

print(f"Liczba wierszy przed: {len(X_train)}")
print(f"Liczba wierszy po augmentacji: {len(X_train_augmented)}")

Liczba wierszy przed: 6105
Liczba wierszy po augmentacji: 12210


**Trening modelu**

In [9]:
import xgboost as xgb
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

model_xgb = xgb.XGBRegressor(
    n_estimators=1000,
    max_depth=3,
    learning_rate=0.03,
    subsample=0.7,
    colsample_bytree=0.7,
    gamma = 10,
    early_stopping_rounds=50
)

model_xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
#model_xgb.fit(X_train_augmented, y_train_augmented, eval_set=[(X_val, y_val)], verbose=False)

y_val_xgb = model_xgb.predict(X_val)
y_test_xgb = model_xgb.predict(X_test)
print("--- WYNIKI XGBOOST  ---")
print(f"R2 Treningowy: {r2_score(y_train, model_xgb.predict(X_train)):.4f}")
print(f"R2 Walidacyjny: {r2_score(y_val, y_val_xgb):.4f}")
print(f"MAE Treningowe: {mean_absolute_error(y_train, model_xgb.predict(X_train)):.2f}")
print(f"MAE Walidacyjne: {mean_absolute_error(y_val, y_val_xgb):.2f}")
print(f"RMSE Treningowe: {np.sqrt(mean_absolute_error(y_train, model_xgb.predict(X_train))):.2f}")
print(f"RMSE Walidacyjne: {np.sqrt(mean_absolute_error(y_val, y_val_xgb)):.2f}")

--- WYNIKI XGBOOST  ---
R2 Treningowy: 0.9200
R2 Walidacyjny: 0.8030
MAE Treningowe: 117.27
MAE Walidacyjne: 229.07
RMSE Treningowe: 10.83
RMSE Walidacyjne: 15.13


In [10]:
print("--- WYNIKI KOŃCOWE XGBOOST  ---")
print(f"R2 Testowy: {r2_score(y_test, y_test_xgb):.4f}")
print(f"MAE Testowy: {mean_absolute_error(y_test, y_test_xgb):.4f}")
print(f"MSE Testowy: {mean_squared_error(y_test, y_test_xgb):.4f}")
print(f"RMSE Testowy: {np.sqrt(mean_squared_error(y_test, y_test_xgb)):.4f}")

--- WYNIKI KOŃCOWE XGBOOST  ---
R2 Testowy: 0.5614
MAE Testowy: 271.2101
MSE Testowy: 142310.8750
RMSE Testowy: 377.2411


In [11]:
import joblib
joblib.dump(model_xgb, 'model_xgb.pkl')

['model_xgb.pkl']

In [12]:
# from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
# import xgboost as xgb

# xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# #
# param_grid = {
#     'n_estimators': [500, 750, 1000],
#     'max_depth': [ 6, 7, 8, 9],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'subsample': [0.7, 0.8, 0.9],
#     'colsample_bytree': [0.7, 0.8, 0.9],
#     'gamma': [0, 0.1, 0.2]
# }

# tscv = TimeSeriesSplit(n_splits=5)

# random_search = RandomizedSearchCV(
#     estimator=xgb_model,
#     param_distributions=param_grid,
#     n_iter=10,
#     scoring='r2',
#     cv=tscv,
#     verbose=1,
#     random_state=42,
#     n_jobs=-1
# )

# random_search.fit(X_train_augmented, y_train_augmented)

# print(f"Najlepsze parametry: {random_search.best_params_}")
# print(f"Najlepszy wynik R2: {random_search.best_score_:.4f}")

#
# best_xgb = random_search.best_estimator_

In [14]:
df.columns

Index(['Rented Bike Count', 'Hour', 'Temperature(°C)', 'Humidity(%)',
       'Wind speed (m/s)', 'Visibility (10m)', 'Dew point temperature(°C)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)', 'Holiday',
       'Functioning Day', 'Seasons_Spring', 'Seasons_Summer', 'Seasons_Winter',
       'Day_2', 'Day_3', 'Day_4', 'Day_5', 'Day_6', 'Day_7', 'Day_8', 'Day_9',
       'Day_10', 'Day_11', 'Day_12', 'Day_13', 'Day_14', 'Day_15', 'Day_16',
       'Day_17', 'Day_18', 'Day_19', 'Day_20', 'Day_21', 'Day_22', 'Day_23',
       'Day_24', 'Day_25', 'Day_26', 'Day_27', 'Day_28', 'Day_29', 'Day_30',
       'Day_31', 'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6',
       'Month_7', 'Month_8', 'Month_9', 'Month_10', 'Month_11', 'Month_12',
       'IsRushHour'],
      dtype='object')

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 57 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Rented Bike Count          8760 non-null   int64  
 1   Hour                       8760 non-null   int64  
 2   Temperature(°C)            8760 non-null   float64
 3   Humidity(%)                8760 non-null   int64  
 4   Wind speed (m/s)           8760 non-null   float64
 5   Visibility (10m)           8760 non-null   int64  
 6   Dew point temperature(°C)  8760 non-null   float64
 7   Solar Radiation (MJ/m2)    8760 non-null   float64
 8   Rainfall(mm)               8760 non-null   float64
 9   Snowfall (cm)              8760 non-null   float64
 10  Holiday                    8760 non-null   int64  
 11  Functioning Day            8760 non-null   int64  
 12  Seasons_Spring             8760 non-null   int64  
 13  Seasons_Summer             8760 non-null   int64