**Przygotowywanie danych do modelu**



In [1]:
!pip install --upgrade scikit-learn==1.8.0
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


Collecting scikit-learn==1.8.0
  Downloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
Successfully installed scikit-learn-1.8.0


In [2]:
# Location of the raw CSV inside the project's GitHub repository.
url = "https://raw.githubusercontent.com/JanP5563/Projekt/main/models/SeoulBikeData.csv"

# NOTE(review): the "unicode_escape" codec is presumably needed for the
# non-ASCII header characters (e.g. the degree sign) - confirm against the file.
df = pd.read_csv(filepath_or_buffer=url, encoding="unicode_escape")

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


Czyścimy dane

In [3]:
# Parse the day/month/year strings once and derive the calendar features from them.
parsed_dates = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# Year is not extracted as a feature: per the original author's note, a linear
# regression would only extend the previous trend.
df = df.assign(Day=parsed_dates.dt.day, Month=parsed_dates.dt.month)

print(df[['Day', 'Month']].head())

# The raw date column is no longer needed once Day and Month exist.
df = df.drop(columns='Date')

   Day  Month
0    1     12
1    1     12
2    1     12
3    1     12
4    1     12


In [4]:
# Turn the two-valued text flags into 0/1 numbers.
holiday_codes = {'No Holiday': 0, 'Holiday': 1}
functioning_codes = {'No': 0, 'Yes': 1}
df = df.assign(**{
    'Holiday': df['Holiday'].map(holiday_codes),
    'Functioning Day': df['Functioning Day'].map(functioning_codes),
})

# One-hot encode the categorical columns in a single call; the dummy columns are
# appended in the listed order (Seasons, Day, Month) and the first level of each is dropped.
df = pd.get_dummies(df, columns=['Seasons', 'Day', 'Month'], dtype=int, drop_first=True)
df.head()

Unnamed: 0,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),...,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12
0,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
3,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
4,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1


Tworzymy kolumnę pomocniczą

In [5]:
def is_rush_hour(hour):
    """Return 1 if ``hour`` lies in 8-9 or 17-19 (both ranges inclusive), otherwise 0."""
    in_morning_peak = 8 <= hour <= 9
    in_evening_peak = 17 <= hour <= 19
    return 1 if (in_morning_peak or in_evening_peak) else 0
# Flag every row by passing its Hour value through is_rush_hour.
df['IsRushHour'] = df['Hour'].map(is_rush_hour)
df.head()



Unnamed: 0,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),...,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12,IsRushHour
0,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
2,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
3,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
4,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0


Dzielimy dane

In [6]:
# Separate the target from the feature matrix.
target_column = 'Rented Bike Count'
X = df.drop(columns=target_column)
y = df[target_column]

# shuffle=False keeps the row order, so each held-out part is cut from the end:
# first the last 15% of all rows as the test set, then the last 18% of the
# remaining rows as the validation set.
X_rest, X_test, y_rest, y_test = train_test_split(X, y, shuffle=False, test_size=0.15)
X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, shuffle=False, test_size=0.18)

print(f"Trening: {len(X_train)} wierszy")
print(f"Walidacja: {len(X_val)} wierszy")
print(f"Test: {len(X_test)} wierszy")

Trening: 6105 wierszy
Walidacja: 1341 wierszy
Test: 1314 wierszy


Obróbka danych

In [7]:
from sklearn.preprocessing import StandardScaler
import joblib

# Fit the scaler on the training split only, then apply the same fitted
# transformation to all three splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

# Save the fitted scaler to disk; the call's return value is the cell output.
joblib.dump(scaler, 'scaler_linear.pkl')


['scaler_linear.pkl']

**Trening modelu**

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# Fit ordinary least squares on the scaled training features.
model = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for the validation split (y_pred) and the test split (y_pred_test).
y_pred = model.predict(X_val_scaled)
y_pred_test = model.predict(X_test_scaled)

# Compute each test metric once; RMSE is the square root of the MSE.
test_r2 = r2_score(y_test, y_pred_test)
test_mae = mean_absolute_error(y_test, y_pred_test)
test_mse = mean_squared_error(y_test, y_pred_test)
test_rmse = np.sqrt(test_mse)

print("--- WYNIKI REGRESJI LINIOWEJ ---")
# Only test-split metrics are printed; the validation predictions in y_pred are not reported here.

print(f"R2 Score Test: {test_r2:.4f}")
print(f"(MAE) Test: {test_mae:.2f} rowerów")
print(f"(MSE) Test: {test_mse:.0f} rowerów^2")
print(f"(RMSE) Test: {test_rmse:.2f} rowerów")


--- WYNIKI REGRESJI LINIOWEJ ---
R2 Score Test: 0.4324
(MAE) Test: 318.36 rowerów
(MSE) Test: 184198 rowerów^2
(RMSE) Test: 429.18 rowerów


In [9]:
# Save the fitted regression model to disk; the call's return value is the cell output.
joblib.dump(value=model, filename='model_linear.pkl')

['model_linear.pkl']