# 🎯 Final Model: Polynomial Regression with Clean Features

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Cleaned dataset: filtered and ready for modeling
df_cleaned = pd.read_csv("CBS_Cleaned.csv", parse_dates=["started_at"])

# 2. Capacity info from the station file
df_stations = pd.read_excel("Capital_Bikeshare_Locations.xltx")
capacity_data = df_stations[["NAME", "CAPACITY"]].dropna()
capacity_data.columns = ["start_station_name", "CAPACITY"]

# 3. Create daily ride counts by station
df_cleaned["date"] = df_cleaned["started_at"].dt.date
daily_rides = (
    df_cleaned.groupby(["start_station_name", "date"])
    .size()
    .reset_index(name="ride_count")
)

# 4. Load weather data (daily)
weather = pd.read_csv("CBS_2021-2023_Daily_Weather.csv", parse_dates=["Date"])
weather.columns = [
    "date", "casual", "member", "total_rides_weather",
    "apparent_temp", "actual_temp", "weather_code", "wind_speed"
]

# 5. Merge ride data with weather data on date
df_model = daily_rides.merge(weather, on="date", how="left")

# 6. Merge capacity data
df_model = df_model.merge(capacity_data, on="start_station_name", how="left")

import os
output_path = "CBS_Model_Features_Base.csv"
df_model.to_csv(output_path, index=False)
output_path

'CBS_Model_Features_Base.csv'

In [10]:
df = pd.read_csv("CBS_Model_Features_Base.csv", parse_dates=["date"])

In [15]:
features = ["actual_temp", "apparent_temp", "weather_code", "wind_speed", "CAPACITY"]

In [13]:
print(df.columns.tolist())

['start_station_name', 'date', 'ride_count', 'casual', 'member', 'total_rides_weather', 'apparent_temp', 'actual_temp', 'weather_code', 'wind_speed', 'CAPACITY']


🧠 Adım 2: Özellikleri ve Hedef Değişkeni Ayır

In [16]:
features = ["actual_temp", "apparent_temp", "weather_code", "wind_speed", "CAPACITY"]
X = df[features]
y = df["ride_count"]

In [17]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

🧮 Adım 3: Polinomial Özellik Dönüşümü

In [19]:
X.isna().sum()


actual_temp      588023
apparent_temp    588023
weather_code     588023
wind_speed       588023
CAPACITY          46840
dtype: int64

In [18]:
# Polinomial özellikler (2. derece)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Yeni özellik isimlerini de alalım (isteğe bağlı ama sunumda işe yarayabilir)
feature_names = poly.get_feature_names_out(features)

ValueError: Input X contains NaN.
PolynomialFeatures does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [20]:
df_cleaned["started_at"].dt.date.nunique()

1095

In [22]:
print(weather.columns)

Index(['date', 'casual', 'member', 'total_rides_weather', 'apparent_temp',
       'actual_temp', 'weather_code', 'wind_speed'],
      dtype='object')
