In [1]:
# === Imports ===
import pandas as pd
import numpy as np
 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
 
# === Config ===
# Input file plus the column names the rest of the script relies on.
DATA_PATH = "weather_linear_regression_10000.csv"
DATE_COL = "date"
TARGET = "temperature_c"

# Numeric predictor columns only. The target ('temperature_c') is deliberately
# left out so it can never be fed to the model as a feature.
NUMERIC_COLS = [
    "humidity_percent", "pressure_hpa", "wind_speed_kmph",
    "cloud_cover_percent", "rainfall_mm", "sunshine_hours",
]
 
# === Custom Date Transformer (robust to pandas versions) ===
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Turn a single date column into numeric calendar features.

    Output columns, in order:
    year, month, day, dayofweek, dayofyear,
    month_sin, month_cos, doy_sin, doy_cos.

    Accepted inputs for ``transform``:
      * ``pd.DataFrame`` - the first column is used;
      * ``pd.Series`` - used as-is;
      * 2-D array-like - the first column is used;
      * 1-D array-like / list - used as-is.

    Missing-date policy: values that cannot be parsed become NaT and are then
    filled from the nearest parsed neighbour (forward fill, then backward
    fill). The filled value therefore depends on row order within the batch
    passed to ``transform``. If no value at all can be parsed, every row is
    given the fallback date ``2000-01-01``. As a consequence the returned
    array never contains NaN.
    """

    # Date substituted for every row when nothing in the input can be parsed.
    _FALLBACK_DATE = "2000-01-01"

    def __init__(self):
        self.feature_names_ = [
            "year", "month", "day", "dayofweek", "dayofyear",
            "month_sin", "month_cos", "doy_sin", "doy_cos"
        ]

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, ``self`` is returned."""
        return self

    @staticmethod
    def _to_datetime_series(X):
        """Extract the date column from X as a positionally indexed datetime Series."""
        if isinstance(X, pd.DataFrame):
            raw = X.iloc[:, 0]
        elif isinstance(X, pd.Series):
            raw = X
        else:
            arr = np.atleast_1d(np.asarray(X))
            # 1-D input is already the date column; otherwise take column 0.
            raw = pd.Series(arr if arr.ndim == 1 else arr[:, 0])

        parsed = pd.to_datetime(raw, errors="coerce")
        # Only row position matters from here on, so drop any caller index.
        return pd.Series(parsed).reset_index(drop=True)

    def transform(self, X):
        """
        Build the calendar feature matrix for X.

        Parameters
        ----------
        X : DataFrame, Series, ndarray or list
            Date values (strings or datetimes); see the class docstring for
            how each input shape is interpreted.

        Returns
        -------
        numpy.ndarray
            Float array of shape (n_rows, 9) with columns in the order given
            by ``get_feature_names_out``.
        """
        ser = self._to_datetime_series(X)

        if ser.isna().all():
            # Nothing parseable: fall back to a constant date for every row.
            ser = pd.to_datetime(pd.Series([self._FALLBACK_DATE] * len(ser)))
        else:
            # Borrow the nearest parsed date (previous row first, then next).
            ser = ser.ffill().bfill()

        year = ser.dt.year.astype(float)
        month = ser.dt.month.astype(float)
        day = ser.dt.day.astype(float)
        dayofweek = ser.dt.dayofweek.astype(float)
        dayofyear = ser.dt.dayofyear.astype(float)

        # Cyclic encodings so that December/January and day 365/day 1 end up
        # close to each other instead of at opposite ends of a linear scale.
        month_angle = 2 * np.pi * (month / 12.0)
        doy_angle = 2 * np.pi * (dayofyear / 365.25)

        return np.column_stack([
            year, month, day, dayofweek, dayofyear,
            np.sin(month_angle), np.cos(month_angle),
            np.sin(doy_angle), np.cos(doy_angle),
        ])

    def get_feature_names_out(self, input_features=None):
        """Return the output column names; ``input_features`` is ignored."""
        return np.array(self.feature_names_)
 
# === Load data ===
df = pd.read_csv(DATA_PATH)

# Schema validation: fail early, in a fixed order (date column, target,
# numeric features), with a message listing what the file actually contains.
column_list = df.columns.tolist()
if DATE_COL not in column_list:
    raise ValueError(f"Column '{DATE_COL}' not found in dataset columns: {column_list}")
if TARGET not in column_list:
    raise ValueError(f"Target '{TARGET}' not found in dataset columns: {column_list}")

missing_feats = [name for name in NUMERIC_COLS if name not in column_list]
if missing_feats:
    raise ValueError(f"Missing expected numeric feature columns: {missing_feats}")

# --- Clean types ---
# The target must be numeric; rows where it cannot be parsed are unusable
# for training and are removed.
df[TARGET] = pd.to_numeric(df[TARGET], errors="coerce")
before = len(df)
df = df.loc[df[TARGET].notna()].copy()
print(f"[Info] Dropped {before - len(df)} rows with NaN in target '{TARGET}'. Remaining: {len(df)}")

# Feature columns: unparseable entries become NaN here and are left for the
# pipeline's imputer to fill in.
df[NUMERIC_COLS] = df[NUMERIC_COLS].apply(pd.to_numeric, errors="coerce")

# === Build X, y ===
y = df[TARGET].copy()
X = df.loc[:, [DATE_COL, *NUMERIC_COLS]].copy()
 
# === Build pipelines ===
def _impute_then_scale():
    """Return fresh steps: median imputation followed by standardisation."""
    return [
        ("impute", SimpleImputer(strategy="median")),
        ("scale", StandardScaler()),
    ]


# Date branch: engineered calendar features, then the shared impute/scale tail.
date_pipeline = Pipeline([
    ("date_features", DateFeatureExtractor()),
    *_impute_then_scale(),
])

# Numeric branch: just the shared impute/scale tail.
numeric_pipeline = Pipeline(_impute_then_scale())

# Route the date column and the numeric columns to their own branch; any
# other column in the input frame is discarded.
preprocessor = ColumnTransformer(
    [
        ("date", date_pipeline, [DATE_COL]),
        ("num", numeric_pipeline, NUMERIC_COLS),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

linreg_model = Pipeline([
    ("prep", preprocessor),
    ("model", LinearRegression()),
])

# === Fit model ===
linreg_model.fit(X, y)
print("[Info] Model trained on", len(X), "rows.")
 
# === Single-row prediction (edit values as needed) ===
# The date key comes from DATE_COL so this row keeps matching the training
# schema if the configured column name is ever changed.
test_data = {
    DATE_COL: "2015-01-03",      # parsed inside the pipeline
    "humidity_percent": 72.5,
    "pressure_hpa": 1005.1,
    "wind_speed_kmph": 12.3,
    "cloud_cover_percent": 58.0,
    "rainfall_mm": 2.4,
    "sunshine_hours": 6.2,
}
test_df = pd.DataFrame([test_data])

prediction = linreg_model.predict(test_df)
pred_value = float(prediction[0])
print("Predicted Temperature (°C):", round(pred_value, 2))

# Sanity check: a linear model will happily extrapolate. If the prediction
# lies outside every target value seen during training, say so instead of
# presenting it as an ordinary result.
observed_min, observed_max = float(y.min()), float(y.max())
if not observed_min <= pred_value <= observed_max:
    print(
        f"[Warning] Prediction is outside the training target range "
        f"[{observed_min:.2f}, {observed_max:.2f}] °C; "
        "the model is extrapolating and this value may be unreliable."
    )

# Optional quick interpretation
if pred_value < 20:
    print("Interpretation: Rather cool day.")
elif pred_value < 30:
    print("Interpretation: Mild to warm.")
else:
    print("Interpretation: Hot day.")

[Info] Dropped 0 rows with NaN in target 'temperature_c'. Remaining: 10000
[Info] Model trained on 10000 rows.
Predicted Temperature (°C): 61.15
Interpretation: Hot day.
