In [4]:
import os
import pandas as pd
import numpy as np
import kagglehub
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# === Step 1: fetch the Kaggle dataset ===
# The value returned by kagglehub is used below as a local directory
# (it is listed with os.listdir and joined with the CSV file name).
path = kagglehub.dataset_download("muthuj7/weather-dataset")
dataset_files = os.listdir(path)

print("Dataset directory:", path)
print("Files:", dataset_files)

# === Step 2: read the weather CSV into a DataFrame ===
csv_file = os.path.join(path, "weatherHistory.csv")
df = pd.read_csv(csv_file)

# Plain-text preview of the first five rows (5 is head()'s default).
print(df.head(5))


Dataset directory: C:\Users\draku\.cache\kagglehub\datasets\muthuj7\weather-dataset\versions\1
Files: ['weatherHistory.csv']
                  Formatted Date        Summary Precip Type  Temperature (C)  \
0  2006-04-01 00:00:00.000 +0200  Partly Cloudy        rain         9.472222   
1  2006-04-01 01:00:00.000 +0200  Partly Cloudy        rain         9.355556   
2  2006-04-01 02:00:00.000 +0200  Mostly Cloudy        rain         9.377778   
3  2006-04-01 03:00:00.000 +0200  Partly Cloudy        rain         8.288889   
4  2006-04-01 04:00:00.000 +0200  Mostly Cloudy        rain         8.755556   

   Apparent Temperature (C)  Humidity  Wind Speed (km/h)  \
0                  7.388889      0.89            14.1197   
1                  7.227778      0.86            14.2646   
2                  9.377778      0.89             3.9284   
3                  5.944444      0.83            14.1036   
4                  6.977778      0.83            11.0446   

   Wind Bearing (degrees)  Visibi

In [5]:


print("\nDataset shape:", df.shape)

# ------------------------------
# 3-4. Parse datetime, derive calendar features, drop unused columns
# ------------------------------
# The guard makes this cell safe to re-run: once "Formatted Date" has been
# dropped, parsing it again would raise a KeyError. On a re-run the step is
# skipped and `df` is left in its already-prepared state.
if "Formatted Date" in df.columns:
    # utc=True normalises the mixed UTC offsets in the raw strings, so the
    # derived year/month/day/hour values are expressed in UTC.
    timestamps = pd.to_datetime(df["Formatted Date"], utc=True)

    df = (
        df.assign(
            year=timestamps.dt.year,
            month=timestamps.dt.month,
            day=timestamps.dt.day,
            hour=timestamps.dt.hour,
        )
        .drop(
            columns=[
                "Formatted Date",            # replaced by the derived features
                "Apparent Temperature (C)",  # leakage
                "Loud Cover",                # constant column
            ],
            errors="ignore",  # tolerate columns that are already absent
        )
    )

# ------------------------------
# 5. Define target (y) and features (X)
# ------------------------------
y = df["Temperature (C)"]
X = df.drop(columns=["Temperature (C)"])

# ------------------------------
# 6. Feature groups
# ------------------------------
categorical_features = [
    "Summary",
    "Precip Type"
]

numerical_features = [
    "Humidity",
    "Wind Speed (km/h)",
    "Wind Bearing (degrees)",
    "Visibility (km)",
    "year",
    "month",
    "day",
    "hour"
]

# ------------------------------
# 7. Preprocessing
# ------------------------------
# Only the columns listed above reach the model: ColumnTransformer's default
# remainder="drop" discards every other column still present in X.
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": categories unseen during fit are encoded
        # as all-zero rows at predict time instead of raising.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numerical_features)
    ]
)

# ------------------------------
# 8. Model
# ------------------------------
model = RandomForestRegressor(
    n_estimators=150,
    random_state=42,  # fixed seed for reproducible forests
    n_jobs=-1         # use all available cores
)

# ------------------------------
# 9. Pipeline
# ------------------------------
# Bundling preprocessing with the model means the saved artifact accepts the
# same raw feature columns as X.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

# ------------------------------
# 10. Train-test split
# ------------------------------
# NOTE(review): this is a shuffled split of hourly observations, so
# neighbouring hours can land on both sides of the split. The reported
# metrics are presumably optimistic compared with a chronological hold-out;
# confirm whether that matters for the intended use.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

# ------------------------------
# 11. Train model
# ------------------------------
print("\nTraining model...")
pipeline.fit(X_train, y_train)

# ------------------------------
# 12. Evaluate
# ------------------------------
y_pred = pipeline.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("\nModel Evaluation")
print("----------------")
# \u00b0 is the degree sign; written as an escape so the unit cannot be
# corrupted again by a file re-encoding.
print(f"MAE  : {mae:.2f} \u00b0C")
print(f"RMSE : {rmse:.2f} \u00b0C")

# ------------------------------
# 13. Save model
# ------------------------------
# Persists the whole fitted pipeline (preprocessor + forest) in the current
# working directory.
joblib.dump(pipeline, "temperature_model.pkl")
print("\nModel saved as: temperature_model.pkl")



Dataset shape: (96453, 12)

Training model...

Model Evaluation
----------------
MAE  : 1.35 °C
RMSE : 1.84 °C

Model saved as: temperature_model.pkl
