In [4]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# =========================
# Load data
# =========================
# Read the raw CSV and discard every row where either the target
# (RainTomorrow) or the RainToday feature is missing.
df = pd.read_csv("weatherAUS.csv").dropna(subset=["RainTomorrow", "RainToday"])

# Encode both Yes/No columns as 1/0: RainTomorrow is the target,
# RainToday stays in the frame as a feature.
yes_no_to_int = {"No": 0, "Yes": 1}
df = df.assign(
    RainTomorrow=df["RainTomorrow"].map(yes_no_to_int),
    RainToday=df["RainToday"].map(yes_no_to_int),
)

# =========================
# Features & target
# =========================
# Target vector first, then the feature matrix: everything except the
# target itself and the Date column.
y = df["RainTomorrow"]
X = df.drop(columns=["RainTomorrow", "Date"])

# Columns routed to the categorical (one-hot) branch of the preprocessor.
categorical_features = ["Location", "WindGustDir", "WindDir9am", "WindDir3pm"]

# Every remaining column of X is treated as numeric, in original column order.
numeric_features = X.columns[~X.columns.isin(categorical_features)].tolist()

# =========================
# Preprocessing
# =========================
# Numeric branch: fill gaps with the column median, then rescale with
# MinMaxScaler.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", MinMaxScaler()),
    ]
)

# Categorical branch: fill gaps with the most frequent category, then
# one-hot encode. handle_unknown="ignore" keeps transform from failing on
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column list to its branch.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# =========================
# Model
# =========================
# Logistic regression with the iteration cap set explicitly to 1000.
model = LogisticRegression(max_iter=1000)

# Bundle preprocessing and the classifier into one estimator so that both
# are fitted together and persisted as a single object.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", model),
    ]
)

# =========================
# Train / validation split
# =========================
# 80/20 split, stratified on the target so both parts keep the same class
# ratio; the fixed seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit preprocessing and classifier on the training part only.
pipeline.fit(X_train, y_train)

# =========================
# Validation
# =========================
# Score the held-out part with F1 on the positive class (label 1).
val_preds = pipeline.predict(X_val)
val_f1 = f1_score(y_val, val_preds)
print("F1 score:", val_f1)

# =========================
# Save pipeline
# =========================
# joblib.dump does not create missing parent directories. Without this,
# a checkout that has no "models/" folder fails with FileNotFoundError
# only after the whole training run has finished.
Path("models").mkdir(parents=True, exist_ok=True)

# Persist the fitted pipeline (preprocessing + classifier) as one artifact.
joblib.dump(pipeline, "models/rain_model.pkl")

print("✅ Model saved as rain_model.pkl")


F1 score: 0.6120930232558139
✅ Model saved as rain_model.pkl
