In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

In [7]:
state = 'kuala_lumpur'

# Load the hourly records for one state, parse the timestamps, then derive
# an hourly rain flag and the calendar date of each reading.
df = (
    pd.read_csv(f"dataset/{state}.csv")
    .assign(datetime=lambda frame: pd.to_datetime(frame["datetime"]))
    .assign(
        is_rainy=lambda frame: frame["precipitation_total"].gt(0),
        date=lambda frame: frame["datetime"].dt.date,
    )
)

# A day counts as rainy when at least one of its hours is rainy: the max of
# a boolean column is True as soon as any value is True.
daily_rain = (
    df.groupby(["state", "date"])["is_rainy"]
    .max()
    .reset_index()
    .assign(month=lambda frame: pd.to_datetime(frame["date"]).dt.to_period("M"))
)

# Summing the daily boolean flags gives the number of rainy days per
# state and month.
monthly_rain = (
    daily_rain.groupby(["state", "month"])["is_rainy"]
    .sum()
    .reset_index()
    .rename(columns={"is_rainy": "rainy_days"})
)

In [9]:
# Plain-text dump of the per-day rain flags (one row per state and date);
# pandas elides the middle rows when the frame is long.
print(daily_rain)

            state        date  is_rainy    month
0    Kuala Lumpur  2024-01-01     False  2024-01
1    Kuala Lumpur  2024-01-02      True  2024-01
2    Kuala Lumpur  2024-01-03      True  2024-01
3    Kuala Lumpur  2024-01-04      True  2024-01
4    Kuala Lumpur  2024-01-05     False  2024-01
..            ...         ...       ...      ...
239  Kuala Lumpur  2024-08-27      True  2024-08
240  Kuala Lumpur  2024-08-28     False  2024-08
241  Kuala Lumpur  2024-08-29     False  2024-08
242  Kuala Lumpur  2024-08-30     False  2024-08
243  Kuala Lumpur  2024-08-31      True  2024-08

[244 rows x 4 columns]


In [8]:
# Plain-text dump of the rainy-day count for each state and month.
print(monthly_rain)

          state    month  rainy_days
0  Kuala Lumpur  2024-01          12
1  Kuala Lumpur  2024-02           5
2  Kuala Lumpur  2024-03          17
3  Kuala Lumpur  2024-04          18
4  Kuala Lumpur  2024-05          12
5  Kuala Lumpur  2024-06          17
6  Kuala Lumpur  2024-07          11
7  Kuala Lumpur  2024-08          19


# Prediction Model

In [None]:
# Encode the state name as an integer and expose the calendar month (1-12)
# as a numeric feature.
le = LabelEncoder()
monthly_rain = monthly_rain.assign(
    state_code=le.fit_transform(monthly_rain["state"]),
    month_number=monthly_rain["month"].dt.month,
)

# Feature matrix and regression target.
feature_columns = ["state_code", "month_number"]
X = monthly_rain[feature_columns]
y = monthly_rain["rainy_days"]

# Fit the forest on every available monthly row; the seed keeps it repeatable.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)

# Month following the current one, wrapping December (12) around to January (1).
# NOTE(review): this depends on the date the cell is run, so the printed
# prediction changes from run to run.
next_month = pd.Timestamp.today().month % 12 + 1

state_code = le.transform(["Kuala Lumpur"])[0]

# Single-row frame with the same column names and order used for training.
X_new = pd.DataFrame({"state_code": [state_code], "month_number": [next_month]})
predicted_rainy_days = round(model.predict(X_new)[0])
print(f"Predicted rainy days in Kuala Lumpur next month: {predicted_rainy_days}")

Predicted rainy days in Kuala Lumpur next month: 17


# Check Accuracy

In [10]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the monthly rows for evaluation; the seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a separate estimator for the evaluation. It is deliberately not named
# `model`: rebinding `model` here would replace the forest trained on all rows,
# and the pickling step further down would then save this partially-trained
# evaluation model instead.
eval_model = RandomForestRegressor(n_estimators=100, random_state=42)
eval_model.fit(X_train, y_train)

y_pred = eval_model.predict(X_test)

# NOTE(review): with only a handful of monthly rows the held-out set is tiny,
# so these scores (R² in particular) are very noisy -- treat them as a rough
# sanity check rather than a reliable estimate.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R²: {r2}")


MAE: 6.63
MSE: 47.4538
R²: -0.3181611111111111


In [None]:
import pickle

# Serialize the regressor and the state label encoder to the working
# directory, one pickle file each.
# NOTE(review): `model` is whichever estimator was most recently bound to that
# name -- confirm it is the one intended for deployment before shipping.
artifacts = {
    "rain_model.pkl": model,
    "state_encoder.pkl": le,
}
for filename, artifact in artifacts.items():
    with open(filename, "wb") as out_file:
        pickle.dump(artifact, out_file)