<a href="https://colab.research.google.com/github/JoshuaTewolde/HangmanGame-Java/blob/main/Fixed_ML_Weather.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setup: mount Google Drive, then import libraries.

In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import csv

import pandas as pd
import glob
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score


In [None]:
# Load every CSV in the Drive folder and stack them into a single DataFrame.
# sorted() pins the load order: glob returns paths in arbitrary, filesystem-dependent
# order, which would otherwise make the concatenated row order vary between runs.
files = sorted(glob.glob("/content/drive/MyDrive/Weather Datasets/*.csv"))
print("This many files: ", len(files))

# Fail early with an actionable message; pd.concat on an empty list only reports
# "No objects to concatenate".
if not files:
    raise FileNotFoundError(
        "No CSV files matched /content/drive/MyDrive/Weather Datasets/*.csv "
        "(is Google Drive mounted?)"
    )

# low_memory=False parses each file in one pass, so a column's dtype is inferred
# from the whole column rather than chunk by chunk.
df_list = [pd.read_csv(f, low_memory=False) for f in files]

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat(df_list, ignore_index=True)


This many files:  7


In [None]:
# Turn every "M" cell into NaN, then discard the rows that have no temperature ("tmpf").
df = df.replace("M", np.nan).dropna(subset=["tmpf"])


In [None]:
# Columns that must be numeric; anything that cannot be parsed becomes NaN (errors="coerce").
numeric_cols = [
    "tmpf", "dwpf", "relh",
    "feel", "drct", "sped", "mslp",
    "lon", "lat",
]
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")


In [None]:
# Expose the raw "valid" column under the name "datetime" and parse it into timestamps.
df = df.rename(columns={"valid": "datetime"})
df["datetime"] = pd.to_datetime(df["datetime"])

# Features derived from the timestamp.
df["DOY"] = df["datetime"].dt.dayofyear  # day of the year
df["TOD"] = df["datetime"].dt.minute + 60 * df["datetime"].dt.hour  # time of day, in minutes since midnight

# Put the observations in chronological order.
df = df.sort_values(by="datetime")


In [None]:
# Predictors: every observation from every station, plus "target_time", the moment
# exactly 24 hours after the observation was taken.
predictors = df.copy()
predictors["target_time"] = predictors["datetime"] + pd.Timedelta(hours=24)

# Target: rows from station FNT only, with the timestamp renamed to "flint_time"
# and the temperature renamed to "target_tmpf".
target = (
    df.loc[df["station"] == "FNT"]
    .copy()
    .rename(columns={"datetime": "flint_time", "tmpf": "target_tmpf"})
)


In [None]:
# Sanity check: frame sizes, then whether the two merge keys contain duplicate timestamps.
print(predictors.shape, target.shape, sep="\n")
print(predictors["target_time"].is_unique, target["flint_time"].is_unique, sep="\n")


(18353896, 15)
(124725, 14)
False
False


In [None]:
# Attach to each predictor row the FNT observation closest in time to its target_time.
# merge_asof requires both frames to be sorted on their join keys, hence the sort_values calls.
df_merged = pd.merge_asof(
    predictors.sort_values('target_time'),
    target.sort_values('flint_time'),
    left_on='target_time',
    right_on='flint_time',
    direction='nearest',            # closest measurement, whether earlier or later
    # Max gap allowed between target_time and flint_time; rows with no FNT reading
    # inside the window get NaN in the target-side columns.
    # Built from keyword args: the 'H' unit string in pd.Timedelta('2H') is deprecated
    # and emits a FutureWarning.
    tolerance=pd.Timedelta(hours=2),
)


  tolerance=pd.Timedelta('2H')   # max distance allowed (example: 2 hours)


In [None]:
# Keep only rows where both the predictor temperature and the matched target temperature exist.
df_merged = df_merged.dropna(subset=["tmpf", "target_tmpf"])

# Drop the two merge keys and every target-side ("_y") column.
# "station_x" is deliberately kept.
df_final = df_merged.drop(
    columns=[
        "target_time", "flint_time",
        "station_y", "lon_y", "lat_y", "elevation_y",
        "dwpf_y", "relh_y", "feel_y",
        "drct_y", "sped_y", "mslp_y",
        "DOY_y", "TOD_y",
    ]
)


In [None]:
# Reshape the long per-observation table into one row per hour with one column
# per (station, variable) pair.
print(df_final.columns.tolist())

# ---- 1. Floor timestamps to hourly buckets ----
# Lowercase "h": the uppercase "H" frequency alias is deprecated and emits a FutureWarning.
df_final["hour"] = df_final["datetime"].dt.floor("h")

# ---- 2. Keep only needed columns ----
df_small = df_final[["station_x", "hour", "tmpf", "dwpf_x"]].copy()

# ---- 3. Pivot so each station becomes columns ----
# Several observations of one station within the same hour are averaged.
df_wide = df_small.pivot_table(
    index="hour",
    columns="station_x",
    values=["tmpf", "dwpf_x"],
    aggfunc="mean"
)

# Flatten the two-level column index: ('tmpf', 'FNT') -> 'FNT_tmpf'
df_wide.columns = [f"{station}_{var}" for var, station in df_wide.columns]
df_wide = df_wide.reset_index()

['station_x', 'datetime', 'lon_x', 'lat_x', 'elevation_x', 'tmpf', 'dwpf_x', 'relh_x', 'feel_x', 'drct_x', 'sped_x', 'mslp_x', 'DOY_x', 'TOD_x', 'target_tmpf']


  df_final["hour"] = df_final["datetime"].dt.floor("H")


In [None]:
# Inspect the pivoted table: column names, dimensions, and the first few rows.
print(df_wide.columns.tolist(), df_wide.shape, df_wide.head(), sep="\n")


['hour', '04W_dwpf_x', '14Y_dwpf_x', '2WX_dwpf_x', '7L2_dwpf_x', '8D3_dwpf_x', 'ABR_dwpf_x', 'AEL_dwpf_x', 'AIT_dwpf_x', 'ALO_dwpf_x', 'ANW_dwpf_x', 'ATY_dwpf_x', 'AXN_dwpf_x', 'AZO_dwpf_x', 'BAC_dwpf_x', 'BCK_dwpf_x', 'BFF_dwpf_x', 'BIS_dwpf_x', 'BKX_dwpf_x', 'BUU_dwpf_x', 'C29_dwpf_x', 'CID_dwpf_x', 'CKC_dwpf_x', 'CMX_dwpf_x', 'CNC_dwpf_x', 'CUT_dwpf_x', 'CVX_dwpf_x', 'D07_dwpf_x', 'DBQ_dwpf_x', 'DEH_dwpf_x', 'DNS_dwpf_x', 'DVN_dwpf_x', 'DXX_dwpf_x', 'EOK_dwpf_x', 'EST_dwpf_x', 'EZS_dwpf_x', 'FAR_dwpf_x', 'FFL_dwpf_x', 'FFM_dwpf_x', 'FLD_dwpf_x', 'FNT_dwpf_x', 'FSD_dwpf_x', 'FSE_dwpf_x', 'GRN_dwpf_x', 'GRR_dwpf_x', 'HEI_dwpf_x', 'HON_dwpf_x', 'IEN_dwpf_x', 'IKV_dwpf_x', 'ISQ_dwpf_x', 'IWD_dwpf_x', 'LBF_dwpf_x', 'LCG_dwpf_x', 'LNL_dwpf_x', 'LUM_dwpf_x', 'MBG_dwpf_x', 'MBL_dwpf_x', 'MCW_dwpf_x', 'MFI_dwpf_x', 'MKG_dwpf_x', 'MTW_dwpf_x', 'OEO_dwpf_x', 'OTG_dwpf_x', 'PBH_dwpf_x', 'PCZ_dwpf_x', 'PDC_dwpf_x', 'PNM_dwpf_x', 'RCA_dwpf_x', 'RDK_dwpf_x', 'RDR_dwpf_x', 'ROX_dwpf_x', 'RYV_dwpf_x

In [None]:
# Random forest regressor used for the forecast below.
# random_state pins bootstrap sampling and split selection so that re-running the
# notebook reproduces the same model and metrics; n_jobs=-1 uses every available core.
# NOTE(review): n_estimators=2 is a very small ensemble - presumably chosen to keep
# training fast; confirm before relying on the reported scores.
forest = RandomForestRegressor(n_estimators=2, n_jobs=-1, random_state=42)


In [None]:
# Add the day of the year of each hourly bucket as a feature column.
df_wide["DOY"] = df_wide["hour"].dt.day_of_year


In [None]:
# Build the modelling table: hourly station features from df_wide paired with the
# FNT (Flint) temperature observed 24 hours later, then split it by year.

# Hourly mean temperature at FNT, keyed by the hour in which it was observed.
# Lowercase "h": the uppercase "H" frequency alias is deprecated and emits a FutureWarning.
flint = (
    df[df["station"] == "FNT"]
    .assign(hour=lambda x: x["datetime"].dt.floor("h"))  # assign() already returns a new frame
    .groupby("hour")["tmpf"]
    .mean()
    .rename("target_tmpf")
    .reset_index()
)

# Re-key each FNT reading to the hour 24 h *earlier*, so that joining on "hour"
# pairs predictors observed at time t with the temperature at t + 24 h.
flint["hour"] = flint["hour"] - pd.Timedelta(hours=24)

# The inner join keeps only hours that have both predictor features and a target value.
df_final = pd.merge(
    df_wide,
    flint,
    on="hour",
    how="inner"
)
df_final["year"] = df_final["hour"].dt.year

print(df_final.shape)
print(df_final.head())

# Hold out whole years: 2018 and 2021 form the test set, every other year trains.
# One shared mask and one shared column list keep the two halves of the split
# from drifting apart.
is_test_year = df_final["year"].isin([2018, 2021])
non_feature_cols = ["target_tmpf", "year", "hour"]  # label + bookkeeping columns

df_test = df_final[is_test_year]
X_test = df_test.drop(columns=non_feature_cols)
y_test = df_test["target_tmpf"]

df_train = df_final[~is_test_year]
X_train = df_train.drop(columns=non_feature_cols)
y_train = df_train["target_tmpf"]



  .assign(hour=lambda x: x["datetime"].dt.floor("H"))


(95915, 166)
                 hour  04W_dwpf_x  14Y_dwpf_x  2WX_dwpf_x  7L2_dwpf_x  \
0 2013-01-01 00:00:00        -7.0       -20.2        10.0         8.6   
1 2013-01-01 01:00:00       -11.2       -22.6         8.1        10.4   
2 2013-01-01 02:00:00       -14.2       -22.6         8.0        10.4   
3 2013-01-01 03:00:00       -16.0       -23.8         9.0        11.0   
4 2013-01-01 04:00:00       -16.6       -23.8         9.0        14.0   

   8D3_dwpf_x  ABR_dwpf_x  AEL_dwpf_x  AIT_dwpf_x  ALO_dwpf_x  ...  SUE_tmpf  \
0        -5.0         5.0        -4.6       -19.6        -4.0  ...      15.2   
1        -1.0         6.9        -8.2       -20.2        -5.0  ...      14.0   
2         0.0         8.0        -7.6       -20.8        -7.0  ...      12.2   
3         1.0         9.0        -8.2       -23.2        -7.0  ...      12.2   
4         2.0        11.0        -9.4       -23.8        -8.0  ...      12.2   

   SUW_tmpf   SYN_tmpf  TWM_tmpf  XVG_tmpf  Y51_tmpf  YKN_tmpf  DOY

In [None]:
# Train on the training years, then forecast the held-out years.
# fit() returns the fitted estimator itself, so predict() can be chained onto it.
y_pred = forest.fit(X_train, y_train).predict(X_test)



In [None]:
# Score the held-out forecasts: coefficient of determination and root-mean-squared error.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("RMSE:", rmse)
print("R²:", r2)


RMSE: 7.299985878050665
R²: 0.8697834806714485


In [None]:
# Share of test forecasts whose absolute error falls within each tolerance.
absolute_error = np.abs(y_test - y_pred)

# One loop replaces four copy-pasted stanzas that all reused the names
# "within_3_degrees" / "percentage_within_3" regardless of the actual threshold.
# The printed lines are unchanged.
for threshold in (5, 3, 2, 1):
    within_threshold = np.sum(absolute_error <= threshold)  # number of forecasts inside the tolerance
    percentage_within = (within_threshold / len(y_test)) * 100
    print(f"Percentage of forecasts accurate within {threshold} degrees: {percentage_within:.2f}%")


Percentage of forecasts accurate within 5 degrees: 56.05%
Percentage of forecasts accurate within 3 degrees: 36.92%
Percentage of forecasts accurate within 2 degrees: 25.98%
Percentage of forecasts accurate within 1 degrees: 14.48%


In [None]:
# List every feature with its importance in the fitted forest, most important first.
importances = forest.feature_importances_
ranked_features = sorted(
    zip(X_train.columns, importances),
    key=lambda pair: pair[1],
    reverse=True,  # a reversed sort is still stable, so tied features keep their column order
)
for name, val in ranked_features:
    print(name, val)


MCW_tmpf 0.6458909839996174
CKC_tmpf 0.10690192008232421
MFI_tmpf 0.08480518319557451
CID_tmpf 0.021409159592295623
SAW_tmpf 0.013332852207036281
FLD_tmpf 0.007992213782838197
FNT_tmpf 0.005890300129987317
CVX_tmpf 0.005191381793970227
FSE_tmpf 0.0042630728172147565
IWD_tmpf 0.004117980921636704
DBQ_dwpf_x 0.0025895714780401675
DOY 0.002303167402079384
DVN_tmpf 0.002291690045870631
LNL_tmpf 0.002125401935424238
ISQ_tmpf 0.0017390590192962684
EOK_tmpf 0.0016222981468153046
PBH_tmpf 0.0015676851703212607
FSE_dwpf_x 0.0015377957309031004
PDC_dwpf_x 0.0014371935379463268
DNS_dwpf_x 0.0014269297334653477
CMX_tmpf 0.0013200014668433488
MKG_tmpf 0.0012601024819295572
CNC_tmpf 0.0012561330479245105
BUU_tmpf 0.0012509287936573103
GRR_tmpf 0.0012464161463259435
MBL_tmpf 0.0011420396623994047
CKC_dwpf_x 0.0010740310906392264
RDR_tmpf 0.0010101142649056119
BKX_tmpf 0.0010066009619764957
7L2_tmpf 0.0009593640688101339
ROX_dwpf_x 0.0009570953794967255
RYV_tmpf 0.0009556522436706574
BAC_dwpf_x 0.0009