<a href="https://colab.research.google.com/github/JoshuaTewolde/Weather-Forecasting/blob/main/ML_Weather.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Imports:

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import csv

import pandas as pd
import glob
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score


In [None]:
# Load all CSVs
files = glob.glob("/content/drive/MyDrive/Weather Datasets/*.csv")
print("This many files: ", len(files))

df_list = []
for f in files:
    df_list.append(pd.read_csv(f, low_memory=False))

df = pd.concat(df_list, ignore_index=True)


This many files:  7


In [None]:
#replace M with NaN
df = df.replace("M", np.nan)

# Drop rows where temperature is missing
df = df.dropna(subset=["tmpf"])


In [None]:
#convert to numeric
numeric_cols = ["tmpf", "dwpf", "relh",
                "feel", "drct", "sped", "mslp",
                "lon", "lat"]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")


In [None]:
#convert date/time
df["valid"] = pd.to_datetime(df["valid"])
df = df.rename(columns={"valid": "datetime",})
df["DOY"] = df["datetime"].dt.dayofyear #day of the year
df["TOD"] = df["datetime"].dt.hour*60 + df["datetime"].dt.minute #time of day
# df = df.drop(columns=["datetime"])
# df = df.drop(columns=["valid"])

#sort by date
df = df.sort_values("datetime")


In [None]:
target = df[df['station'] == 'FNT'].copy()
predictors = df.copy()

#target_time is new column with the time is 24 hrs in the future
predictors['target_time'] = predictors['datetime'] + pd.Timedelta(hours=24)

#rename the timestamp column in the target to "flint_time"
target = target.rename(columns={'datetime': 'flint_time', "tmpf": "target_tmpf",})


In [None]:
print(predictors.shape)
print(target.shape)
print(predictors["target_time"].is_unique)
print(target["flint_time"].is_unique)


(18353896, 15)
(124725, 14)
False
False


In [None]:
df_merged = pd.merge_asof(
    predictors.sort_values('target_time'),
    target.sort_values('flint_time'),
    left_on='target_time',
    right_on='flint_time',
    direction='nearest',           # choose the closest future/previous measurement
    tolerance=pd.Timedelta('2H')   # max distance allowed (example: 2 hours)
)


  tolerance=pd.Timedelta('2H')   # max distance allowed (example: 2 hours)


In [None]:
df_merged = df_merged.dropna(subset=["target_tmpf", "tmpf"]) #drop any row without a temperature value
#drop target_time, flint_time as they are no longer needed, drop station as it is an id
df_final = df_merged.drop(columns=["station_x", "station_y", "target_time", "flint_time", "lon_y", "lat_y",
"elevation_y","dwpf_y","relh_y","feel_y","drct_y","sped_y","mslp_y","DOY_y","TOD_y"])


In [None]:
print(df_merged.columns.tolist())
print(df_final.head(3))


['station_x', 'datetime', 'lon_x', 'lat_x', 'elevation_x', 'tmpf', 'dwpf_x', 'relh_x', 'feel_x', 'drct_x', 'sped_x', 'mslp_x', 'DOY_x', 'TOD_x', 'target_time', 'station_y', 'flint_time', 'lon_y', 'lat_y', 'elevation_y', 'target_tmpf', 'dwpf_y', 'relh_y', 'feel_y', 'drct_y', 'sped_y', 'mslp_y', 'DOY_y', 'TOD_y']
    datetime    lon_x    lat_x  elevation_x  tmpf  dwpf_x  relh_x  feel_x  \
0 2013-01-01 -95.2624  41.0106        317.0  19.4    10.4   67.51   11.43   
1 2013-01-01 -93.5695  41.6878        270.7   8.6     1.4   71.92   -1.39   
2 2013-01-01 -95.3800  41.9842        381.0   6.8     1.4   78.01   -2.01   

   drct_x  sped_x  mslp_x  DOY_x  TOD_x  target_tmpf  
0    20.0    5.75     NaN      1      0         18.0  
1   340.0    5.75     NaN      1      0         18.0  
2    20.0    4.60     NaN      1      0         18.0  


In [None]:
forest = RandomForestRegressor(n_estimators=50, n_jobs=-1)


In [None]:
# Extract the year
df_final["year"] = df_final["datetime"].dt.year
df_final = df_final.drop(columns=["datetime"])

# Keep only the most relevant columns
columns_to_keep = ['tmpf', 'DOY_x', 'TOD_x', 'target_tmpf', 'year', 'dwpf_x', "feel_x", "lat_x", "lon_x", "elevation_x"]
df_final = df_final[columns_to_keep].copy()

# Optional: check the result
print(df_final.head())
print(df_final.columns.tolist())

# Test set = only 2018 and 2021
df_test = df_final[df_final["year"].isin([2018, 2021])]
X_test = df_test.drop(columns=["target_tmpf", "year"])
y_test = df_test["target_tmpf"]

# Train set = every other year
df_train = df_final[~df_final["year"].isin([2018, 2021])]
X_train = df_train.drop(columns=["target_tmpf", "year"])
y_train = df_train["target_tmpf"]


   tmpf  DOY_x  TOD_x  target_tmpf  year  dwpf_x  feel_x    lat_x    lon_x  \
0  19.4      1      0         18.0  2013    10.4   11.43  41.0106 -95.2624   
1   8.6      1      0         18.0  2013     1.4   -1.39  41.6878 -93.5695   
2   6.8      1      0         18.0  2013     1.4   -2.01  41.9842 -95.3800   
3  17.6      1      0         18.0  2013    10.4    3.67  41.0521 -91.9834   
4  21.2      1      0         18.0  2013    12.2   10.81  40.4615 -91.4274   

   elevation_x  
0        317.0  
1        270.7  
2        381.0  
3        242.3  
4        204.5  
['tmpf', 'DOY_x', 'TOD_x', 'target_tmpf', 'year', 'dwpf_x', 'feel_x', 'lat_x', 'lon_x', 'elevation_x']


In [None]:
forest.fit(df_train.drop(columns=["target_tmpf"]), df_train["target_tmpf"])
y_pred = forest.predict(df_test.drop(columns=["target_tmpf"]))


In [None]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# accuracy = accuracy_score(y_test, y_pred)

print("RMSE:", rmse)
print("R²:", r2)


RMSE: 9.153540023223696
R²: 0.7972166661106797


In [None]:
absolute_error = np.abs(y_test - y_pred)
within_3_degrees = np.sum(absolute_error <= 5)
percentage_within_3 = (within_3_degrees / len(y_test)) * 100
print(f"Percentage of forecasts accurate within 5 degrees: {percentage_within_3:.2f}%")

within_3_degrees = np.sum(absolute_error <= 3)
percentage_within_3 = (within_3_degrees / len(y_test)) * 100
print(f"Percentage of forecasts accurate within 3 degrees: {percentage_within_3:.2f}%")

within_3_degrees = np.sum(absolute_error <= 2)
percentage_within_3 = (within_3_degrees / len(y_test)) * 100
print(f"Percentage of forecasts accurate within 2 degrees: {percentage_within_3:.2f}%")

within_3_degrees = np.sum(absolute_error <= 1)
percentage_within_3 = (within_3_degrees / len(y_test)) * 100
print(f"Percentage of forecasts accurate within 1 degrees: {percentage_within_3:.2f}%")


Percentage of forecasts accurate within 5 degrees: 44.24%
Percentage of forecasts accurate within 3 degrees: 27.80%
Percentage of forecasts accurate within 2 degrees: 18.91%
Percentage of forecasts accurate within 1 degrees: 9.66%


In [None]:
#get feature importances
importances = forest.feature_importances_
for name, val in sorted(zip(df_final.columns, importances), key=lambda x: -x[1]):
    print(name, val)


tmpf 0.7657354441379107
DOY_x 0.09722010859881997
year 0.03562751752874505
target_tmpf 0.033068688390471154
TOD_x 0.025927439394513828
dwpf_x 0.018838657655948284
feel_x 0.014449676244373415
lat_x 0.006381233020408396
lon_x 0.0027512350288093007
