In [None]:
import pandas as pd

# Read the raw observations; the DATE column becomes the row index.
weather = pd.read_csv("weather.csv", index_col="DATE")

In [None]:
# Display the freshly loaded frame for a first look.
weather

In [None]:
# Fraction of missing entries per column: missing count over total row count.
null_pct = weather.isna().sum() / len(weather)
null_pct

In [None]:
# Keep only the columns where fewer than 5% of the values are missing.
valid_columns = weather.columns[null_pct.lt(0.05)]

In [None]:
# Show which columns passed the missing-value threshold.
valid_columns

In [None]:
# Restrict the frame to the retained columns; copy so later column
# assignments operate on an independent frame rather than a slice.
weather = weather.loc[:, valid_columns].copy()

In [None]:
# Normalize column names to lowercase; every later cell refers to them that way.
weather.columns = weather.columns.str.lower()

In [None]:
# Check the frame after column filtering and renaming.
weather

In [None]:
# Forward-fill gaps: each missing value takes the most recent earlier value in
# its column. Missing values at the very start of a column stay missing.
weather = weather.ffill()

In [None]:
# Count the missing values left in each column after forward-filling.
weather.isna().sum()

In [None]:
# Count, per column, how many entries equal 9999
# (presumably a missing-data sentinel in this dataset — TODO confirm).
weather.eq(9999).sum()

In [None]:
# Inspect the dtype of each column.
weather.dtypes

In [None]:
# Inspect the index as loaded from the CSV, before it is converted to datetimes.
weather.index

In [None]:
# Convert the index to datetimes so .year / .month / .day_of_year and
# date-string slicing work in later cells.
weather.index = pd.to_datetime(weather.index)

In [None]:
# Number of rows per calendar year, ordered by year (e.g. to spot years with gaps).
weather.index.year.value_counts().sort_index()

In [None]:
# Plot the "snwd" column against the date index
# (presumably snow depth — confirm against the dataset documentation).
weather["snwd"].plot()

In [None]:
# Prediction target: the following row's "tmax". Shift only that column up by
# one row rather than shifting the whole frame and then selecting a column.
# The final row has no following row, so its target is NaN at this point.
weather["target"] = weather["tmax"].shift(-1)

In [None]:
# Check the new "target" column against "tmax" (the last row's target is NaN here).
weather

In [None]:
# Forward-fill again so the NaN target on the final row is populated.
# NOTE(review): the filled value is the previous row's target, i.e. the final
# row's own "tmax", not a real next-day observation — consider dropping that
# row instead if it should not count towards the evaluation.
weather = weather.ffill()

In [None]:
# Confirm that the final row's target is now filled.
weather

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression model with regularization strength alpha = 0.1.
rr = Ridge(alpha=.1)

In [None]:
# Every column is a predictor except the label and the two identifier columns.
excluded_columns = ["target", "name", "station"]
predictors = weather.columns[~weather.columns.isin(excluded_columns)]

In [None]:
def backtest(weather, model, predictors, start=3650, step=90):
    """Evaluate ``model`` with an expanding-window backtest.

    For each fold the model is refit on all rows before position ``i`` and
    used to predict the next ``step`` rows; ``i`` starts at ``start`` and
    advances by ``step`` until the end of ``weather``.

    Parameters
    ----------
    weather : pandas.DataFrame
        Must contain the ``predictors`` columns and a ``"target"`` column.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refit in
        place on every fold, so afterwards it holds the last fold's fit.
    predictors : sequence of column labels
        Columns passed to the model as features.
    start : int, default 3650
        Number of leading rows used for training in the first fold.
    step : int, default 90
        Number of rows predicted per fold.

    Returns
    -------
    pandas.DataFrame
        Indexed like the predicted rows, with columns ``"actual"``,
        ``"prediction"`` and ``"diff"`` (absolute error).

    Raises
    ------
    ValueError
        If ``step`` is not positive, or ``weather`` has no rows at or after
        position ``start`` (no test window can be formed).
    """
    n_rows = weather.shape[0]
    if step <= 0:
        raise ValueError(f"step must be a positive number of rows, got {step}")
    if start >= n_rows:
        # Without this guard the loop never runs and pd.concat([]) fails with
        # an unrelated-looking "No objects to concatenate" error.
        raise ValueError(
            f"backtest needs more than start={start} rows to form a test "
            f"window, but weather has only {n_rows}"
        )

    all_predictions = []

    for i in range(start, n_rows, step):
        # Train on everything strictly before the fold, test on the next `step` rows.
        train = weather.iloc[:i, :]
        test = weather.iloc[i:(i + step), :]

        model.fit(train[predictors], train["target"])

        # Re-attach the test index so predictions align with the actual values.
        preds = pd.Series(model.predict(test[predictors]), index=test.index)
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        combined["diff"] = (combined["prediction"] - combined["actual"]).abs()

        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [None]:
# Baseline backtest: ridge model on the raw predictors, default start/step.
predictions = backtest(weather, rr, predictors)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Mean absolute error between actual and predicted targets over all backtest folds.
mean_absolute_error(predictions["actual"], predictions["prediction"])

In [None]:
# Rank the predictions by absolute error, largest misses first.
predictions.sort_values("diff", ascending=False)

In [None]:
# Pair each fitted coefficient with its predictor name. backtest refits rr on
# every fold, so these coefficients come from the last fit it performed.
pd.Series(rr.coef_, index=predictors)

In [None]:
def pct_diff(old, new):
    """Return the change from ``old`` to ``new`` as a fraction of ``old``.

    Computed as ``(new - old) / old`` with whatever arithmetic the operands
    support (plain numbers or pandas objects, element-wise). The divisor is
    ``old`` itself, not its absolute value, so a negative ``old`` flips the
    sign of the result.
    """
    change = new - old
    return change / old

def compute_rolling(weather, horizon, col):
    """Add rolling-mean features for ``col`` to ``weather`` and return it.

    Two columns are written onto ``weather`` itself (the frame is modified in
    place and the same object is returned):

    * ``rolling_{horizon}_{col}`` — mean of ``col`` over the trailing
      ``horizon`` rows (NaN until the window is full);
    * ``rolling_{horizon}_{col}_pct`` — ``pct_diff`` of the current value
      relative to that rolling mean.
    """
    label = f"rolling_{horizon}_{col}"
    current = weather[col]
    trailing_mean = current.rolling(horizon).mean()
    weather[label] = trailing_mean
    weather[f"{label}_pct"] = pct_diff(trailing_mean, current)
    return weather
    
# Build rolling-mean and percent-difference features for every combination
# of window length and measurement column.
rolling_horizons = [3, 14]
rolling_columns = ["tmax", "tmin", "prcp"]
for horizon in rolling_horizons:
    for col in rolling_columns:
        weather = compute_rolling(weather, horizon, col)

In [None]:
def expand_mean(df):
    """Return the running mean of ``df``.

    Each position holds the mean of all values from the start up to and
    including that position; a single observation is enough to produce a value.
    """
    running = df.expanding(min_periods=1)
    return running.mean()

# Seasonal baselines: for each measurement, the running mean of all values seen
# so far in the same calendar month and on the same day of the year. Because
# the mean is expanding, each row only uses rows up to and including itself.
month_keys = weather.index.month
day_keys = weather.index.day_of_year
for col in ["tmax", "tmin", "prcp"]:
    by_month = weather[col].groupby(month_keys, group_keys=False)
    weather[f"month_avg_{col}"] = by_month.apply(expand_mean)
    by_day = weather[col].groupby(day_keys, group_keys=False)
    weather[f"day_avg_{col}"] = by_day.apply(expand_mean)

In [None]:
# Drop the warm-up rows at the start, where the longest rolling window is not
# yet full. The count is derived from rolling_horizons so it stays correct if
# the horizons change (it equals the previous literal 14 for [3, 14]).
# NOTE: this cell is not idempotent — re-running it trims more rows each time.
weather = weather.iloc[max(rolling_horizons):, :]
# Replace the remaining NaNs (e.g. 0/0 in the *_pct columns) with 0.
# fillna leaves +/-inf untouched, so a non-zero value divided by a zero
# rolling mean would still reach the model as inf.
weather = weather.fillna(0)

In [None]:
# Recompute the predictor list so it picks up the newly engineered columns;
# the label and the two identifier columns stay excluded.
excluded_columns = ["target", "name", "station"]
predictors = weather.columns[~weather.columns.isin(excluded_columns)]

In [None]:
# Re-run the backtest with the engineered features and report the mean
# absolute error for comparison with the baseline run.
predictions = backtest(weather, rr, predictors)
mean_absolute_error(predictions["actual"], predictions["prediction"])

In [None]:
# Mean squared error of the same predictions (weights large misses more heavily).
mean_squared_error(predictions["actual"], predictions["prediction"])

In [None]:
# Largest absolute errors first, to see where the model misses most.
predictions.sort_values("diff", ascending=False)

In [None]:
# Inspect the rows from 1990-03-07 through 1990-03-17 (label slice, both ends
# included) — presumably the days around one of the largest errors; confirm.
weather.loc["1990-03-07": "1990-03-17"]

In [None]:
# Distribution of absolute errors: share of predictions at each error size
# rounded to the nearest whole unit, plotted in increasing order of error.
error_counts = predictions["diff"].round().value_counts().sort_index()
(error_counts / len(predictions)).plot()

In [None]:
# Show the backtest output (actual, prediction and absolute error per row).
predictions