In [1]:
import pandas as pd

# Load the CSV and use its DATE column as the row index (the labels are still plain strings here).
weather = pd.read_csv('dataiot.csv', index_col="DATE")

In [2]:
# Preview the frame as loaded (pandas shows only the first and last rows of a long frame).
weather

Unnamed: 0_level_0,PRCP,TMAX,TMIN
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/18/2024 0:35,0.01,27.1,27.1
1/18/2024 0:35,0.01,27.1,27.1
1/18/2024 0:35,0.01,27.1,27.1
1/18/2024 0:35,0.01,27.1,27.1
1/18/2024 0:35,0.01,27.1,27.1
...,...,...,...
1/18/2024 8:13,0.01,27.1,23.0
1/18/2024 8:13,0.01,27.1,23.0
1/18/2024 8:13,0.01,27.1,23.0
1/18/2024 8:13,0.01,27.1,23.0


In [3]:
# Share of missing values in each column (0.0 means the column is fully populated).
null_pct = weather.isnull().sum() / len(weather)
null_pct

PRCP    0.0
TMAX    0.0
TMIN    0.0
dtype: float64

In [4]:
# Normalise the column labels to lower case so later cells can refer to "tmax", "tmin", "prcp".
weather.columns = weather.columns.str.lower()

In [5]:
# Confirm the column labels are now lower case.
weather

Unnamed: 0_level_0,prcp,tmax,tmin
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/18/2024 0:35,0.01,27.1,27.1
1/18/2024 0:35,0.01,27.1,27.1
1/18/2024 0:35,0.01,27.1,27.1
1/18/2024 0:35,0.01,27.1,27.1
1/18/2024 0:35,0.01,27.1,27.1
...,...,...,...
1/18/2024 8:13,0.01,27.1,23.0
1/18/2024 8:13,0.01,27.1,23.0
1/18/2024 8:13,0.01,27.1,23.0
1/18/2024 8:13,0.01,27.1,23.0


In [6]:
# Check that every measurement column was parsed as a numeric dtype.
weather.dtypes

prcp    float64
tmax    float64
tmin    float64
dtype: object

In [7]:
# Inspect the index: it still holds strings (dtype object) and the same timestamp repeats across rows.
weather.index

Index(['1/18/2024 0:35', '1/18/2024 0:35', '1/18/2024 0:35', '1/18/2024 0:35',
       '1/18/2024 0:35', '1/18/2024 0:35', '1/18/2024 0:35', '1/18/2024 0:35',
       '1/18/2024 0:35', '1/18/2024 0:35',
       ...
       '1/18/2024 8:12', '1/18/2024 8:13', '1/18/2024 8:13', '1/18/2024 8:13',
       '1/18/2024 8:13', '1/18/2024 8:13', '1/18/2024 8:13', '1/18/2024 8:13',
       '1/18/2024 8:13', '1/18/2024 8:13'],
      dtype='object', name='DATE', length=5468)

In [8]:
# Convert the string labels into datetimes so that .month / .day_of_year can be used on the index later.
weather.index = pd.to_datetime(weather.index)

In [9]:
# Label for each row = tmax of the row that follows it; the final row has no successor and gets NaN.
weather["target"] = weather["tmax"].shift(-1)

In [10]:
# Inspect the frame with the datetime index and the new "target" column.
weather

Unnamed: 0_level_0,prcp,tmax,tmin,target
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-01-18 00:35:00,0.01,27.1,27.1,27.1
2024-01-18 00:35:00,0.01,27.1,27.1,27.1
2024-01-18 00:35:00,0.01,27.1,27.1,27.1
2024-01-18 00:35:00,0.01,27.1,27.1,27.1
2024-01-18 00:35:00,0.01,27.1,27.1,27.1
...,...,...,...,...
2024-01-18 08:13:00,0.01,27.1,23.0,27.1
2024-01-18 08:13:00,0.01,27.1,23.0,27.1
2024-01-18 08:13:00,0.01,27.1,23.0,27.1
2024-01-18 08:13:00,0.01,27.1,23.0,27.1


In [11]:
# Forward-fill missing values; this covers the final row's target, which shift(-1) left as NaN.
weather = weather.ffill()

In [12]:
# Re-inspect the frame after forward-filling.
weather

Unnamed: 0_level_0,prcp,tmax,tmin,target
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-01-18 00:35:00,0.01,27.1,27.1,27.1
2024-01-18 00:35:00,0.01,27.1,27.1,27.1
2024-01-18 00:35:00,0.01,27.1,27.1,27.1
2024-01-18 00:35:00,0.01,27.1,27.1,27.1
2024-01-18 00:35:00,0.01,27.1,27.1,27.1
...,...,...,...,...
2024-01-18 08:13:00,0.01,27.1,23.0,27.1
2024-01-18 08:13:00,0.01,27.1,23.0,27.1
2024-01-18 08:13:00,0.01,27.1,23.0,27.1
2024-01-18 08:13:00,0.01,27.1,23.0,27.1


In [13]:
from sklearn.linear_model import Ridge

# Ridge regression with a light L2 penalty.
rr = Ridge(alpha=0.1)

In [14]:
# Model inputs: every column except the label and the "name"/"station" columns (when present).
non_feature_cols = ["target", "name", "station"]
predictors = weather.columns[~weather.columns.isin(non_feature_cols)]

In [15]:
def backtest(weather, model, predictors, start=3650, step=90):
    """Walk-forward evaluation of ``model`` over the rows of ``weather``.

    Starting at row position ``start``, the model is refit on every row
    before the current position and then asked to predict the next ``step``
    rows (the final window may be shorter). The position then advances by
    ``step`` until the end of the frame is reached.

    Parameters
    ----------
    weather : pandas.DataFrame
        Must contain the ``predictors`` columns and a ``"target"`` column.
    model : object
        Anything exposing ``fit(X, y)`` and ``predict(X)``. It is refit in
        place for every window, so after the call it holds the last fit.
    predictors : list-like
        Column labels used as model inputs.
    start : int, default 3650
        Row position of the first evaluated window; rows before it are only
        ever used for training.
    step : int, default 90
        Number of rows per evaluation window.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated input row (same index labels), with columns
        ``"actual"``, ``"prediction"`` and ``"diff"`` (absolute error).

    Raises
    ------
    ValueError
        If no window could be evaluated, e.g. ``start`` is not smaller than
        the number of rows.
    """
    all_predictions = []

    for i in range(start, weather.shape[0], step):
        # Train on everything strictly before position i, evaluate on the next window.
        train = weather.iloc[:i, :]
        test = weather.iloc[i:(i + step), :]

        model.fit(train[predictors], train["target"])

        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        combined["diff"] = (combined["prediction"] - combined["actual"]).abs()

        all_predictions.append(combined)

    if not all_predictions:
        # pd.concat([]) would fail with an opaque "No objects to concatenate";
        # explain the actual cause instead (same exception type as before).
        raise ValueError(
            f"backtest evaluated no rows: start={start} must be smaller than "
            f"the number of rows ({weather.shape[0]}) and step={step} must be positive"
        )
    return pd.concat(all_predictions)

In [16]:
# Walk-forward evaluation using the default start/step settings.
predictions = backtest(weather, model=rr, predictors=predictors)

In [17]:
# Largest absolute errors first.
predictions.sort_values(by="diff", ascending=False)

Unnamed: 0_level_0,actual,prediction,diff
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-01-18 06:29:00,27.1,27.1,7.105427e-15
2024-01-18 06:23:00,27.1,27.1,7.105427e-15
2024-01-18 06:21:00,27.1,27.1,7.105427e-15
2024-01-18 06:21:00,27.1,27.1,7.105427e-15
2024-01-18 06:21:00,27.1,27.1,7.105427e-15
...,...,...,...
2024-01-18 07:20:00,27.1,27.1,0.000000e+00
2024-01-18 07:20:00,27.1,27.1,0.000000e+00
2024-01-18 07:20:00,27.1,27.1,0.000000e+00
2024-01-18 07:20:00,27.1,27.1,0.000000e+00


In [18]:
# Pair each fitted coefficient with its feature name; rr holds the last fit performed inside backtest.
pd.Series(rr.coef_, index=predictors)

prcp    0.0
tmax    0.0
tmin    0.0
dtype: float64

In [19]:
def pct_diff(old, new):
    """Return the relative change from ``old`` to ``new``: ``(new - old) / old``."""
    return (new - old) / old

def compute_rolling(weather, horizon, col):
    """Add rolling-mean features for ``col`` to ``weather`` and return it.

    Two columns are written onto ``weather`` itself (the frame is modified
    in place and also returned):

    * ``rolling_{horizon}_{col}`` - mean of ``col`` over the last ``horizon``
      rows, NaN until a full window is available.
    * ``rolling_{horizon}_{col}_pct`` - relative difference between the
      current value and that rolling mean.

    Parameters
    ----------
    weather : pandas.DataFrame
        Frame containing ``col``.
    horizon : int
        Rolling window length in rows.
    col : str
        Name of the column to summarise.

    Returns
    -------
    pandas.DataFrame
        The same ``weather`` object, with the two columns added.
    """
    label = f"rolling_{horizon}_{col}"
    weather[label] = weather[col].rolling(horizon).mean()

    pct = pct_diff(weather[label], weather[col])
    # A rolling mean of exactly 0 makes the ratio +/-inf. fillna() does not
    # remove infinities and scikit-learn rejects them, so mark them as missing.
    infinities = [float("inf"), float("-inf")]
    weather[f"{label}_pct"] = pct.replace(infinities, float("nan"))
    return weather

# Add rolling-mean features for both window lengths to every measurement column.
rolling_horizons = [3, 14]
for horizon in rolling_horizons:
    for col in ("tmax", "tmin", "prcp"):
        weather = compute_rolling(weather, horizon, col)

In [20]:
def expand_mean(df):
    """Return the running mean of ``df``: each row averages everything from the first row up to and including itself."""
    running = df.expanding(min_periods=1)
    return running.mean()

for col in ["tmax", "tmin", "prcp"]:
    # Running mean of the column over the rows seen so far that share the same calendar month.
    weather[f"month_avg_{col}"] = weather[col].groupby(weather.index.month, group_keys=False).apply(expand_mean)
    # Same running mean, grouped by day of the year instead.
    weather[f"day_avg_{col}"] = weather[col].groupby(weather.index.day_of_year, group_keys=False).apply(expand_mean)

In [21]:
# Skip the first 14 rows (14 is the largest rolling window used for the features),
# then replace any remaining NaNs with 0.
weather = weather.iloc[14:, :].fillna(0)

In [22]:
# Use every column except the label as a model input, including the engineered features.
predictors = weather.columns[weather.columns != "target"]

In [23]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Repeat the walk-forward evaluation with the enlarged feature set and score it with mean absolute error.
predictions = backtest(weather, rr, predictors)
mean_absolute_error(predictions["actual"], predictions["prediction"])

3.1903526383906936e-15

In [24]:
# Score the same predictions with mean squared error.
mean_squared_error(predictions["actual"], predictions["prediction"])

2.0150061259747352e-29

In [25]:
# Largest absolute errors first.
predictions.sort_values(by="diff", ascending=False)

Unnamed: 0_level_0,actual,prediction,diff
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-01-18 06:30:00,27.1,27.1,7.105427e-15
2024-01-18 06:24:00,27.1,27.1,7.105427e-15
2024-01-18 06:22:00,27.1,27.1,7.105427e-15
2024-01-18 06:22:00,27.1,27.1,7.105427e-15
2024-01-18 06:22:00,27.1,27.1,7.105427e-15
...,...,...,...
2024-01-18 07:20:00,27.1,27.1,0.000000e+00
2024-01-18 07:20:00,27.1,27.1,0.000000e+00
2024-01-18 07:21:00,27.1,27.1,0.000000e+00
2024-01-18 07:21:00,27.1,27.1,0.000000e+00


In [26]:
import joblib

# Fit a fresh Ridge model on every available row (no hold-out); fit() returns the estimator itself.
rr = Ridge(alpha=0.1).fit(weather[predictors], weather["target"])

# Persist the fitted estimator; joblib.dump returns the list of files it wrote, shown as the cell output.
model_filename = "ridge_model.joblib"
joblib.dump(rr, model_filename)

['ridge_model.joblib']

In [29]:
# Read the persisted estimator back from disk.
# NOTE: joblib.load unpickles arbitrary objects - only load files you created or trust.
model_filename = "ridge_model.joblib"
loaded_model = joblib.load(model_filename)


In [31]:
import joblib
import pickle

# Source (joblib) and destination (pickle) paths for the conversion
model_filename_joblib = "ridge_model.joblib"
model_filename_pkl = "ridge_model.pkl"

# Re-read the estimator that was saved with joblib
loaded_model = joblib.load(model_filename_joblib)

# Write the same estimator out in the standard-library pickle format
with open(model_filename_pkl, mode='wb') as file:
    pickle.dump(loaded_model, file)

print(f"Ridge model saved as {model_filename_pkl}")


Ridge model saved as ridge_model.pkl


In [30]:
# NOTE(review): "target" was built as the tmax of the *next row*, and the index
# holds many readings per minute, so the model forecasts the next reading.
# The "+1 day" label below is kept from the original cell - confirm the intended horizon.

# Get the last timestamp in the dataset and the timestamp used to label the forecast
last_date = weather.index[-1]
new_date = last_date + pd.Timedelta(days=1)

# The most recent row's feature values are the inputs for the forecast
last_values = weather[predictors].iloc[-1]

# Build the one-row feature frame directly from those values, so it is numeric
# from the start (no all-NaN object placeholder) and already has a DatetimeIndex
new_data = pd.DataFrame(
    [last_values.to_numpy()],
    index=pd.DatetimeIndex([new_date]),
    columns=predictors,
)

# Use the Ridge model to predict for the new timestamp
prediction_for_next_day = loaded_model.predict(new_data)

# Collect the prediction (rounded to 2 decimals) together with its timestamp
prediction_df = pd.DataFrame(
    {"Prediction": prediction_for_next_day},
    index=new_data.index,
).round(2)

# Display the prediction
print(prediction_df)


                     Prediction
2024-01-19 08:13:00        27.1
