In [4]:
import pandas as pd

# Read the processed aggregate table and convert its "date" column to pandas
# datetimes in a single expression, then show the mean-temperature column as a
# quick look at what was loaded.
aggregated = pd.read_csv("data/processed/aggregated.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)
print(aggregated["temperature_2m_mean"])
# Persistence baseline: predict that the maximum temperature on day D+1 is the
# same as on day D, and report the mean absolute error of that guess.
# Assumes each row is one day and rows are in chronological order - TODO confirm.
#
# Series.diff() subtracts the previous row *by position*, so the result does
# not depend on the index labels being 0..n-1 and no Python-level loop is
# needed. The first row has no predecessor and yields NaN; mean() skips NaN,
# so the average is taken over the valid consecutive pairs (n - 1 of them when
# the column has no missing values). With fewer than two rows the result is
# NaN rather than a ZeroDivisionError.
diff = aggregated["temperature_2m_max"].diff().abs().mean()

print(diff)

0       26.083895
1       25.479166
2       25.372917
3       25.439583
4       24.704168
          ...    
2918    27.489584
2919    27.716667
2920    27.745834
2921    27.943750
2922    28.175001
Name: temperature_2m_mean, Length: 2923, dtype: float64
0.7840006844626981


In [11]:
# Now let's try a linear regression model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Predictors taken from the current row, and the name of the column holding
# the value to predict (the following row's maximum temperature).
FEATURE_COLUMNS = [
    "temperature_2m_max",
    "relative_humidity_2m",
    "precipitation",
    "wind_speed_10m",
    "surface_pressure",
]
TARGET_COLUMN = "next_day_temperature_max"

# Build the target only once per load of `aggregated`. This cell rebinds
# `aggregated` to the trimmed frame, so without the guard every re-run would
# recompute shift(-1) on already-trimmed data and silently drop one more
# trailing row each time.
if TARGET_COLUMN not in aggregated.columns:
    # shift(-1) pulls the next row's value up onto the current row; the last
    # row has no successor and becomes NaN.
    aggregated[TARGET_COLUMN] = aggregated["temperature_2m_max"].shift(-1)
    # Drops every row containing a NaN in any column, which includes the
    # final row whose target is undefined.
    aggregated = aggregated.dropna()

X = aggregated[FEATURE_COLUMNS]
y = aggregated[TARGET_COLUMN]

model = LinearRegression()
model.fit(X, y)

# NOTE: the predictions are made on the same rows the model was fitted on, so
# the MAE printed below is an in-sample figure and is optimistic compared with
# an error measured on held-out data.
predictions = model.predict(X)
print(mean_absolute_error(y, predictions))
print(y.head(10))
print(predictions[:10])

0.7239562192031919
0    32.80
1    33.05
2    33.15
3    31.45
4    30.25
5    30.90
6    30.90
7    30.85
8    29.80
9    30.95
Name: next_day_temperature_max, dtype: float64
[32.5595336  32.63621918 32.73288525 32.61721499 31.46698824 30.37175412
 30.77146254 30.72044666 30.65930463 29.85756085]
