In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Load CSV file and examine its content

In [None]:
# Location of the temperature readings, relative to the notebook's working directory.
file_path = './data/Data_Temperatures.csv'

# Read the CSV with pandas' default parsing options.
weather_data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows to check which columns were read.
weather_data.head()

In [None]:
# Parse the timestamp column with pd.to_datetime and promote it to the row index
# (in place), so later cells can select rows by date strings.
timestamp_col = 'Measurement Timestamp'
weather_data[timestamp_col] = pd.to_datetime(weather_data[timestamp_col])
weather_data.set_index(timestamp_col, inplace=True)

weather_data.head()

In [None]:
# Reshape from long to wide: the index stays the measurement timestamp, each
# distinct 'Station Name' becomes a column, and the cells hold 'Air Temperature'.
df_pivot = weather_data.pivot(values='Air Temperature', columns='Station Name')

df_pivot.head()

Let's plot the 3 time series

In [None]:
# One line per station over the whole period covered by df_pivot.
fig, ax = plt.subplots(figsize=(15, 8))
for station_name, temperatures in df_pivot.items():
    ax.plot(temperatures.index, temperatures, label=station_name)

ax.set_xlabel('Measurement Timestamp')
ax.set_ylabel('Air Temperature (°C)')
ax.set_title('Air Temperature Over Time for Each Weather Station')
ax.legend()
plt.xticks(rotation=45)
plt.show()

In [None]:
# Two fixed (not random) windows for a closer look:
# a single day and the seven-day span starting on that same day.
one_day = df_pivot.loc['2016-01-01']
one_week = df_pivot.loc['2016-01-01':'2016-01-07']

# (data window, figure size, label appended to the title)
zoom_views = [
    (one_day, (10, 5), 'One Day'),
    (one_week, (15, 8), 'One Week'),
]

# Same plot recipe for both windows: one line per station.
for window, size, span_label in zoom_views:
    plt.figure(figsize=size)
    for station in window.columns:
        plt.plot(window.index, window[station], label=station)
    plt.xlabel('Measurement Timestamp')
    plt.ylabel('Air Temperature (°C)')
    plt.title(f'Air Temperature Over Time for Each Weather Station ({span_label})')
    plt.legend()
    plt.xticks(rotation=45)
    plt.show()

Here we can observe some missing values: there are missing dates in May–June and possibly in other places as well. Let's take a closer look.

In [None]:
# Number of NaN cells per station in the pivoted frame.
missing_values = df_pivot.isna().sum()

# Full hourly grid between the first and last timestamp present in the data.
# 'h' is the current hourly alias; the uppercase 'H' spelling is deprecated in
# recent pandas releases and emits a FutureWarning.
all_dates = pd.date_range(start=df_pivot.index.min(), end=df_pivot.index.max(), freq='h')

# Grid timestamps for which no station has a row at all.
missing_dates = all_dates.difference(df_pivot.index)

missing_values, missing_dates.size

In [None]:
# For every station, collect the timestamps of the full grid (all_dates) at which
# that station has no reading once its series is aligned onto the grid.
missing_dates_by_station = {
    station: all_dates[df_pivot[station].reindex(all_dates).isna()]
    for station in df_pivot.columns
}

for station, missing_dates in missing_dates_by_station.items():
    # 0/1 indicator over the whole grid: 1 where the station has no value.
    missing_dates_df = pd.DataFrame({'Missing': 0}, index=all_dates)
    missing_dates_df.loc[missing_dates, 'Missing'] = 1

    # One figure per station; markers only, so runs of missing hours show up as bands.
    fig, ax = plt.subplots(figsize=(15, 5))
    ax.plot(missing_dates_df.index, missing_dates_df['Missing'],
            marker='|', linestyle='None', color='red')
    ax.set_title(f'Missing Timestamps for {station}')
    ax.set_xlabel('Date')
    ax.set_ylabel('Missing (1 = Missing, 0 = Present)')
    ax.set_yticks([0, 1])
    ax.set_yticklabels(['Present', 'Missing'])
    ax.grid(True)
    plt.show()


Let's use interpolation to deal with the missing data. BUT BE CAREFUL: we want to keep the big gaps as NaN, because interpolating across them would lower the accuracy of our model.

In [None]:
# Longest run of consecutive missing rows (hours on the all_dates grid) that we
# are willing to fill by interpolation; longer runs are left as NaN.
max_gap_size = 24

# Align every station onto the full grid so that missing timestamps become NaN rows.
df_pivot = df_pivot.reindex(all_dates)

# station -> timestamps that belong to a gap longer than max_gap_size
large_gap_indices = {}

for station in df_pivot.columns:
    station_data = df_pivot[station]
    missing_mask = station_data.isna()

    # The run id increases at every observed value, so all NaNs of one gap share
    # the id of the observation that precedes them (leading NaNs share id 0).
    run_ids = (~missing_mask).cumsum()

    # Total number of NaNs in the run each row belongs to. Using the TOTAL length
    # (rather than a running count within the run) matters: a running count only
    # exceeds the threshold after max_gap_size rows, which would leave the first
    # max_gap_size rows of every large gap interpolated.
    gap_lengths = missing_mask.astype(int).groupby(run_ids).transform('sum')

    in_large_gap = missing_mask & (gap_lengths > max_gap_size)
    large_gap_indices[station] = station_data.index[in_large_gap]

    # Interpolate everything with pandas' default settings, then blank the large
    # gaps again so that only the short gaps end up filled.
    df_pivot[station] = station_data.interpolate()
    df_pivot.loc[large_gap_indices[station], station] = np.nan

Go back up and re-run the missing-values graphs -> the amount of missing data is reduced.

OK, but I know these stations' time series are highly correlated; I can quickly test that here if you want.

In [None]:
from itertools import combinations

# Correlation (Series.corr with its default method) for every unordered pair of
# stations, keyed by the (station, station) tuple.
correlations = {}
for pair in combinations(df_pivot.columns, 2):
    first, second = pair
    pair_corr = df_pivot[first].corr(df_pivot[second])
    correlations[pair] = pair_corr
    print(f"Correlation between {first} and {second}: {pair_corr:.4f}")

correlations

Then, for the rest of the missing data, we use the average of the other stations' values.

In [None]:
# Fill each station's remaining NaNs with the mean of the other stations at the
# same timestamp. This is done with one vectorised row-wise mean per station
# instead of a Python loop over every missing cell.
for station in df_pivot.columns:
    # Row-wise mean over the other columns; NaNs are skipped, and a row where
    # every other station is missing yields NaN, so that cell stays unfilled.
    other_stations_mean = df_pivot.drop(columns=station).mean(axis=1)

    # Stations are handled in column order and df_pivot is updated as we go, so a
    # later station's mean already includes values filled for earlier stations.
    df_pivot[station] = df_pivot[station].fillna(other_stations_mean)


Now we can process the data 

In [None]:
station_names = df_pivot.columns

# Short-term lags: the previous 1..7 rows (hours on the hourly grid).
SHORT_LAGS = range(1, 8)
# One extra lag at the same hour on the previous day. A single constant drives
# both the column label and the shift so the two cannot disagree (the label used
# to say 25 while the data was shifted by 23).
# NOTE(review): 24 is assumed to be the intended value — confirm.
DAILY_LAG = 24

# Collect all columns first and build the frame once.
# Column order: per station lag1..lag7 then the daily lag, followed by all targets.
lagged_columns = {}
for station in station_names:
    for lag in [*SHORT_LAGS, DAILY_LAG]:
        lagged_columns[f'{station}_lag{lag}'] = df_pivot[station].shift(lag)

for station in station_names:
    lagged_columns[f'target_{station}'] = df_pivot[station]

# Drop every row in which any feature or target is NaN.
lagged_data = pd.DataFrame(lagged_columns).dropna()

lagged_data.head()


Then let's play with Random Forest now!

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Chronological split: everything before December 2016 is used for training and
# December 2016 for validation. The upper bound is an exclusive "< 2017-01-01"
# because "<= '2016-12-31'" compares against midnight and would drop every
# reading taken later on 31 December.
train_data = lagged_data[lagged_data.index < '2016-12-01']
val_data = lagged_data[(lagged_data.index >= '2016-12-01') & (lagged_data.index < '2017-01-01')]

# The feature matrix is the same for every station (all lag columns of all
# stations), so it is built once outside the loop.
target_columns = [f'target_{s}' for s in station_names]
X_train = train_data.drop(columns=target_columns)
X_val = val_data.drop(columns=target_columns)

# station -> mean squared error on the validation set
mse_scores = {}
# station -> training target, validation target and validation predictions
forecast_results = {}

for station in station_names:
    y_train = train_data[f'target_{station}']
    y_val = val_data[f'target_{station}']

    # Fixed seed so that re-running the cell reproduces the same forest and scores.
    model = RandomForestRegressor(n_estimators=100, random_state=123)  # Hyperparameters can be tuned further
    model.fit(X_train, y_train)

    # Predict on the validation set
    y_pred = model.predict(X_val)
    mse_scores[station] = mean_squared_error(y_val, y_pred)

    forecast_results[station] = {
        'train': y_train,
        'validation': y_val,
        'forecast': y_pred
    }

for station in station_names:
    results = forecast_results[station]

    # The x axis is the row position: training rows first, then the validation
    # rows (actual values and forecast drawn over the same positions).
    n_train = len(results['train'])
    val_positions = range(n_train, n_train + len(results['validation']))
    forecast_positions = range(n_train, n_train + len(results['forecast']))

    plt.figure(figsize=(12, 6))
    plt.plot(results['train'].values, label="Train")
    plt.plot(val_positions, results['validation'].values, label="Validation")
    plt.plot(forecast_positions, results['forecast'], label="Forecast")
    plt.legend()
    plt.title(f"Forecast for {station} - MSE: {mse_scores[station]:.2f}")
    plt.xlabel("Time")
    plt.ylabel("Temperature (°C)")
    plt.show()

# The metric computed above is the mean squared error, so it is reported as MSE.
print("MSE scores for each station:")
mse_scores


# Feature Importance

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit one forest per station in this cell. Reading a leftover `model` variable
# from the training loop would return the forest of the LAST station only, and
# every station would then show exactly the same importances.
target_columns = [f'target_{s}' for s in station_names]
feature_frame = train_data.drop(columns=target_columns)

# station -> array of impurity-based importances, aligned with feature_frame.columns
importances = {}

for station in station_names:
    station_model = RandomForestRegressor(n_estimators=100, random_state=123)
    station_model.fit(feature_frame, train_data[f'target_{station}'])

    importances[station] = station_model.feature_importances_

    feature_importance_series = pd.Series(importances[station], index=feature_frame.columns)
    sorted_importance = feature_importance_series.sort_values(ascending=False)

    sorted_importance.plot(kind='bar')
    plt.title(f"Feature Importance for {station}")
    plt.xlabel("Features")
    plt.ylabel("Importance Score")
    plt.show()

for station, importance_values in importances.items():
    print(f"Top features for {station}:")
    sorted_features = sorted(zip(feature_frame.columns, importance_values), key=lambda x: x[1], reverse=True)
    for feature, score in sorted_features[:5]:  # Show top 5 features
        print(f"{feature}: {score:.4f}")
    print("\n")

# Grid Search with Out-of-Bag or val set

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
import numpy as np

# This cell is self-contained: it reloads the raw CSV and rebuilds the features.
file_path = './data/Data_Temperatures.csv'
weather_data = pd.read_csv(file_path)

weather_data['Measurement Timestamp'] = pd.to_datetime(weather_data['Measurement Timestamp'])
weather_data.set_index('Measurement Timestamp', inplace=True)

df_pivot = weather_data.pivot(columns='Station Name', values='Air Temperature')
# Forward-fill, then back-fill whatever is still missing at the start.
# .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
# NOTE(review): the frame is not reindexed onto a full hourly grid here, so
# shift(lag) below means "lag rows back", which is only "lag hours back" where
# no timestamps are missing — confirm this is acceptable.
df_pivot = df_pivot.ffill().bfill()

station_names = df_pivot.columns
lagged_data = pd.DataFrame()

for station in station_names:
    for lag in range(1, 8):  # 7 hours lag
        lagged_data[f'{station}_lag{lag}'] = df_pivot[station].shift(lag)

for station in station_names:
    lagged_data[f'target_{station}'] = df_pivot[station]

lagged_data = lagged_data.dropna()

# Train on everything before December 2016 and validate on December 2016. The
# upper bound is an exclusive "< 2017-01-01": "<= '2016-12-31'" compares against
# midnight and would drop every reading taken later on 31 December.
train_data = lagged_data[lagged_data.index < '2016-12-01']
val_data = lagged_data[(lagged_data.index >= '2016-12-01') & (lagged_data.index < '2017-01-01')]

# Search space sampled by RandomizedSearchCV.
param_dist = {
    'n_estimators': np.arange(50, 201, 50), 
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 4, 8],
    'max_features': ['sqrt', 'log2'],
    'max_samples': [0.8, 0.9, 1.0]
}

best_params = {}
mae_scores = {}
forecast_results = {}

# Time-ordered folds: each fold validates on rows that come after its training rows.
tscv = TimeSeriesSplit(n_splits=5)

# The feature matrix is identical for every station, so build it once.
target_columns = [f'target_{s}' for s in station_names]
X_train = train_data.drop(columns=target_columns)
X_val = val_data.drop(columns=target_columns)

for station in station_names:
    y_train = train_data[f'target_{station}']
    y_val = val_data[f'target_{station}']

    random_search = RandomizedSearchCV(
        RandomForestRegressor(random_state=123, oob_score=True),
        param_distributions=param_dist,
        n_iter=30,
        cv=tscv,
        scoring='neg_mean_absolute_error',
        random_state=123
    )
    random_search.fit(X_train, y_train)

    best_station_params = random_search.best_params_
    # With the default refit=True the search has already refitted the best
    # configuration on the full training set, so reuse that estimator instead of
    # constructing and fitting an identical forest a second time.
    best_model = random_search.best_estimator_

    y_pred = best_model.predict(X_val)
    mae_scores[station] = mean_absolute_error(y_val, y_pred)
    best_params[station] = best_station_params

    forecast_results[station] = {
        'train': y_train,
        'validation': y_val,
        'forecast': y_pred
    }

for station in station_names:
    results = forecast_results[station]

    # The x axis is the row position: training rows first, then the validation rows.
    n_train = len(results['train'])

    plt.figure(figsize=(12, 6))
    plt.plot(results['train'].values, label="Train")
    plt.plot(range(n_train, n_train + len(results['validation'])),
             results['validation'].values, label="Validation")
    plt.plot(range(n_train, n_train + len(results['forecast'])),
             results['forecast'], label="Forecast")
    plt.legend()
    plt.title(f"Forecast for {station} - MAE: {mae_scores[station]:.2f}")
    plt.xlabel("Time")
    plt.ylabel("Temperature")
    plt.show()

print("MAE Scores per Station:", mae_scores)
print("Best Parameters per Station:", best_params)


# Uncertainty Quantification

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# This cell is self-contained: it reloads the raw CSV and rebuilds the features.
file_path = './data/Data_Temperatures.csv'
weather_data = pd.read_csv(file_path)

weather_data['Measurement Timestamp'] = pd.to_datetime(weather_data['Measurement Timestamp'])
weather_data.set_index('Measurement Timestamp', inplace=True)

df_pivot = weather_data.pivot(columns='Station Name', values='Air Temperature')
# Forward-fill, then back-fill whatever is still missing at the start.
# .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
# NOTE(review): the frame is not reindexed onto a full hourly grid here, so
# shift(lag) below means "lag rows back", which is only "lag hours back" where
# no timestamps are missing — confirm this is acceptable.
df_pivot = df_pivot.ffill().bfill()

station_names = df_pivot.columns
lagged_data = pd.DataFrame()

for station in station_names:
    for lag in range(1, 8):  # 7 hours lag
        lagged_data[f'{station}_lag{lag}'] = df_pivot[station].shift(lag)

for station in station_names:
    lagged_data[f'target_{station}'] = df_pivot[station]

lagged_data = lagged_data.dropna()

# Train on everything before December 2016 and validate on December 2016. The
# upper bound is an exclusive "< 2017-01-01": "<= '2016-12-31'" compares against
# midnight and would drop every reading taken later on 31 December.
train_data = lagged_data[lagged_data.index < '2016-12-01']
val_data = lagged_data[(lagged_data.index >= '2016-12-01') & (lagged_data.index < '2017-01-01')]

mae_scores = {}
forecast_results = {}

# The feature matrix is identical for every station, so build it once.
target_columns = [f'target_{s}' for s in station_names]
X_train = train_data.drop(columns=target_columns)
X_val = val_data.drop(columns=target_columns)
# The individual trees are queried with a plain array rather than a DataFrame.
X_val_values = X_val.values

for station in station_names:
    y_train = train_data[f'target_{station}']
    y_val = val_data[f'target_{station}']

    model = RandomForestRegressor(n_estimators=100, random_state=123) #max_depth=10, min_samples_split=5, min_samples_leaf=2, max_features='sqrt', , oob_score=True

    model.fit(X_train, y_train)

    # One row of predictions per tree: shape (n_trees, n_validation_rows).
    all_tree_predictions = np.array([tree.predict(X_val_values) for tree in model.estimators_])

    # The mean over trees is the point forecast; the standard deviation over
    # trees is used as the spread drawn around it.
    mean_predictions = np.mean(all_tree_predictions, axis=0)
    std_predictions = np.std(all_tree_predictions, axis=0)

    mae_scores[station] = mean_absolute_error(y_val, mean_predictions)

    forecast_results[station] = {
        'validation': y_val,
        'forecast': mean_predictions,
        'std': std_predictions
    }

for station in station_names:
    results = forecast_results[station]
    timestamps = results['validation'].index

    plt.figure(figsize=(12, 6))
    plt.plot(timestamps, results['validation'].values, label="Actual (Validation)", color="green")
    plt.plot(timestamps, results['forecast'], label="Mean Prediction", color="orange")
    plt.fill_between(timestamps,
                     results['forecast'] - results['std'],
                     results['forecast'] + results['std'],
                     color="orange", alpha=0.2, label="Prediction ± 1 std")
    plt.legend()
    plt.title(f"December Forecast for {station} - MAE: {mae_scores[station]:.2f}")
    plt.xlabel("Time")
    plt.ylabel("Temperature")
    plt.xticks(rotation=45)
    plt.show()

print("MAE Scores per Station:", mae_scores)

# To improve:
- Add more features and be smarter about feature engineering; it's the most important part.