In [None]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Load the raw beach-weather readings.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
file_path = 'C:/Users/natha/OneDrive/Bureau/Interview trainings/Coding/Aquatic/Weather_Forecast_Ideas/data/chicago_beach_weather.csv'
weather_data = pd.read_csv(file_path)

# Parse the timestamps and make them the index so rows can be selected by date.
weather_data['Measurement Timestamp'] = pd.to_datetime(weather_data['Measurement Timestamp'])
weather_data = weather_data.set_index('Measurement Timestamp')

# Reshape to one column per station (rows keyed by timestamp) holding 'Air Temperature'.
df_pivot = weather_data.pivot(columns='Station Name', values='Air Temperature')

# Fill gaps from the previous reading, then back-fill whatever is still missing
# at the very start. ffill()/bfill() replace the deprecated fillna(method=...).
df_pivot = df_pivot.ffill().bfill()

# Candidate lag sets for the grid search. Each entry is a list of row offsets
# used to build lagged features. The "daily" wording below assumes one row per
# hour — TODO confirm against the data's sampling rate.
lag_configs = [
    list(range(1, 3)),                  # previous two readings
    list(range(1, 4)),                  # previous three readings (short lags)
    list(range(1, 8)),                  # previous seven readings
    [24 * day for day in (1, 2, 3)],    # same time on each of the three previous days
    [1, 2, 3] + [24, 48],               # mix of short-term and daily lags
]

# Per-station outcome of the lag grid search, keyed by station name.
best_mae_scores = {}            # lowest validation MAE found for the station
best_lag_configurations = {}    # lag list that produced that MAE
best_forecast_results = {}      # train/validation targets and forecast of the best model

# Validation window: all of December 2016; everything earlier is training data.
# The upper bound is exclusive on purpose: comparing a DatetimeIndex with
# `<= '2016-12-31'` stops at midnight at the START of 31 December and would
# silently drop the rest of that day from the validation set.
val_start = '2016-12-01'
val_end = '2017-01-01'

# Grid search: for every station, fit one model per lag configuration and keep
# the configuration with the lowest validation MAE.
for station in df_pivot.columns:
    station_series = df_pivot[station]
    target_col = f'target_{station}'

    best_mae = float("inf")
    best_lags = None
    # Targets and predictions of the best model so far, kept for plotting.
    best_y_train, best_y_val, best_y_pred = None, None, None

    for lags in lag_configs:
        # One feature column per lag plus the unshifted series as the target.
        # Built in one step, then rows without a full lag history are dropped.
        feature_columns = {f'{station}_lag{lag}': station_series.shift(lag) for lag in lags}
        feature_columns[target_col] = station_series
        lagged_data = pd.DataFrame(feature_columns).dropna()

        # Chronological split: train before December 2016, validate on December 2016.
        train_data = lagged_data[lagged_data.index < val_start]
        val_data = lagged_data[(lagged_data.index >= val_start) & (lagged_data.index < val_end)]

        X_train = train_data.drop(columns=[target_col])
        y_train = train_data[target_col]
        X_val = val_data.drop(columns=[target_col])
        y_val = val_data[target_col]

        # Fit the XGBoost regressor on the training window.
        model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)
        model.fit(X_train, y_train)

        # Score the configuration on the validation window.
        y_pred = model.predict(X_val)
        mae = mean_absolute_error(y_val, y_pred)

        # Strict '<' means the earliest configuration wins ties.
        if mae < best_mae:
            best_mae = mae
            best_lags = lags
            best_y_train, best_y_val, best_y_pred = y_train, y_val, y_pred

    # Record the winning configuration for this station.
    best_mae_scores[station] = best_mae
    best_lag_configurations[station] = best_lags
    best_forecast_results[station] = {
        'train': best_y_train,
        'validation': best_y_val,
        'forecast': best_y_pred
    }

# Report the grid-search outcome: one line per station with its lowest
# validation MAE and the lag list that achieved it.
print("Best Lag Configurations and MAE for Each Station:")
for station in best_mae_scores:
    mae = best_mae_scores[station]
    print(f"Station: {station}, Best MAE: {mae:.2f}, Best Lags: {best_lag_configurations[station]}")

# One figure per station: training targets, then validation targets and the
# model's forecast drawn over the same x positions right after the training span.
for station, results in best_forecast_results.items():
    train_values = results['train'].values
    validation_values = results['validation'].values
    forecast_values = results['forecast']

    # x positions are plain sample counts; validation/forecast start where training ends.
    n_train = len(train_values)
    validation_x = range(n_train, n_train + len(validation_values))
    forecast_x = range(n_train, n_train + len(forecast_values))

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(train_values, label="Train (Actual)", color="black")
    ax.plot(validation_x, validation_values, label="Validation (Actual)", color="gray")
    ax.plot(forecast_x, forecast_values, label="Forecast (Predicted)", color="blue")

    ax.legend()
    ax.set_title(f"Forecast for {station} - Best MAE: {best_mae_scores[station]:.2f}, Best Lags: {best_lag_configurations[station]}")
    ax.set_xlabel("Time")
    ax.set_ylabel("Temperature")
    plt.show()


In [None]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import pandas as pd

# Load the raw beach-weather readings.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
file_path = 'C:/Users/natha/OneDrive/Bureau/Interview trainings/Coding/Aquatic/Weather_Forecast_Ideas/data/chicago_beach_weather.csv'
weather_data = pd.read_csv(file_path)

# Show the first few rows to understand the structure. A bare expression in the
# middle of a cell is evaluated and discarded, so it has to be printed explicitly.
print(weather_data.head())

# Parse the timestamps and make them the index so rows can be selected by date.
weather_data['Measurement Timestamp'] = pd.to_datetime(weather_data['Measurement Timestamp'])
weather_data = weather_data.set_index('Measurement Timestamp')

# Reshape to one column per station (rows keyed by timestamp) holding 'Air Temperature'.
df_pivot = weather_data.pivot(columns='Station Name', values='Air Temperature')

# Fill gaps from the previous reading, then back-fill whatever is still missing
# at the very start. ffill()/bfill() replace the deprecated fillna(method=...).
df_pivot = df_pivot.ffill().bfill()

# Build the modelling table: lagged readings of every station as features and
# each station's current reading as a target.
station_names = df_pivot.columns

# Features: for every station, the readings 1..7 rows earlier. A lag step is one
# row of df_pivot (hours if the data is hourly — TODO confirm the sampling rate).
lagged_columns = {
    f'{station}_lag{lag}': df_pivot[station].shift(lag)
    for station in station_names
    for lag in range(1, 8)
}

# Targets: the unshifted series of each station, added after all feature columns.
lagged_columns.update({f'target_{station}': df_pivot[station] for station in station_names})

# Assemble in one step and drop the leading rows that lack a full lag history.
lagged_data = pd.DataFrame(lagged_columns).dropna()

# Chronological split: train on everything before December 2016, validate on
# December 2016. The upper bound is exclusive because `<= '2016-12-31'` compares
# against midnight at the start of 31 December and would drop the rest of that day.
train_data = lagged_data[lagged_data.index < '2016-12-01']
val_data = lagged_data[(lagged_data.index >= '2016-12-01') & (lagged_data.index < '2017-01-01')]

# Per-station results, keyed by station name.
mae_scores = {}         # validation MAE
forecast_results = {}   # train/validation targets and the forecast, for plotting

# The feature matrices are the same for every station (all lag columns of all
# stations, every target column removed), so build them once outside the loop.
target_columns = [f'target_{s}' for s in station_names]
X_train = train_data.drop(columns=target_columns)
X_val = val_data.drop(columns=target_columns)

# Fit one model per station; only the target column changes between iterations.
for station in station_names:
    y_train = train_data[f'target_{station}']
    y_val = val_data[f'target_{station}']

    # Train the XGBoost model on the training window.
    model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)
    model.fit(X_train, y_train)

    # Predict the validation window and score it.
    y_pred = model.predict(X_val)
    mae_scores[station] = mean_absolute_error(y_val, y_pred)

    # Keep the series needed to plot this station's results.
    forecast_results[station] = {
        'train': y_train,
        'validation': y_val,
        'forecast': y_pred
    }

# One figure per station: training targets, then validation targets and the
# forecast drawn over the same x positions right after the training span.
for station in station_names:
    results = forecast_results[station]

    # x positions are plain sample counts; validation/forecast start where training ends.
    n_train = len(results['train'])
    validation_x = range(n_train, n_train + len(results['validation']))
    forecast_x = range(n_train, n_train + len(results['forecast']))

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(results['train'].values, label="Train")
    ax.plot(validation_x, results['validation'].values, label="Validation")
    ax.plot(forecast_x, results['forecast'], label="Forecast")
    ax.legend()
    ax.set_title(f"Forecast for {station} - MAE: {mae_scores[station]:.2f}")
    # Axis labels so each figure can be read on its own.
    ax.set_xlabel("Time")
    ax.set_ylabel("Temperature")
    plt.show()

# Display MAE scores for each station (last expression of the cell, so it is rendered).
mae_scores

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Rebuild df_pivot from the loaded readings. `weather_data` is the frame read
# from the CSV by the earlier cells; no variable named `df` is defined in this
# notebook, so referencing it failed with a NameError on a fresh kernel.
# reset_index() turns the 'Measurement Timestamp' index back into a column so
# it can be named explicitly as the pivot index.
df_pivot = weather_data.reset_index().pivot(index='Measurement Timestamp', columns='Station Name', values='Air Temperature')

# Fill gaps from the previous reading, then back-fill whatever is still missing
# at the very start. ffill()/bfill() replace the deprecated fillna(method=...).
df_pivot = df_pivot.ffill().bfill()

# Modelling table shared by all stations: lagged readings of every station as
# features, plus one target column per station.
station_names = df_pivot.columns
lag_steps = range(1, 8)  # offsets 1..7, counted in rows of df_pivot

# Collect the columns first (features station by station, then the targets) ...
table_columns = {}
for station in station_names:
    station_series = df_pivot[station]
    for lag in lag_steps:
        table_columns[f'{station}_lag{lag}'] = station_series.shift(lag)

for station in station_names:
    table_columns[f'target_{station}'] = df_pivot[station]

# ... then assemble the frame and discard rows where any lag is still undefined.
lagged_data = pd.DataFrame(table_columns).dropna()

# Per-station results, keyed by station name.
mae_scores = {}         # MAE on the held-out tail
forecast_results = {}   # train/held-out targets and the forecast, for plotting

# The feature matrix is identical for every station (all lag columns, every
# target column removed), so build it once instead of on each iteration.
X = lagged_data.drop(columns=[f'target_{s}' for s in station_names])

# Fit one model per station; only the target column changes between iterations.
for station in station_names:
    y = lagged_data[f'target_{station}']

    # Hold out the last 20% of rows; shuffle=False keeps the time order intact.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Train the XGBoost model on the earlier 80%.
    model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)
    model.fit(X_train, y_train)

    # Predict the held-out rows and score them.
    y_pred = model.predict(X_test)
    mae_scores[station] = mean_absolute_error(y_test, y_pred)

    # Keep the series needed to plot this station's results.
    forecast_results[station] = {
        'train': y_train,
        'validation': y_test,
        'forecast': y_pred
    }

# One figure per station: training targets, then held-out targets and the
# forecast drawn over the same x positions right after the training span.
for station in station_names:
    results = forecast_results[station]

    # x positions are plain sample counts; held-out/forecast start where training ends.
    n_train = len(results['train'])
    validation_x = range(n_train, n_train + len(results['validation']))
    forecast_x = range(n_train, n_train + len(results['forecast']))

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(results['train'].values, label="Train")
    ax.plot(validation_x, results['validation'].values, label="Validation")
    ax.plot(forecast_x, results['forecast'], label="Forecast")
    ax.legend()
    ax.set_title(f"Forecast for {station} - MAE: {mae_scores[station]:.2f}")
    # Axis labels so each figure can be read on its own.
    ax.set_xlabel("Time")
    ax.set_ylabel("Temperature")
    plt.show()

# Display MAE scores for each station (last expression of the cell, so it is rendered).
mae_scores
