In [2]:
import pandas as pd
from operations.adjust_dataset import adjust_dataset
from operations.lin import LinearRegressionModel
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.model_selection import GridSearchCV
import numpy as np
import warnings
warnings.filterwarnings('ignore')

Random tree forest search

In [18]:
df = pd.read_csv('docs/data/GlobalWeatherRepository.csv')

# Filtering the DataFrame to keep only the specified cities
cities_to_keep = ['Tokyo', 'Madrid', 'Stockholm', 'Paris']
df_filtered_for_prediction = df[df['location_name'].isin(cities_to_keep)]

# Now apply one-hot encoding
df_encoded = pd.get_dummies(df, columns=['location_name'])

# Define the target variables
#targetvar = ['temperature_celsius', 'wind_kph', 'humidity', 'pressure_mb', 'precip_mm', 'cloud']
targetvar = ['temperature_celsius', 'wind_kph', 'pressure_mb', 'precip_mm']

window_size = 7  # For example, using the past 7 days to predict
target_variable = 'temperature_celsius'

# Apply the same lag features to the filtered dataframe
for i in range(window_size):
    df_filtered_for_prediction[f'{target_variable}_lag_{i+1}'] = df_filtered_for_prediction[target_variable].shift(i+1)

# Create sliding window features
for i in range(window_size):
    df_encoded[f'{target_variable}_lag_{i+1}'] = df_encoded[target_variable].shift(i+1)

# Drop rows with NaN values caused by shifting
df_encoded = df_encoded.dropna()
df_filtered_for_prediction = df_filtered_for_prediction.dropna()
# Initialize dictionaries to store models and predictions
models = {}
predictions = {}
mae_scores = {}
mse_scores = {}
example_predictions = {}

# Train a model and predict for each day ahead
for day_ahead in range(1, 4):
    # Define X and y for this day ahead
    X = df_encoded[[f'{target_variable}_lag_{i+1}' for i in range(window_size)]]
    y = df_encoded[target_variable].shift(-day_ahead)

    # Adjust X and y to have the same length
    X = X.iloc[:-day_ahead]
    y = y.iloc[:-day_ahead]

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Train the model
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    # Store the model and make predictions
    models[day_ahead] = rf
    pred = rf.predict(X_test)
    predictions[day_ahead] = rf.predict(X_test)

     # Calculate and store metrics
    mae_scores[day_ahead] = mean_absolute_error(y_test, pred)
    mse_scores[day_ahead] = mean_squared_error(y_test, pred)

    # Store example predictions
    example_predictions[day_ahead] = list(zip(y_test.head(5), pred[:5]))

for day_ahead in range(1, 4):
    # Prepare the data for this day ahead
    X_pred = df_filtered_for_prediction[[f'{target_variable}_lag_{i+1}' for i in range(window_size)]]

    # Align the prediction dataframe length with the model's input
    X_pred = X_pred.iloc[window_size:-day_ahead]

    # Make predictions
    predictions[day_ahead] = models[day_ahead].predict(X_pred)

    # You can also add actual values to compare, if needed
    actuals = df_filtered_for_prediction[target_variable].iloc[window_size + day_ahead:]

    # Store example predictions
    example_predictions[day_ahead] = list(zip(actuals.head(5), predictions[day_ahead][:5]))


# Print the results in an easy-to-understand format
for day_ahead in range(1, 4):
    print(f"Day {day_ahead} Ahead Forecast:")
    print(f"  Mean Absolute Error: {mae_scores[day_ahead]:.3f}")
    print(f"  Mean Squared Error: {mse_scores[day_ahead]:.3f}")
    print("  Example Predictions (Actual, Predicted):")
    for actual, predicted in example_predictions[day_ahead]:
        print(f"    Actual: {actual:.2f}, Predicted: {predicted:.2f}")
    print("\n")


Day 1 Ahead Forecast:
  Mean Absolute Error: 3.391
  Mean Squared Error: 22.635
  Example Predictions (Actual, Predicted):
    Actual: 12.00, Predicted: 25.08
    Actual: 18.00, Predicted: 28.78
    Actual: 30.00, Predicted: 13.38
    Actual: 23.00, Predicted: 22.77
    Actual: 11.00, Predicted: 23.16


Day 2 Ahead Forecast:
  Mean Absolute Error: 3.436
  Mean Squared Error: 23.601
  Example Predictions (Actual, Predicted):
    Actual: 18.00, Predicted: 28.31
    Actual: 30.00, Predicted: 20.51
    Actual: 23.00, Predicted: 21.60
    Actual: 11.00, Predicted: 22.26
    Actual: 21.00, Predicted: 27.49


Day 3 Ahead Forecast:
  Mean Absolute Error: 3.542
  Mean Squared Error: 24.530
  Example Predictions (Actual, Predicted):
    Actual: 30.00, Predicted: 22.48
    Actual: 23.00, Predicted: 18.27
    Actual: 11.00, Predicted: 17.73
    Actual: 21.00, Predicted: 28.23
    Actual: 30.00, Predicted: 19.07


