<a href="https://colab.research.google.com/github/LaFuego20/exchange-rate-forecasting/blob/main/ConvLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import ConvLSTM2D, Flatten, Dense
import tensorflow as tf
import random
import matplotlib.pyplot as plt

# --- Reproducibility: seed every random number source with one value ---
SEED = 2

# NOTE(review): PYTHONHASHSEED is normally consumed at interpreter start-up, so
# setting it here presumably only affects child processes — confirm.
os.environ.update({
    'PYTHONHASHSEED': str(SEED),
    'TF_DETERMINISTIC_OPS': '1',
})

# Python's `random`, NumPy's global generator and TensorFlow's global seed
# all receive the same value.
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

# --- Data Loading ---
file_path_normalized = 'normalized_data.csv'
file_path_original = 'merged_exchange_rates.csv'

# Fail fast if either input CSV is absent; the normalized file is checked first.
# A download or upload fallback could be added here instead of raising.
for required_path in (file_path_normalized, file_path_original):
    if os.path.exists(required_path):
        continue
    print(f"Error: File not found at {required_path}")
    raise FileNotFoundError(f"Required data file not found: {required_path}")


# Read both CSVs; any read/parse failure is reported and then re-raised unchanged.
try:
    normalized_df = pd.read_csv(file_path_normalized)
    original_df = pd.read_csv(file_path_original)
except Exception as load_error:
    print(f"An error occurred while loading the data: {load_error}")
    raise
else:
    print("Data loaded successfully.")


# --- Data Preprocessing ---
# Discard the stray 'Unnamed: 4' column when the original CSV carries one.
original_df = original_df.drop(columns=['Unnamed: 4'], errors='ignore')

# Both files store dates as day/month/year strings; parse them to datetimes.
for frame in (normalized_df, original_df):
    frame['Date'] = pd.to_datetime(frame['Date'], format='%d/%m/%Y')

# Raw currency columns and their normalized counterparts, in matching order.
original_currency_cols = ['USD_NGN', 'EUR_NGN', 'GBP_NGN']
normalized_currency_cols = [f'{currency}_Norm' for currency in original_currency_cols]

# The model is fed exactly the normalized currency columns.
features = list(normalized_currency_cols)

# Number of consecutive rows that make up one input window.
sequence_length = 30

# Define a function to create sequences
def create_sequences(data, sequence_length):
    """Slice `data` into sliding windows and the row that follows each window.

    Args:
        data: Indexable, sliceable sequence of rows (e.g. a 2-D NumPy array).
        sequence_length: Number of consecutive rows in each input window.

    Returns:
        A pair ``(X, y)`` of NumPy arrays: ``X[k]`` is
        ``data[k:k + sequence_length]`` and ``y[k]`` is
        ``data[k + sequence_length]``. Both are empty when `data` has no more
        than `sequence_length` rows.
    """
    window_count = len(data) - sequence_length
    windows = [data[start:start + sequence_length] for start in range(window_count)]
    targets = [data[start + sequence_length] for start in range(window_count)]
    return np.array(windows), np.array(targets)

# Function to denormalize predictions and true values and calculate metrics
def evaluate_denormalized(y_true_normalized, y_pred_normalized, original_df, currency_name):
    """Score predictions after mapping them back to the original value range.

    Both inputs are rescaled with ``value * (max - min) + min``, where min and
    max are taken from ``original_df[currency_name]``.

    Args:
        y_true_normalized: Normalized ground-truth values.
        y_pred_normalized: Normalized predicted values.
        original_df: DataFrame holding the un-normalized currency column.
        currency_name: Column of `original_df` that supplies min and max.

    Returns:
        Tuple ``(rmse, mae)`` computed on the rescaled values.
    """
    column = original_df[currency_name]
    offset = column.min()
    span = column.max() - offset

    actual = y_true_normalized * span + offset
    predicted = y_pred_normalized * span + offset

    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mae = mean_absolute_error(actual, predicted)
    return rmse, mae

# --- Define ConvLSTM Model Architecture ---
def build_convlstm_model(input_shape, output_units):
    """Build and compile a single-layer ConvLSTM regressor.

    The input shape is declared with an explicit ``Input`` object rather than
    an ``input_shape=`` keyword on the first layer, which Keras warns about
    for Sequential models.

    Args:
        input_shape: Shape of one sample, without the batch axis, as passed
            to ``tf.keras.Input``.
        output_units: Number of values the final Dense layer emits per sample.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, MSE loss).
    """
    model = Sequential([
        tf.keras.Input(shape=input_shape),
        # return_sequences=False keeps only the final time step's output.
        ConvLSTM2D(filters=64, kernel_size=(1, 3), activation='relu', return_sequences=False),
        Flatten(),
        Dense(units=output_units, activation='linear'),
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

# --- Define Periods ---
# Rows dated before this timestamp are "pre-subsidy"; rows on or after it are "post-subsidy".
subsidy_removal_date = pd.Timestamp('2023-05-29')

is_pre_subsidy = normalized_df['Date'] < subsidy_removal_date
is_post_subsidy = normalized_df['Date'] >= subsidy_removal_date

# Independent copies, so later changes to one period frame cannot touch the others.
full_df = normalized_df.copy()
pre_subsidy_df = normalized_df.loc[is_pre_subsidy].copy()
post_subsidy_df = normalized_df.loc[is_post_subsidy].copy()

# Period label -> data frame, in the order the periods are processed below.
periods = dict(zip(
    ('Full', 'Pre-subsidy', 'Post-subsidy'),
    (full_df, pre_subsidy_df, post_subsidy_df),
))


# --- Calculate Performance Metrics ---

# RMSE and MAE per currency, keyed by period name.
performance_metrics = {currency: {'RMSE': {}, 'MAE': {}} for currency in original_currency_cols}

# Test-set dates, targets and predictions for every period that could be
# modelled. The plotting section below reads from here, so the curves it draws
# are exactly the predictions the metrics were computed from and no model is
# trained a second time.
period_predictions = {}

# Train one model per period, score it on the held-out tail, keep its predictions.
print("--- Calculating Performance Metrics ---")
for period_name, period_df in periods.items():
    print(f"\nProcessing {period_name} Period for Metrics...")

    if period_df.empty or len(period_df) <= sequence_length:
        print(f"Not enough data in {period_name} period to calculate metrics.")
        continue

    data_for_sequences_period = period_df[features].values
    X_period, y_period = create_sequences(data_for_sequences_period, sequence_length)

    if len(X_period) == 0:
        print(f"Not enough data in {period_name} period to create sequences for metric calculation.")
        continue

    # Insert singleton axes so each sample is (time, 1, features, 1).
    X_period = X_period.reshape((X_period.shape[0], sequence_length, 1, X_period.shape[2], 1))

    # Leading 80% of the windows train the model; the remainder is the test set.
    # This guard also guarantees that the test set below is never empty.
    train_size_period = int(len(X_period) * 0.8)
    if train_size_period == 0 or train_size_period == len(X_period):
        print(f"Not enough data in {period_name} period for an 80/20 train/test split for metric calculation.")
        continue

    X_train_period, X_test_period = X_period[:train_size_period], X_period[train_size_period:]
    y_train_period, y_test_period = y_period[:train_size_period], y_period[train_size_period:]

    print(f"Training model for {period_name} to calculate metrics...")
    model_period = build_convlstm_model(input_shape=X_train_period.shape[1:],
                                        output_units=y_train_period.shape[1])

    # Hold out 20% for validation only when that leaves at least one sample.
    fit_kwargs = {'epochs': 50, 'batch_size': 32, 'verbose': 0}
    if X_train_period.shape[0] * 0.2 >= 1:
        fit_kwargs['validation_split'] = 0.2
    model_period.fit(X_train_period, y_train_period, **fit_kwargs)

    y_pred_period = model_period.predict(X_test_period, verbose=0)

    # Window k predicts row k + sequence_length, so the test targets start at
    # row train_size_period + sequence_length of the period frame.
    test_dates_period = period_df['Date'].iloc[train_size_period + sequence_length:].reset_index(drop=True)
    period_predictions[period_name] = (test_dates_period, y_test_period, y_pred_period)

    # Calculate and store metrics for each currency (column i of the targets).
    print(f"  Performance Metrics for {period_name} Test Set:")
    for i, currency_orig in enumerate(original_currency_cols):
        rmse_denorm, mae_denorm = evaluate_denormalized(
            y_test_period[:, i],
            y_pred_period[:, i],
            original_df,  # Use the full original_df for min/max scaling
            currency_orig
        )
        performance_metrics[currency_orig]['RMSE'][period_name] = rmse_denorm
        performance_metrics[currency_orig]['MAE'][period_name] = mae_denorm
        print(f'    {currency_orig}: RMSE={rmse_denorm:.4f}, MAE={mae_denorm:.4f}')

print("\n--- Performance Metrics Calculation Complete ---")


# --- Generate and Save Plots as PDFs ---
print("\n--- Generating and Saving Plots as PDFs ---")

# Actual vs Predicted plots for each period and currency (as requested)
for period_name in periods:
    print(f"\nGenerating plots for {period_name} Period...")
    if period_name not in period_predictions:
        # The metrics pass skipped this period, so there is nothing to draw.
        print(f"Skipping plots for {period_name} period due to insufficient data.")
        continue

    test_dates_period, y_test_period, y_pred_period = period_predictions[period_name]

    # One stacked subplot per currency.
    fig, axes = plt.subplots(nrows=len(original_currency_cols), ncols=1, figsize=(12, 18))
    axes = np.atleast_1d(axes)  # plt.subplots returns a bare Axes when nrows == 1
    fig.suptitle(f'{period_name} Period: Actual vs Predicted for All Currencies', y=1.02)

    for i, currency_orig in enumerate(original_currency_cols):
        # Undo the min-max scaling using the full original series' range.
        original_min = original_df[currency_orig].min()
        original_max = original_df[currency_orig].max()
        value_range = original_max - original_min
        y_true_denormalized = y_test_period[:, i] * value_range + original_min
        y_pred_denormalized = y_pred_period[:, i] * value_range + original_min

        ax = axes[i]
        ax.plot(test_dates_period, y_true_denormalized, label='Actual')
        ax.plot(test_dates_period, y_pred_denormalized, label='Predicted')
        ax.set_title(f'{currency_orig}')
        ax.set_xlabel('Date')
        ax.set_ylabel('Exchange Rate')
        ax.legend()
        ax.tick_params(axis='x', rotation=45)  # Rotate x-axis labels

    fig.tight_layout()  # Adjust layout to prevent labels overlapping
    # bbox_inches='tight' grows the saved page to include the suptitle, which
    # sits above the canvas at y=1.02 and would otherwise be cut off.
    fig.savefig(f'{period_name}_all_currencies_actual_vs_predicted.pdf', bbox_inches='tight')
    plt.close(fig)  # Close the plot to free memory

# Combined RMSE and MAE Bar Plots by Period (as requested)
periods_order = ['Full', 'Pre-subsidy', 'Post-subsidy']
metrics_to_plot = ['RMSE', 'MAE']
currency_colors = {'USD_NGN': 'blue', 'EUR_NGN': 'green', 'GBP_NGN': 'red'}


def autolabel(bars):
    """Annotate each bar in `bars` with its height (two decimals) on the current axes."""
    for bar in bars:
        height = bar.get_height()
        plt.annotate(f'{height:.2f}',
                     xy=(bar.get_x() + bar.get_width() / 2, height),
                     xytext=(0, 3),  # 3 points vertical offset
                     textcoords="offset points",
                     ha='center', va='bottom')


# Same colour list for both metrics: colour identifies the currency.
bar_colors = [currency_colors[c] for c in original_currency_cols]

for period_name in periods_order:
    print(f"\nGenerating combined RMSE and MAE bar plots for {period_name} Period...")
    plt.figure(figsize=(10, 6))
    x = np.arange(len(original_currency_cols))
    width = 0.35

    # RMSE and MAE for this period across all currencies; a period with no
    # stored metrics is drawn as zero-height bars.
    rmse_values = [performance_metrics[currency_orig]['RMSE'].get(period_name, 0) for currency_orig in original_currency_cols]
    mae_values = [performance_metrics[currency_orig]['MAE'].get(period_name, 0) for currency_orig in original_currency_cols]

    # Grouped bars: RMSE solid, MAE hatched and lighter. Without the hatch the
    # two groups share identical colours and the legend entries look the same.
    bars1 = plt.bar(x - width/2, rmse_values, width, label='RMSE',
                    color=bar_colors, edgecolor='black')
    bars2 = plt.bar(x + width/2, mae_values, width, label='MAE',
                    color=bar_colors, edgecolor='black', hatch='//', alpha=0.6)

    # Add labels and title
    plt.ylabel('Metric Value')
    plt.title(f'{period_name} Period: RMSE and MAE across Currencies')
    plt.xticks(x, original_currency_cols)
    plt.legend()

    # Add value labels on top of bars
    autolabel(bars1)
    autolabel(bars2)

    plt.tight_layout()  # Adjust layout to prevent labels overlapping
    plt.savefig(f'{period_name}_rmse_mae_combined_bar_plot.pdf')  # Save the plot as PDF
    plt.close()  # Close the plot to free memory

print("\n--- Plot Generation and Saving Complete ---")

Data loaded successfully.
--- Calculating Performance Metrics ---

Processing Full Period for Metrics...
Training model for Full to calculate metrics...


  super().__init__(**kwargs)


  Performance Metrics for Full Test Set:
    USD_NGN: RMSE=33.4333, MAE=29.0345
    EUR_NGN: RMSE=200.6248, MAE=196.3051
    GBP_NGN: RMSE=125.5230, MAE=121.5888

Processing Pre-subsidy Period for Metrics...
Training model for Pre-subsidy to calculate metrics...


  super().__init__(**kwargs)


  Performance Metrics for Pre-subsidy Test Set:
    USD_NGN: RMSE=16.9509, MAE=16.4012
    EUR_NGN: RMSE=22.6990, MAE=21.9108
    GBP_NGN: RMSE=7.7473, MAE=6.3276

Processing Post-subsidy Period for Metrics...
Training model for Post-subsidy to calculate metrics...


  super().__init__(**kwargs)
  super().__init__(**kwargs)


  Performance Metrics for Post-subsidy Test Set:
    USD_NGN: RMSE=35.6594, MAE=28.5162
    EUR_NGN: RMSE=36.8112, MAE=28.1149
    GBP_NGN: RMSE=40.7283, MAE=32.7213

--- Performance Metrics Calculation Complete ---

--- Generating and Saving Plots as PDFs ---

Generating plots for Full Period...

Generating plots for Pre-subsidy Period...


  super().__init__(**kwargs)



Generating plots for Post-subsidy Period...


  super().__init__(**kwargs)



Generating combined RMSE and MAE bar plots for Full Period...

Generating combined RMSE and MAE bar plots for Pre-subsidy Period...

Generating combined RMSE and MAE bar plots for Post-subsidy Period...

--- Plot Generation and Saving Complete ---
