**Importing Libraries and Data**

In [None]:
#Importing libraries
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import root_mean_squared_error
from statsmodels.tsa.stattools import acf
import matplotlib.pyplot as plt
from mySSA import mySSA
import numpy as np
import warnings
import gc
import os
import sys

#Ignoring warnings
warnings.filterwarnings("ignore")

#Reading in the data and extracting unique account values
BoostedTransactions = pd.read_csv("Boosted Transaction Dataset.csv")
AccountIDs = BoostedTransactions["AccountId"].values
UniqueAccounts = set(AccountIDs)

#Converting transaction dates into months (every date is truncated to the first day of its month)
BoostedTransactions["TransactionDate"] = pd.to_datetime(BoostedTransactions["TransactionDate"])
BoostedTransactions["TransactionDate"] = BoostedTransactions["TransactionDate"].dt.to_period("M").dt.to_timestamp()

#Grouping transactions by account and month and summing each grouping's net transaction amount
#"Amount" is selected before aggregating so that only that column is summed; the result is a DataFrame with a single
#"sum" column indexed by (AccountId, TransactionDate)
GroupedSumOfTransactionsByAccountAndMonth = BoostedTransactions.groupby(by=["AccountId", "TransactionDate"])["Amount"].agg(["sum"])
print(GroupedSumOfTransactionsByAccountAndMonth)
#Turning the group keys back into ordinary columns for the per-account filtering done later
GroupedSumOfTransactionsByAccountAndMonth = GroupedSumOfTransactionsByAccountAndMonth.reset_index()

                                                          sum
AccountId                            TransactionDate         
0003a5ae-0c77-4372-b44d-882ef9874a28 2019-05-01      -5541.31
                                     2019-06-01      -3601.46
                                     2019-07-01       3151.74
                                     2019-08-01      -4801.13
                                     2019-09-01       3890.05
...                                                       ...
fff7f00c-c869-4310-b705-4503538f5ecf 2020-03-01       2974.84
                                     2020-04-01        382.93
                                     2020-05-01        166.15
                                     2020-06-01       -550.89
                                     2020-07-01      -2721.18

[10995 rows x 1 columns]


**Key SSA Loop**

This cell will only execute if the `mySSA.py` module, which provides the `mySSA` class imported in the first cell, is present within the Colab environment.

In [None]:
#Creating models and forecasting for all accounts
#For every account: the monthly series is split 70/30 into training and testing sets, an SSA model is fitted to the
#training set, the number of streams with the lowest test RMSE is selected, its error statistics are appended to the
#errors CSV and the range of a 12-month-ahead forecast is appended to the forecasts CSV

#Minimum autocorrelation a lag must reach to be passed to the SSA embedding as the suspected frequency
Threshold = 0.2
#Number of months forecast beyond the end of each account's observed series
ForecastHorizon = 12
#Output files; rows are appended, and the header is written only when the file does not exist yet
ErrorsPath = "SSA Errors (Month-by-Month).csv"
ForecastsPath = "SSA Forecasts (Month-by-Month).csv"


def _ReportException(Error):
    """Print the type, filename, line number and message of a caught exception.

    Error: the exception instance bound by the enclosing `except` clause. The filename and line number are read from
    the first entry of its traceback.
    """
    ErrorTraceback = Error.__traceback__
    ErrorFilename = os.path.split(ErrorTraceback.tb_frame.f_code.co_filename)[1]
    ErrorMessage = str(Error)
    ErrorLineNumber = ErrorTraceback.tb_lineno

    print(f'Exception Type: {type(Error)}')
    print(f'Exception Filename: {ErrorFilename}')
    print(f'Exception Line Number: {ErrorLineNumber}')
    print(f'Exception Message: {ErrorMessage}')


for i in UniqueAccounts:
    print(i)
    #Binding every name released in the `finally` clause up front, so that the cleanup cannot raise a NameError when
    #an account is skipped or fails before all of them have been assigned
    SSA = Forecast = TrainData = TestData = ForecastingTestTimeSeries = None
    try:
        #Selecting only the selected account's time series and ensuring it has no gaps
        #(resampling to month-start frequency inserts the missing months, which sum to 0)
        AccountRows = GroupedSumOfTransactionsByAccountAndMonth[GroupedSumOfTransactionsByAccountAndMonth["AccountId"] == i]
        ForecastingTestTimeSeries = AccountRows[["TransactionDate", "sum"]].set_index("TransactionDate").resample("MS").sum()

        #Calculating the cutoff point between training and testing sets (first 70% of months are used for training)
        TrainEnd = int(0.7 * len(ForecastingTestTimeSeries))
        TestLength = len(ForecastingTestTimeSeries) - TrainEnd

        #Splitting the data into training and testing sets
        TrainData = ForecastingTestTimeSeries[:TrainEnd]
        TestData = ForecastingTestTimeSeries[TrainEnd:]

        #An account without usable training data is skipped; the remaining accounts are still processed
        if TrainData.empty or TrainData.isnull().all().any():
            print(f"Skipping account {i} due to empty or invalid TrainData")
            continue

        #Assigning window length L as N/2 unless N/2 is less than 2
        if len(TrainData) <= 2:
            L = len(TrainData)
        else:
            L = max(2, len(TrainData) // 2)

        #Calculating embedding dimension K based on window length L and determining seasonality based on significant autocorrelations
        #The lag stays 0 if K is too small, the ACF contains NaN, the strongest lag is below the threshold
        #or the ACF computation fails (best-effort: any failure here simply means "no seasonality detected")
        K = len(TrainData) - L + 1
        MostSignificantLag = 0
        if K > 1:
            try:
                ACFValues = acf(TrainData.iloc[:, 0], nlags=K, fft=False)
                if not np.isnan(ACFValues).any():
                    #Lag 0 is excluded from the search, hence the +1 to map back to a lag number
                    StrongestLag = ACFValues[1:].argmax() + 1
                    if ACFValues[StrongestLag] >= Threshold:
                        MostSignificantLag = StrongestLag
            except Exception:
                MostSignificantLag = 0

        #Fitting SSA model, embedding and decomposing
        SSA = mySSA(TrainData)
        SSA.embed(embedding_dimension=K, suspected_frequency=MostSignificantLag)
        SSA.decompose()

        #Extracting signals and contributions from the SSA
        #NOTE(review): presumably one row per singular-value stream - confirm against mySSA
        Contributions = SSA.view_s_contributions(adjust_scale=True, return_df=True)

        #Declaring lists to store stream numbers and error statistics
        StreamNumbers = []
        ErrorsRMSE = []
        ErrorsSI = []

        #Testing stream numbers to find the number with the lowest RMSE
        for h in range(1, (len(Contributions)+1)):
            Streams = list(range(h))
            #Removing the reconstruction left over from the previous forecast, if the model holds one
            if hasattr(SSA, 'X_com_hat'):
                del SSA.X_com_hat
            Forecast = SSA.forecast_recurrent(steps_ahead=TestLength, singular_values=Streams, return_df=True)
            #Bounding extreme forecasts and replacing missing values before scoring
            Forecast["Forecast"] = Forecast["Forecast"].clip(lower=-1e6, upper=1e6)
            Forecast["Forecast"] = Forecast["Forecast"].fillna(0)
            try:
                RMSE = root_mean_squared_error(TestData["sum"], Forecast["Forecast"][TrainEnd:])
                StreamNumbers.append(h)
                ErrorsRMSE.append(RMSE)
                #Scatter index: RMSE relative to the mean absolute test value
                ErrorsSI.append(abs(RMSE/abs(TestData["sum"]).mean()))
            except ValueError as eV:
                #No RMSE exists for this stream count when the metric itself raised, so only the inputs are shown
                print("Bad RMSE value encountered.")
                print(TestData["sum"])
                print(Forecast["Forecast"])
                _ReportException(eV)
                break

        ErrorData = pd.DataFrame({"Streams" : StreamNumbers,
                                  "RMSE" : ErrorsRMSE,
                                  "SI" : ErrorsSI})
        #Taking stream number with lowest RMSE and appending its error statistics into the errors CSV
        try:
            BestRow = ErrorData.loc[ErrorData["RMSE"].idxmin()]
            IdealNumberOfStreams = BestRow["Streams"]
            LowestRMSE = BestRow["RMSE"]
            LowestSI = ErrorData.loc[ErrorData["SI"].idxmin()]["SI"]
            AccountError = pd.DataFrame({"AccountID" : [i],
                                         "RMSE" : [LowestRMSE],
                                         "SI" : [LowestSI]})
            AccountError.to_csv(ErrorsPath, mode='a', header=not os.path.exists(ErrorsPath), index=False)
        except ValueError:
            #Raised by idxmin when no stream count produced a score
            #NOTE(review): this stops the whole run, not just this account - confirm fail-fast is intended
            print("ValueError encountered")
            print(ErrorData)
            break

        #Forecasting the next 12 months with the ideal forecast
        IdealStreams = list(range(int(IdealNumberOfStreams)))

        if hasattr(SSA, 'X_com_hat'):
            del SSA.X_com_hat

        BestForecast = SSA.forecast_recurrent(steps_ahead=(TestLength+ForecastHorizon), singular_values=IdealStreams, return_df=True)

        #Keeping only the months that lie beyond the observed series
        FutureForecast = BestForecast[-ForecastHorizon:]["Forecast"].values

        #Calculating highest amount, lowest amount and difference and appending them into the forecasts CSV
        HighestAmount = FutureForecast.max()
        LowestAmount = FutureForecast.min()

        #A forecast containing NaN is recorded as a flat zero range
        if np.isnan(HighestAmount) or np.isnan(LowestAmount):
            HighestAmount = 0
            LowestAmount = 0

        Difference = HighestAmount - LowestAmount

        ForecastData = pd.DataFrame({"AccountID" : [i],
                                     "Highest Amount" : [HighestAmount],
                                     "Lowest Amount" : [LowestAmount],
                                     "Difference" : [Difference]})

        ForecastData.to_csv(ForecastsPath, mode='a', header=not os.path.exists(ForecastsPath), index=False)

        print("Processed account number",i)
    #Where an error occurs, the user is informed about it
    #NOTE(review): this stops the whole run, not just this account - confirm fail-fast is intended
    except Exception as e:
        print(f"Error encountered processing account number {i}.")
        _ReportException(e)
        break
    #Deleting model data to conserve RAM
    finally:
        del SSA, Forecast, TrainData, TestData, ForecastingTestTimeSeries
        gc.collect()
else:
    #Reached only when the loop was not aborted by one of the `break` statements above
    print("Code executed successfully.")

Output hidden; open in https://colab.research.google.com to view.

In [None]:
#Summarising the per-account error statistics stored in the errors CSV
Errors = pd.read_csv("SSA Errors (Month-by-Month).csv")

#Mean and median of the RMSE column
RMSEValues = Errors["RMSE"]
MeanRMSE, MedianRMSE = RMSEValues.mean(), RMSEValues.median()
print("Mean RMSE:", MeanRMSE)
print("Median RMSE:", MedianRMSE)

#Mean and median of the SI column
SIValues = Errors["SI"]
MeanSI, MedianSI = SIValues.mean(), SIValues.median()
print("Mean SI:", MeanSI)
print("Median SI:", MedianSI)

Mean RMSE: 21288.24162596512
Median RMSE: 3637.5177942738487
Mean SI: 4.373472172357298
Median SI: 1.449840009301234
