**Importing Libraries and Data**

In [None]:
#Importing libraries
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import root_mean_squared_error
from statsmodels.tsa.stattools import acf
import matplotlib.pyplot as plt
from mySSA import mySSA
import numpy as np
import warnings
import gc
import os
import sys

#Silencing library warnings so that the notebook output stays readable
warnings.filterwarnings("ignore")

#Reading in the data and extracting the unique account identifiers
BoostedTransactions = pd.read_csv("Boosted Transaction Dataset.csv")
AccountIDs = BoostedTransactions["AccountId"].values
UniqueAccounts = set(AccountIDs)

#Truncating every transaction date to the first day of its month
BoostedTransactions["TransactionDate"] = (
    pd.to_datetime(BoostedTransactions["TransactionDate"])
    .dt.to_period("M")
    .dt.to_timestamp()
)

#Grouping transactions by account and month and summing each group's net transaction amount.
#"Amount" is selected BEFORE aggregating so that no other column of the dataset is summed
#needlessly; the result is still a DataFrame with a single column named "sum".
GroupedSumOfTransactionsByAccountAndMonth = (
    BoostedTransactions
    .groupby(by=["AccountId", "TransactionDate"])["Amount"]
    .agg(["sum"])
)
print(GroupedSumOfTransactionsByAccountAndMonth)

#Creating a list to store all accounts' time series in
DataFrames = []

#Flattening the grouped totals once, outside the loop, because the result is the same for every account
ResetDataFrame = GroupedSumOfTransactionsByAccountAndMonth.reset_index()

#Imputing 0 values into the gaps in each account's time series
for Account in UniqueAccounts:
  #Keeping only the current account's monthly totals, indexed by month
  TimeSeries = ResetDataFrame.loc[ResetDataFrame["AccountId"] == Account, ["TransactionDate", "sum"]]
  TimeSeries = TimeSeries.set_index("TransactionDate")

  #Resampling to month-start frequency: months without transactions appear with a sum of 0
  TimeSeries = TimeSeries.resample("MS").sum()

  #Re-attaching the account identifier as the first column, for existing and imputed months alike
  TimeSeries.insert(0, "AccountId", Account)

  DataFrames.append(TimeSeries)

#Stacking the per-account time series and re-indexing the result by (account, month)
NewGroupedDataset = (
    pd.concat(DataFrames)
    .reset_index()
    .set_index(["AccountId", "TransactionDate"])
)
print(NewGroupedDataset)

#Running total of the monthly sums within each account (level 0 of the index is the account),
#flattened back into ordinary columns
GroupedCumulativeSumOfTransactionsByAccountAndMonth = (
    NewGroupedDataset
    .groupby(level=0)
    .cumsum()
    .reset_index()
)
print(GroupedCumulativeSumOfTransactionsByAccountAndMonth)

                                                          sum
AccountId                            TransactionDate         
0003a5ae-0c77-4372-b44d-882ef9874a28 2019-05-01      -5541.31
                                     2019-06-01      -3601.46
                                     2019-07-01       3151.74
                                     2019-08-01      -4801.13
                                     2019-09-01       3890.05
...                                                       ...
fff7f00c-c869-4310-b705-4503538f5ecf 2020-03-01       2974.84
                                     2020-04-01        382.93
                                     2020-05-01        166.15
                                     2020-06-01       -550.89
                                     2020-07-01      -2721.18

[10995 rows x 1 columns]
                                                            sum
AccountId                            TransactionDate           
291e9077-7c91-430a-936d-2faf0876f8fe 201

**Key SSA Loop**

This cell cannot be executed unless the `mySSA.py` module (which provides the `mySSA` class and can be found in the code folder) has been uploaded to the Colab environment.

In [None]:
#Creating models and forecasting for all accounts.
#For every account: split its cumulative monthly series 70/30 into training and testing data,
#fit an SSA model on the training part, pick the number of leading streams with the lowest
#test RMSE, append that error to "SSA Errors (Cumulative).csv", then forecast 12 further
#months and append the summary to "SSA Forecasts (Cumulative).csv".

#Minimum autocorrelation the strongest lag must reach to be used as the suspected frequency
Threshold = 0.2


def PrintExceptionDetails(Error):
    """Print the type, originating filename, line number and message of a caught exception."""
    ErrorTraceback = Error.__traceback__
    ErrorFilename = os.path.split(
        ErrorTraceback.tb_frame.f_code.co_filename
    )[1]
    print(f'Exception Type: {type(Error)}')
    print(f'Exception Filename: {ErrorFilename}')
    print(f'Exception Line Number: {ErrorTraceback.tb_lineno}')
    print(f'Exception Message: {Error}')


for i in UniqueAccounts:
    print(i)
    #Binding every name that the finally clause releases, so that the clean-up cannot raise a
    #NameError when an account is skipped or fails before these objects have been created
    SSA = Forecast = TrainData = TestData = ForecastingTestTimeSeries = None
    try:
        #Selecting only the current account's time series
        ForecastingTestTimeSeries = GroupedCumulativeSumOfTransactionsByAccountAndMonth[GroupedCumulativeSumOfTransactionsByAccountAndMonth["AccountId"] == i][["TransactionDate", "sum"]]

        ForecastingTestTimeSeries = ForecastingTestTimeSeries.set_index("TransactionDate")

        #Calculating the cutoff point between training and testing sets
        TrainEnd = int(0.7 * len(ForecastingTestTimeSeries))

        #Splitting the data into training and testing sets
        TrainData = ForecastingTestTimeSeries[:TrainEnd]
        TestData = ForecastingTestTimeSeries[TrainEnd:]

        #Assigning a window length L based on training data length, with L/2 being assigned unless L/2 is less than 2
        if len(TrainData) <= 2:
            L = len(TrainData)
        else:
            L = max(2, len(TrainData) // 2)

        #Calculating K and assigning the most significant lag based on significant autocorrelations, with 0 being assigned if none exist
        K = len(TrainData) - L + 1
        try:
            if K <= 1:
                MostSignificantLag = 0
            else:
                ACFValues = acf(TrainData.iloc[:, 0], nlags=K, fft=False)
                if np.isnan(ACFValues).any():
                    MostSignificantLag = 0
                else:
                    #Lag 0 is skipped because a series always correlates perfectly with itself
                    MostSignificantLag = ACFValues[1:].argmax() + 1
                    if ACFValues[MostSignificantLag] < Threshold:
                        MostSignificantLag = 0
        except Exception:
            #Any failure while estimating the autocorrelation falls back to "no suspected frequency"
            MostSignificantLag = 0

        #Creating the SSA model, embedding and decomposing
        if TrainData.empty or TrainData.isnull().all().any():
            print(f"Skipping account {i} due to empty or invalid TrainData")
            #Moving on to the next account; the finally clause below still runs
            continue
        SSA = mySSA(TrainData)

        SSA.embed(embedding_dimension=K, suspected_frequency=MostSignificantLag)

        SSA.decompose()

        #Getting contributing signals
        Contributions = SSA.view_s_contributions(adjust_scale=True, return_df=True)

        #Creating lists to store stream numbers and their RMSEs and SIs
        StreamNumbers = []
        ErrorsRMSE = []
        ErrorsSI = []

        #Testing each number of streams from the contributions to see which one returns the lowest RMSE
        for h in range(1, (len(Contributions)+1)):
            Streams = list(range(h))
            #Removing the attribute left behind by a previous forecast before forecasting again
            if hasattr(SSA, 'X_com_hat'):
                del SSA.X_com_hat
            Forecast = SSA.forecast_recurrent(steps_ahead=(len(ForecastingTestTimeSeries)-TrainEnd), singular_values=Streams, return_df=True)
            Forecast["Forecast"] = Forecast["Forecast"].clip(lower=-1e6, upper=1e6)
            Forecast["Forecast"] = Forecast["Forecast"].fillna(0)
            try:
                RMSE = root_mean_squared_error(TestData["sum"], Forecast["Forecast"][TrainEnd:])
                StreamNumbers.append(h)
                ErrorsRMSE.append(RMSE)
                #SI is the RMSE scaled by the mean absolute value of the test data
                ErrorsSI.append(abs(RMSE/abs(TestData["sum"]).mean()))
            except ValueError as eV:
                #RMSE is deliberately not printed here: it was never computed for this stream count
                print("Bad RMSE value encountered.")
                print(TestData["sum"])
                print(Forecast["Forecast"])
                PrintExceptionDetails(eV)
                break

        ErrorData = pd.DataFrame({"Streams" : StreamNumbers,
                         "RMSE" : ErrorsRMSE,
                         "SI" : ErrorsSI})
        #Taking the number of streams with the lowest RMSE and appending its error rates into the errors CSV file
        try:
            IdealNumberOfStreams = ErrorData.loc[ErrorData["RMSE"].idxmin()]["Streams"]
            LowestRMSE = ErrorData.loc[ErrorData["RMSE"].idxmin()]["RMSE"]
            LowestSI = ErrorData.loc[ErrorData["SI"].idxmin()]["SI"]
            AccountError = pd.DataFrame({"AccountID" : [i],
                                          "RMSE" : [LowestRMSE],
                                          "SI" : [LowestSI]})
            #The header row is only written when the file does not exist yet
            AccountError.to_csv("SSA Errors (Cumulative).csv", mode='a', header=not os.path.exists("SSA Errors (Cumulative).csv"), index=False)
        except ValueError:
            #Stopping the whole run, as no stream count could be selected for this account
            print("ValueError encountered")
            print(ErrorData)
            break

        #Taking the ideal streams and performing a 12-month forecast based on them
        IdealStreams = list(range(int(IdealNumberOfStreams)))

        if hasattr(SSA, 'X_com_hat'):
            del SSA.X_com_hat

        BestForecast = SSA.forecast_recurrent(steps_ahead=((len(ForecastingTestTimeSeries)-TrainEnd)+12), singular_values=IdealStreams, return_df=True)

        #Keeping only the last 12 forecast values, i.e. the months after the end of the test data
        FutureForecast = BestForecast[-12:]["Forecast"].values

        #Calculating starting amount, ending amount and net change and appending them into the forecasts CSV file
        NetChange = FutureForecast[-1] - FutureForecast[0]
        StartingAmount = FutureForecast[0]
        EndingAmount = FutureForecast[-1]

        ForecastData = pd.DataFrame({"AccountID" : [i],
                                     "Net Change over Forecast Period" : [NetChange],
                                     "Starting Amount" : [StartingAmount],
                                     "Ending Amount" : [EndingAmount]})

        ForecastData.to_csv("SSA Forecasts (Cumulative).csv", mode='a', header=not os.path.exists("SSA Forecasts (Cumulative).csv"), index=False)

        print("Processed account number",i)
    #Where an error occurs, the user is informed about it and the run stops
    except Exception as e:
        print(f"Error encountered processing account number {i}.")
        PrintExceptionDetails(e)
        break
    #Deleting model data to conserve RAM
    finally:
        del SSA, Forecast, TrainData, TestData, ForecastingTestTimeSeries
        gc.collect()
else:
    #Only reached when the loop ran over every account without being stopped by a break
    print("Code executed successfully.")

Output hidden; open in https://colab.research.google.com to view.

In [None]:
#Summarising the per-account errors written by the SSA loop: mean and median of each metric
Errors = pd.read_csv("SSA Errors (Cumulative).csv")

#Root mean squared error
RMSEValues = Errors["RMSE"]
MeanRMSE = RMSEValues.mean()
print("Mean RMSE:", MeanRMSE)
MedianRMSE = RMSEValues.median()
print("Median RMSE:", MedianRMSE)

#SI (the RMSE scaled by the mean absolute test value)
SIValues = Errors["SI"]
MeanSI = SIValues.mean()
print("Mean SI:", MeanSI)
MedianSI = SIValues.median()
print("Median SI:", MedianSI)

Mean RMSE: 317311.17644744704
Median RMSE: 10585.65184997053
Mean SI: 0.8958266371452679
Median SI: 0.6152184939762161
