**Importing Libraries and Data**

In [None]:
#Importing libraries
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import root_mean_squared_error
from statsmodels.tsa.stattools import acf
import matplotlib.pyplot as plt
import prophet
import numpy as np
import warnings
import gc
import os
import sys
import json
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout
from keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.preprocessing import MinMaxScaler

#Ignoring warnings
warnings.filterwarnings("ignore")

#Reading in data and extracting account ID values
BoostedTransactions = pd.read_csv("Longer Subset of Transaction Data (14 Months).csv")
AccountIDs = BoostedTransactions["AccountId"].values
#Keeping the IDs in first-appearance order; a set gives no stable ordering for string IDs
UniqueAccounts = list(dict.fromkeys(AccountIDs))

#Converting transaction dates into months
BoostedTransactions["TransactionDate"] = pd.to_datetime(BoostedTransactions["TransactionDate"])
BoostedTransactions["TransactionDate"] = BoostedTransactions["TransactionDate"].dt.to_period("M").dt.to_timestamp()

#Grouping transactions by account and month and calculating the net sum of transactions
#(selecting "Amount" before aggregating so that no other column is summed and then discarded)
GroupedSumOfTransactionsByAccountAndMonth = BoostedTransactions.groupby(by=["AccountId", "TransactionDate"])["Amount"].agg(["sum"])
print(GroupedSumOfTransactionsByAccountAndMonth)

#Creating a list to store each account's manipulated time series in
DataFrames = []

#Flattening the grouped index once, rather than once per account
ResetDataFrame = GroupedSumOfTransactionsByAccountAndMonth.reset_index()

#Imputing 0 values into any gaps in each account's time series
for Account in UniqueAccounts:
  TimeSeries = ResetDataFrame[ResetDataFrame["AccountId"] == Account]
  TimeSeries = TimeSeries.set_index("TransactionDate")

  #Resampling only the numeric column: months without transactions get a sum of 0
  TimeSeries = TimeSeries[["sum"]].resample("MS").sum()
  #Labelling every row, including the imputed months, with the account ID
  TimeSeries["AccountId"] = Account

  DataFrames.append(TimeSeries)

#Creating a combined DataFrame containing all accounts' time series
NewGroupedDataset = pd.concat(DataFrames)
NewGroupedDataset = NewGroupedDataset.reset_index()
NewGroupedDataset = NewGroupedDataset.set_index(["AccountId", "TransactionDate"])
print(NewGroupedDataset)

#Calculating the cumulative transaction sum across the time series of each account
GroupedCumulativeSumOfTransactionsByAccountAndMonth = NewGroupedDataset.groupby(level=0).cumsum().reset_index()
print(GroupedCumulativeSumOfTransactionsByAccountAndMonth)

                                                          sum
AccountId                            TransactionDate         
0003a5ae-0c77-4372-b44d-882ef9874a28 2019-05-01      -5541.31
                                     2019-06-01      -3601.46
                                     2019-07-01       3151.74
                                     2019-08-01      -4801.13
                                     2019-09-01       3890.05
...                                                       ...
fff7f00c-c869-4310-b705-4503538f5ecf 2020-03-01       2974.84
                                     2020-04-01        382.93
                                     2020-05-01        166.15
                                     2020-06-01       -550.89
                                     2020-07-01      -2721.18

[7632 rows x 1 columns]
                                                            sum
AccountId                            TransactionDate           
22d44841-18bd-4482-a333-6ed410071c64 2019

**Generating the list of unique account IDs and writing it to a JSON file (run only once, before the first batch)**

Only run this code block before running the first batch. This is to ensure the order of accounts is maintained and no accounts are duplicated.

In [None]:
#Extracting unique account IDs and writing them into a JSON file
AccountIDs = BoostedTransactions["AccountId"].values
#tolist() converts any NumPy scalars into plain Python values, which json.dump can serialise;
#dict.fromkeys removes duplicates while keeping first-appearance order
UniqueAccounts = list(dict.fromkeys(AccountIDs.tolist()))
with open("UniqueAccounts.json", "w") as File:
  json.dump(UniqueAccounts, File)


**Reading unique account IDs and manipulating time series**

In [None]:
#Reading unique accounts JSON file
with open("UniqueAccounts.json", "r") as File:
  UniqueAccounts = json.load(File)

#Converting transaction dates into months
BoostedTransactions["TransactionDate"] = pd.to_datetime(BoostedTransactions["TransactionDate"])
BoostedTransactions["TransactionDate"] = BoostedTransactions["TransactionDate"].dt.to_period("M").dt.to_timestamp()

#Grouping the dataset by account and month and calculating the net sum of transaction amounts per month
#(selecting "Amount" before aggregating so that no other column is summed and then discarded)
GroupedSumOfTransactionsByAccountAndMonth = BoostedTransactions.groupby(by=["AccountId", "TransactionDate"])["Amount"].agg(["sum"])
print(GroupedSumOfTransactionsByAccountAndMonth)

#Creating a list to store each account's time series in
DataFrames = []

#Flattening the grouped index once, rather than once per account
ResetDataFrame = GroupedSumOfTransactionsByAccountAndMonth.reset_index()

#Imputing 0 values into any gaps in each account's time series
for Account in UniqueAccounts:
  TimeSeries = ResetDataFrame[ResetDataFrame["AccountId"] == Account]
  TimeSeries = TimeSeries.set_index("TransactionDate")

  #Resampling only the numeric column: months without transactions get a sum of 0
  TimeSeries = TimeSeries[["sum"]].resample("MS").sum()
  #Labelling every row, including the imputed months, with the account ID
  TimeSeries["AccountId"] = Account

  DataFrames.append(TimeSeries)

#Creating a new DataFrame containing all accounts' time series
NewGroupedDataset = pd.concat(DataFrames)
NewGroupedDataset = NewGroupedDataset.reset_index()
NewGroupedDataset = NewGroupedDataset.set_index(["AccountId", "TransactionDate"])
print(NewGroupedDataset)

#Calculating the cumulative transaction sum for each account across the time series
GroupedCumulativeSumOfTransactionsByAccountAndMonth = NewGroupedDataset.groupby(level=0).cumsum().reset_index()
print(GroupedCumulativeSumOfTransactionsByAccountAndMonth)

                                                          sum
AccountId                            TransactionDate         
0003a5ae-0c77-4372-b44d-882ef9874a28 2019-05-01      -5541.31
                                     2019-06-01      -3601.46
                                     2019-07-01       3151.74
                                     2019-08-01      -4801.13
                                     2019-09-01       3890.05
...                                                       ...
fff7f00c-c869-4310-b705-4503538f5ecf 2020-03-01       2974.84
                                     2020-04-01        382.93
                                     2020-05-01        166.15
                                     2020-06-01       -550.89
                                     2020-07-01      -2721.18

[7632 rows x 1 columns]
                                                            sum
AccountId                            TransactionDate           
16268                                2019

**Key LSTM Loops (split up for memory reasons)**

Batch for first 150 accounts

In [None]:
#Creating models and forecasting for first 150 accounts
#Output files that each processed account appends one row to
ErrorsFile = "LSTM Longer Subset Errors (Cumulative).csv"
ForecastsFile = "LSTM Longer Subset Forecasts (Cumulative).csv"

for idx, i in enumerate(UniqueAccounts):
    #Ensuring that only the first 150 accounts are processed
    if idx >= 150:
        break
    print(i)
    #Pre-binding the names that the finally block deletes, so that `del` cannot raise a
    #NameError (and hide the real error) when an account fails before they are assigned
    Model = Forecast = TrainData = TestData = ForecastingTimeSeries = None
    try:
        #Selecting only the selected account's time series
        ForecastingTimeSeries = GroupedCumulativeSumOfTransactionsByAccountAndMonth[GroupedCumulativeSumOfTransactionsByAccountAndMonth["AccountId"] == i][["TransactionDate", "sum"]]

        #Calculating the cutoff between training and testing sets (first 70% of rows are used for training)
        TrainEnd = int(0.7 * len(ForecastingTimeSeries))

        #Splitting the data into training and testing sets
        TrainData = ForecastingTimeSeries[:TrainEnd]
        TestData = ForecastingTimeSeries[TrainEnd:]

        #Setting transaction month as the index
        TrainData = TrainData.set_index("TransactionDate")

        #Scaling the data using min/max scaling (the scaler is fitted on the training rows only)
        Scaler = MinMaxScaler(feature_range=(0,1))
        ScaledTrain = Scaler.fit_transform(TrainData)

        #Generating a batched time series for the LSTM to process
        TimeStep = 1
        TimeSeriesData = TimeseriesGenerator(ScaledTrain, ScaledTrain, length=TimeStep, batch_size=1)

        #Declaring model layers
        Model = Sequential()
        Model.add(LSTM(units=16, return_sequences=True, activation="tanh", input_shape=(TimeStep, 1)))
        Model.add(LSTM(units=16, return_sequences=True, activation="tanh"))
        Model.add(LSTM(units=16, return_sequences=True, activation="tanh"))
        Model.add(LSTM(units=16, return_sequences=True, activation="tanh"))
        Model.add(LSTM(units=16, activation="tanh"))
        Model.add(Dense(units=1, activation="sigmoid"))
        Model.add(Dense(units=1, activation="sigmoid"))
        Model.add(Dense(1))
        Model.compile(optimizer="adam", loss="mean_squared_error")

        #Training the model
        Model.fit(TimeSeriesData, epochs=20, batch_size=1, verbose=0)

        #Creating a list to store test predictions in
        TestPredictions = []

        #Creating the first testing batch from the last TimeStep scaled training values
        FirstTestingBatch = ScaledTrain[-TimeStep:]
        CurrentBatch = FirstTestingBatch.reshape((1, TimeStep, 1))

        #Making predictions on the test dataset; each prediction is fed back in as the next input
        #(verbose=0 stops Keras printing one progress bar per predicted step)
        for h in range(len(TestData)):
            CurrentPrediction = Model.predict(CurrentBatch, verbose=0)[0]
            TestPredictions.append(CurrentPrediction)
            CurrentBatch = np.append(CurrentBatch[:, 1:, :], [[CurrentPrediction]], axis=1)

        #Unscaling the test predictions
        Forecast = Scaler.inverse_transform(TestPredictions)

        #Calculating error rates and appending them into the errors CSV
        try:
            RMSE = root_mean_squared_error(Forecast, TestData["sum"])
            #SI is the RMSE divided by the mean absolute test value
            SI = abs(RMSE/abs(TestData["sum"]).mean())
            AccountError = pd.DataFrame({"AccountID" : [i],
                                         "RMSE" : [RMSE],
                                         "SI" : [SI]})
            #The header row is only written when the file does not exist yet
            AccountError.to_csv(ErrorsFile, mode='a', header=not os.path.exists(ErrorsFile), index=False)
        #If a ValueError occurs, the loop is broken
        except ValueError:
            print("ValueError encountered")
            break

        #Creating a list to store future predictions
        FuturePredictions = []

        #Creating the first future batch from the last TimeStep test predictions
        FirstFutureBatch = np.array(TestPredictions[-TimeStep:])
        CurrentBatch = FirstFutureBatch.reshape((1, TimeStep, 1))

        #Forecasting the next 12 months
        for j in range(12):
            CurrentPrediction = Model.predict(CurrentBatch, verbose=0)[0]
            FuturePredictions.append(CurrentPrediction)
            CurrentBatch = np.append(CurrentBatch[:, 1:, :], [[CurrentPrediction]], axis=1)

        #Unscaling the future forecast
        FutureForecast = Scaler.inverse_transform(FuturePredictions)

        #Calculating the starting amount, ending amount and net change and appending these into the forecasts CSV
        #NOTE(review): FutureForecast[k] is a length-1 array, so these cells appear to be written as "[value]";
        #left unchanged so new rows match rows already appended to the file - confirm before changing
        NetChange = FutureForecast[-1] - FutureForecast[0]
        StartingAmount = FutureForecast[0]
        EndingAmount = FutureForecast[-1]

        ForecastData = pd.DataFrame({"AccountID" : [i],
                                     "Net Change over Forecast Period" : [NetChange],
                                     "Starting Amount" : [StartingAmount],
                                     "Ending Amount" : [EndingAmount]})

        ForecastData.to_csv(ForecastsFile, mode='a', header=not os.path.exists(ForecastsFile), index=False)

        print("Processed account number",i)
    #Where an error occurs, the user is informed about it and the batch stops
    except Exception as e:
        print(f"Error encountered processing account number {i}.")
        ErrorType, ErrorObject, ErrorTraceback = sys.exc_info()

        ErrorFilename = os.path.split(
            ErrorTraceback.tb_frame.f_code.co_filename
        )[1]

        ErrorMessage = str(e)

        ErrorLineNumber = ErrorTraceback.tb_lineno

        print(f'Exception Type: {ErrorType}')

        print(f'Exception Filename: {ErrorFilename}')

        print(f'Exception Line Number: {ErrorLineNumber}')

        print(f'Exception Message: {ErrorMessage}')
        break
    #Deleting model data from memory to conserve RAM (also runs when the loop is broken)
    finally:
        del Model, Forecast, TrainData, TestData, ForecastingTimeSeries
        gc.collect()


16268
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 511ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

Batch for accounts 150-300

In [None]:
#Creating models and forecasting for accounts 150-300
#Output files that each processed account appends one row to; the forecasts file now
#matches the one used by the other batches instead of "LSTM Forecasts (Cumulative).csv"
ErrorsFile = "LSTM Longer Subset Errors (Cumulative).csv"
ForecastsFile = "LSTM Longer Subset Forecasts (Cumulative).csv"

for idx, i in enumerate(UniqueAccounts):
    #Ensuring that only accounts 150-300 are processed
    if idx < 150:
        continue
    if idx >= 300:
        break
    print(i)
    #Pre-binding the names that the finally block deletes, so that `del` cannot raise a
    #NameError (and hide the real error) when an account fails before they are assigned
    Model = Forecast = TrainData = TestData = ForecastingTimeSeries = None
    try:
        #Selecting only the chosen account's time series
        ForecastingTimeSeries = GroupedCumulativeSumOfTransactionsByAccountAndMonth[GroupedCumulativeSumOfTransactionsByAccountAndMonth["AccountId"] == i][["TransactionDate", "sum"]]

        #Calculating the cutoff point between training and testing sets (first 70% of rows are used for training)
        TrainEnd = int(0.7 * len(ForecastingTimeSeries))

        #Splitting the data into training and testing sets
        TrainData = ForecastingTimeSeries[:TrainEnd]
        TestData = ForecastingTimeSeries[TrainEnd:]

        #Setting transaction month as the index
        TrainData = TrainData.set_index("TransactionDate")

        #Scaling the data using min/max scaling (the scaler is fitted on the training rows only)
        Scaler = MinMaxScaler(feature_range=(0,1))
        ScaledTrain = Scaler.fit_transform(TrainData)

        #Generating a batched time series for the LSTM to process
        TimeStep = 1
        TimeSeriesData = TimeseriesGenerator(ScaledTrain, ScaledTrain, length=TimeStep, batch_size=1)

        #Declaring model layers
        Model = Sequential()
        Model.add(LSTM(units=16, return_sequences=True, activation="tanh", input_shape=(TimeStep, 1)))
        Model.add(LSTM(units=16, return_sequences=True, activation="tanh"))
        Model.add(LSTM(units=16, return_sequences=True, activation="tanh"))
        Model.add(LSTM(units=16, return_sequences=True, activation="tanh"))
        Model.add(LSTM(units=16, activation="tanh"))
        Model.add(Dense(units=1, activation="sigmoid"))
        Model.add(Dense(units=1, activation="sigmoid"))
        Model.add(Dense(1))
        Model.compile(optimizer="adam", loss="mean_squared_error")

        #Training the model
        Model.fit(TimeSeriesData, epochs=20, batch_size=1, verbose=0)

        #Creating a list to store test predictions
        TestPredictions = []

        #Creating the first testing batch from the last TimeStep scaled training values
        FirstTestingBatch = ScaledTrain[-TimeStep:]
        CurrentBatch = FirstTestingBatch.reshape((1, TimeStep, 1))

        #Forecasting on the test data; each prediction is fed back in as the next input
        #(verbose=0 stops Keras printing one progress bar per predicted step)
        for h in range(len(TestData)):
            CurrentPrediction = Model.predict(CurrentBatch, verbose=0)[0]
            TestPredictions.append(CurrentPrediction)
            CurrentBatch = np.append(CurrentBatch[:, 1:, :], [[CurrentPrediction]], axis=1)

        #Unscaling the test forecast
        Forecast = Scaler.inverse_transform(TestPredictions)

        #Calculating error rates and appending them into the errors CSV
        try:
            RMSE = root_mean_squared_error(Forecast, TestData["sum"])
            #SI is the RMSE divided by the mean absolute test value
            SI = abs(RMSE/abs(TestData["sum"]).mean())
            AccountError = pd.DataFrame({"AccountID" : [i],
                                         "RMSE" : [RMSE],
                                         "SI" : [SI]})
            #The header row is only written when the file does not exist yet
            AccountError.to_csv(ErrorsFile, mode='a', header=not os.path.exists(ErrorsFile), index=False)
        #If a ValueError occurs, the loop is broken
        except ValueError:
            print("ValueError encountered")
            break

        #Creating a list to store future predictions
        FuturePredictions = []

        #Creating the first future batch from the last TimeStep test predictions
        FirstFutureBatch = np.array(TestPredictions[-TimeStep:])
        CurrentBatch = FirstFutureBatch.reshape((1, TimeStep, 1))

        #Forecasting the next 12 months
        for j in range(12):
            CurrentPrediction = Model.predict(CurrentBatch, verbose=0)[0]
            FuturePredictions.append(CurrentPrediction)
            CurrentBatch = np.append(CurrentBatch[:, 1:, :], [[CurrentPrediction]], axis=1)

        #Unscaling the future forecast
        FutureForecast = Scaler.inverse_transform(FuturePredictions)

        #Calculating starting amount, ending amount and net change and appending them into the forecasts CSV
        #NOTE(review): FutureForecast[k] is a length-1 array, so these cells appear to be written as "[value]";
        #left unchanged so new rows match rows already appended to the file - confirm before changing
        NetChange = FutureForecast[-1] - FutureForecast[0]
        StartingAmount = FutureForecast[0]
        EndingAmount = FutureForecast[-1]

        ForecastData = pd.DataFrame({"AccountID" : [i],
                                     "Net Change over Forecast Period" : [NetChange],
                                     "Starting Amount" : [StartingAmount],
                                     "Ending Amount" : [EndingAmount]})

        ForecastData.to_csv(ForecastsFile, mode='a', header=not os.path.exists(ForecastsFile), index=False)

        print("Processed account number",i)
    #Where an error occurs, the user is informed about it and the batch stops
    except Exception as e:
        print(f"Error encountered processing account number {i}.")
        ErrorType, ErrorObject, ErrorTraceback = sys.exc_info()

        ErrorFilename = os.path.split(
            ErrorTraceback.tb_frame.f_code.co_filename
        )[1]

        ErrorMessage = str(e)

        ErrorLineNumber = ErrorTraceback.tb_lineno

        print(f'Exception Type: {ErrorType}')

        print(f'Exception Filename: {ErrorFilename}')

        print(f'Exception Line Number: {ErrorLineNumber}')

        print(f'Exception Message: {ErrorMessage}')
        break
    #Deleting model data from memory to conserve RAM (also runs when the loop is broken)
    finally:
        del Model, Forecast, TrainData, TestData, ForecastingTimeSeries
        gc.collect()

a54aa6f3-78db-4139-aff8-785b41d7ea13
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 746ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━

Batch for accounts 300-450

In [None]:
#Creating models and forecasting for accounts 300-450
#Output files that each processed account appends one row to
ErrorsFile = "LSTM Longer Subset Errors (Cumulative).csv"
ForecastsFile = "LSTM Longer Subset Forecasts (Cumulative).csv"

for idx, i in enumerate(UniqueAccounts):
    #Ensuring that only accounts 300-450 are processed
    if idx < 300:
        continue
    if idx >= 450:
        break
    print(i)
    #Pre-binding the names that the finally block deletes, so that `del` cannot raise a
    #NameError (and hide the real error) when an account fails before they are assigned
    Model = Forecast = TrainData = TestData = ForecastingTimeSeries = None
    try:
        #Selecting only the chosen account's time series
        ForecastingTimeSeries = GroupedCumulativeSumOfTransactionsByAccountAndMonth[GroupedCumulativeSumOfTransactionsByAccountAndMonth["AccountId"] == i][["TransactionDate", "sum"]]

        #Calculating the cutoff point between training and testing sets (first 70% of rows are used for training)
        TrainEnd = int(0.7 * len(ForecastingTimeSeries))

        #Splitting the data into training and testing sets
        TrainData = ForecastingTimeSeries[:TrainEnd]
        TestData = ForecastingTimeSeries[TrainEnd:]

        #Setting transaction month as the index
        TrainData = TrainData.set_index("TransactionDate")

        #Scaling the data using min/max scaling (the scaler is fitted on the training rows only)
        Scaler = MinMaxScaler(feature_range=(0,1))
        ScaledTrain = Scaler.fit_transform(TrainData)

        #Generating a batched time series that the LSTM can process
        TimeStep = 1
        TimeSeriesData = TimeseriesGenerator(ScaledTrain, ScaledTrain, length=TimeStep, batch_size=1)

        #Declaring model layers
        Model = Sequential()
        Model.add(LSTM(units=16, return_sequences=True, activation="tanh", input_shape=(TimeStep, 1)))
        Model.add(LSTM(units=16, return_sequences=True, activation="tanh"))
        Model.add(LSTM(units=16, return_sequences=True, activation="tanh"))
        Model.add(LSTM(units=16, return_sequences=True, activation="tanh"))
        Model.add(LSTM(units=16, activation="tanh"))
        Model.add(Dense(units=1, activation="sigmoid"))
        Model.add(Dense(units=1, activation="sigmoid"))
        Model.add(Dense(1))
        Model.compile(optimizer="adam", loss="mean_squared_error")

        #Training the model
        Model.fit(TimeSeriesData, epochs=20, batch_size=1, verbose=0)

        #Creating a list to store test predictions
        TestPredictions = []

        #Generating the first test batch from the last TimeStep scaled training values
        FirstTestingBatch = ScaledTrain[-TimeStep:]
        CurrentBatch = FirstTestingBatch.reshape((1, TimeStep, 1))

        #Forecasting on the test dataset; each prediction is fed back in as the next input
        #(verbose=0 stops Keras printing one progress bar per predicted step)
        for h in range(len(TestData)):
            CurrentPrediction = Model.predict(CurrentBatch, verbose=0)[0]
            TestPredictions.append(CurrentPrediction)
            CurrentBatch = np.append(CurrentBatch[:, 1:, :], [[CurrentPrediction]], axis=1)

        #Unscaling the test forecast
        Forecast = Scaler.inverse_transform(TestPredictions)

        #Calculating error rates and appending them into the errors CSV
        try:
            RMSE = root_mean_squared_error(Forecast, TestData["sum"])
            #SI is the RMSE divided by the mean absolute test value
            SI = abs(RMSE/abs(TestData["sum"]).mean())
            AccountError = pd.DataFrame({"AccountID" : [i],
                                         "RMSE" : [RMSE],
                                         "SI" : [SI]})
            #The header row is only written when the file does not exist yet
            AccountError.to_csv(ErrorsFile, mode='a', header=not os.path.exists(ErrorsFile), index=False)
        #When a ValueError occurs, the loop is broken
        except ValueError:
            print("ValueError encountered")
            break

        #Creating a list to store future predictions
        FuturePredictions = []

        #Creating the first future batch from the last TimeStep test predictions
        FirstFutureBatch = np.array(TestPredictions[-TimeStep:])
        CurrentBatch = FirstFutureBatch.reshape((1, TimeStep, 1))

        #Forecasting the next 12 months
        for j in range(12):
            CurrentPrediction = Model.predict(CurrentBatch, verbose=0)[0]
            FuturePredictions.append(CurrentPrediction)
            CurrentBatch = np.append(CurrentBatch[:, 1:, :], [[CurrentPrediction]], axis=1)

        #Unscaling the future forecast
        FutureForecast = Scaler.inverse_transform(FuturePredictions)

        #Calculating starting amount, ending amount and net change and appending them into the forecasts CSV
        #NOTE(review): FutureForecast[k] is a length-1 array, so these cells appear to be written as "[value]";
        #left unchanged so new rows match rows already appended to the file - confirm before changing
        NetChange = FutureForecast[-1] - FutureForecast[0]
        StartingAmount = FutureForecast[0]
        EndingAmount = FutureForecast[-1]

        ForecastData = pd.DataFrame({"AccountID" : [i],
                                     "Net Change over Forecast Period" : [NetChange],
                                     "Starting Amount" : [StartingAmount],
                                     "Ending Amount" : [EndingAmount]})

        ForecastData.to_csv(ForecastsFile, mode='a', header=not os.path.exists(ForecastsFile), index=False)

        print("Processed account number",i)
    #When an error occurs, the user is informed about it and the batch stops
    except Exception as e:
        print(f"Error encountered processing account number {i}.")
        ErrorType, ErrorObject, ErrorTraceback = sys.exc_info()

        ErrorFilename = os.path.split(
            ErrorTraceback.tb_frame.f_code.co_filename
        )[1]

        ErrorMessage = str(e)

        ErrorLineNumber = ErrorTraceback.tb_lineno

        print(f'Exception Type: {ErrorType}')

        print(f'Exception Filename: {ErrorFilename}')

        print(f'Exception Line Number: {ErrorLineNumber}')

        print(f'Exception Message: {ErrorMessage}')
        break
    #Deleting model data from memory to conserve RAM (also runs when the loop is broken)
    finally:
        del Model, Forecast, TrainData, TestData, ForecastingTimeSeries
        gc.collect()

63798331-704a-4630-86f9-da64cc42ad40
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 536ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━

Batch for final 68 accounts

In [None]:
#Creating models and forecasting for the final 68 accounts
#Output files that each processed account appends one row to
ErrorsFile = "LSTM Longer Subset Errors (Cumulative).csv"
ForecastsFile = "LSTM Longer Subset Forecasts (Cumulative).csv"

for idx, i in enumerate(UniqueAccounts):
    #Ensuring that only the final 68 accounts are processed (positions 450 up to, but excluding, 600)
    if idx < 450:
        continue
    if idx >= 600:
        break
    print(i)
    #Pre-binding the names that the finally block deletes, so that `del` cannot raise a
    #NameError (and hide the real error) when an account fails before they are assigned
    Model = Forecast = TrainData = TestData = ForecastingTimeSeries = None
    try:
        #Selecting only the chosen account's time series
        ForecastingTimeSeries = GroupedCumulativeSumOfTransactionsByAccountAndMonth[GroupedCumulativeSumOfTransactionsByAccountAndMonth["AccountId"] == i][["TransactionDate", "sum"]]

        #Calculating the cutoff point between the training and testing sets (first 70% of rows are used for training)
        TrainEnd = int(0.7 * len(ForecastingTimeSeries))

        #Splitting the data into training and testing sets
        TrainData = ForecastingTimeSeries[:TrainEnd]
        TestData = ForecastingTimeSeries[TrainEnd:]

        #Setting the transaction month as an index
        TrainData = TrainData.set_index("TransactionDate")

        #Scaling the data using min/max scaling (the scaler is fitted on the training rows only)
        Scaler = MinMaxScaler(feature_range=(0,1))
        ScaledTrain = Scaler.fit_transform(TrainData)

        #Generating a batched time series that the LSTM can process
        TimeStep = 1
        TimeSeriesData = TimeseriesGenerator(ScaledTrain, ScaledTrain, length=TimeStep, batch_size=1)

        #Declaring model layers
        Model = Sequential()
        Model.add(LSTM(units=16, return_sequences=True, activation="tanh", input_shape=(TimeStep, 1)))
        Model.add(LSTM(units=16, return_sequences=True, activation="tanh"))
        Model.add(LSTM(units=16, return_sequences=True, activation="tanh"))
        Model.add(LSTM(units=16, return_sequences=True, activation="tanh"))
        Model.add(LSTM(units=16, activation="tanh"))
        Model.add(Dense(units=1, activation="sigmoid"))
        Model.add(Dense(units=1, activation="sigmoid"))
        Model.add(Dense(1))
        Model.compile(optimizer="adam", loss="mean_squared_error")

        #Training the model
        Model.fit(TimeSeriesData, epochs=20, batch_size=1, verbose=0)

        #Creating a list to store test predictions
        TestPredictions = []

        #Creating the first testing batch from the last TimeStep scaled training values
        FirstTestingBatch = ScaledTrain[-TimeStep:]
        CurrentBatch = FirstTestingBatch.reshape((1, TimeStep, 1))

        #Forecasting on the test data; each prediction is fed back in as the next input
        #(verbose=0 stops Keras printing one progress bar per predicted step)
        for h in range(len(TestData)):
            CurrentPrediction = Model.predict(CurrentBatch, verbose=0)[0]
            TestPredictions.append(CurrentPrediction)
            CurrentBatch = np.append(CurrentBatch[:, 1:, :], [[CurrentPrediction]], axis=1)

        #Unscaling the test forecast
        Forecast = Scaler.inverse_transform(TestPredictions)

        #Calculating the error rates and appending into the errors CSV file
        try:
            RMSE = root_mean_squared_error(Forecast, TestData["sum"])
            #SI is the RMSE divided by the mean absolute test value
            SI = abs(RMSE/abs(TestData["sum"]).mean())
            AccountError = pd.DataFrame({"AccountID" : [i],
                                         "RMSE" : [RMSE],
                                         "SI" : [SI]})
            #The header row is only written when the file does not exist yet
            AccountError.to_csv(ErrorsFile, mode='a', header=not os.path.exists(ErrorsFile), index=False)
        #When a ValueError occurs, the loop is broken
        except ValueError:
            print("ValueError encountered")
            break

        #Creating a list to store future predictions
        FuturePredictions = []

        #Creating the first future batch from the last TimeStep test predictions
        FirstFutureBatch = np.array(TestPredictions[-TimeStep:])
        CurrentBatch = FirstFutureBatch.reshape((1, TimeStep, 1))

        #Forecasting the next 12 months
        for j in range(12):
            CurrentPrediction = Model.predict(CurrentBatch, verbose=0)[0]
            FuturePredictions.append(CurrentPrediction)
            CurrentBatch = np.append(CurrentBatch[:, 1:, :], [[CurrentPrediction]], axis=1)

        #Unscaling the future forecast
        FutureForecast = Scaler.inverse_transform(FuturePredictions)

        #Calculating the starting amount, ending amount and net change and appending them into the forecasts CSV
        #NOTE(review): FutureForecast[k] is a length-1 array, so these cells appear to be written as "[value]";
        #left unchanged so new rows match rows already appended to the file - confirm before changing
        NetChange = FutureForecast[-1] - FutureForecast[0]
        StartingAmount = FutureForecast[0]
        EndingAmount = FutureForecast[-1]

        ForecastData = pd.DataFrame({"AccountID" : [i],
                                     "Net Change over Forecast Period" : [NetChange],
                                     "Starting Amount" : [StartingAmount],
                                     "Ending Amount" : [EndingAmount]})

        ForecastData.to_csv(ForecastsFile, mode='a', header=not os.path.exists(ForecastsFile), index=False)

        print("Processed account number",i)
    #Where an error occurs, the user is informed about it and the batch stops
    except Exception as e:
        print(f"Error encountered processing account number {i}.")
        ErrorType, ErrorObject, ErrorTraceback = sys.exc_info()

        ErrorFilename = os.path.split(
            ErrorTraceback.tb_frame.f_code.co_filename
        )[1]

        ErrorMessage = str(e)

        ErrorLineNumber = ErrorTraceback.tb_lineno

        print(f'Exception Type: {ErrorType}')

        print(f'Exception Filename: {ErrorFilename}')

        print(f'Exception Line Number: {ErrorLineNumber}')

        print(f'Exception Message: {ErrorMessage}')
        break
    #Deleting the model data from memory to conserve RAM (also runs when the loop is broken)
    finally:
        del Model, Forecast, TrainData, TestData, ForecastingTimeSeries
        gc.collect()

8d9a12fc-4c88-4de3-aa82-19243f2b4049
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 481ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━

**Calculating Error Statistics**

In [None]:
#Loading the per-account error rows written by the LSTM batches
Errors = pd.read_csv("LSTM Longer Subset Errors (Cumulative).csv")

#Computing the mean and median of each error measure
RMSEColumn = Errors["RMSE"]
SIColumn = Errors["SI"]
MeanRMSE, MedianRMSE = RMSEColumn.mean(), RMSEColumn.median()
MeanSI, MedianSI = SIColumn.mean(), SIColumn.median()

#Reporting the averaged statistics, one labelled line per statistic
for Label, Value in (("Mean RMSE:", MeanRMSE),
                     ("Median RMSE:", MedianRMSE),
                     ("Mean SI:", MeanSI),
                     ("Median SI:", MedianSI)):
    print(Label, Value)

Mean RMSE: 209508.46597121548
Median RMSE: 16872.771599645806
Mean SI: 0.4923533028960231
Median SI: 0.41191503962190323
