

# <div style="padding: 30px; color:white; margin:10; font-size:75%; text-align:center; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'></span></b> <b>Table of Content</b></div>

* [1. Importing Libraries](#1)
* [2. Adjusting Row & Column Settings](#2)
* [3. Loading The Data Set](#3)
* [4. Data Preprocessing](#4)
* [5. Feature Engineering](#5)
* [6. Custom Cost Function](#6)
* [7. Time-Based Validation Sets](#7)
* [8. LSTM Model](#8)
* [9. Train an LSTM Model using the entire dataset](#9)
* [10. Prepare the submission](#10)

<a id='1'></a>
# <div style="padding: 30px; color:white; margin:10; font-size:75%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>1 |</span></b> <b>Importing Libraries</b></div>

In [1]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from keras import optimizers
from keras.models import Sequential, Model
from keras.layers import Conv1D, MaxPooling1D
from keras.layers import Dense, LSTM, RepeatVector, TimeDistributed, Flatten
from keras.layers import Dense, LSTM, Dropout, GRU, Bidirectional
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import seaborn as sns
from IPython.display import HTML as html_print
import warnings

from termcolor import colored
from IPython.display import display
warnings.filterwarnings('ignore')

<a id='2'></a>
# <div style="padding: 30px; color:white; margin:10; font-size:75%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>2 |</span></b> <b>Adjusting Row & Column Settings</b></div>

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 500)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
warnings.filterwarnings('ignore')

<a id='3'></a>
# <div style="padding: 30px; color:white; margin:10; font-size:75%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>3 |</span></b> <b>Loading The Data Set</b></div>

In [3]:
df = pd.read_csv('/kaggle/input/effyis-datalab-challenge/train_test_data.csv')
df['Date'] = pd.to_datetime(df['Date'])

In [4]:
df.head()

Unnamed: 0,Date,Id_compte,amount_transaction,category_transaction
0,2022-01-01 00:23:00,SEWC_CL.1,462.96,Food & Drink - Groceries
1,2022-01-01 00:26:00,SEW0C_LI.1,15.51,General Services - Home Repair + Maintenance
2,2022-01-01 01:01:00,US.1,2.24,Bank Transfers - ATM withdrawals
3,2022-01-01 01:21:00,TA.1,2.86,Transportation - Public Transport
4,2022-01-01 01:54:00,US.1,1.68,Bank Transfers - ATM withdrawals


<a id='4'></a>
# <div style="padding: 30px; color:white; margin:10; font-size:75%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>4 |</span></b> <b>Data Preprocessing</b></div>

We can see that the dataset doesn't have any missing values.

In [5]:
df.isna().sum()

Date                    0
Id_compte               0
amount_transaction      0
category_transaction    0
dtype: int64

In [6]:
le = LabelEncoder()
df['Id_compte_encoded'] = le.fit_transform(df['Id_compte'])

In [7]:
# Remove unnecessary columns from the DataFrame

df.drop(['category_transaction','Id_compte'],axis=1,inplace=True)

In [8]:
df.set_index('Date', inplace=True)

In [9]:
# Initialize a new DataFrame to store the processed data
df_new = pd.DataFrame()

# Iterate over unique account IDs to process each separately
for i in range(12):
    # Resample the dataset to daily frequency, aggregating data for each day
    daily_aggregated_data = df[df['Id_compte_encoded'] == i].drop('Id_compte_encoded', axis=1).resample('D').sum()
    
    # Re-attach the account ID to the resampled data
    daily_aggregated_data['Id_compte_encoded'] = np.array([i for _ in range(daily_aggregated_data.shape[0])])
    
    # Append the processed data for this account to the main DataFrame
    df_new = pd.concat([df_new, daily_aggregated_data])


<a id='6'></a>
# <div style="padding: 30px; color:white; margin:10; font-size:75%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>6 |</span></b> <b>Feature Engineering</b></div>

In [13]:

def create_features(df):
    """
    Create time series features based on time series index and mark holidays, including Moroccan holidays.
    """
    # Create a copy of the DataFrame to avoid modifying the original data
    df = df.copy()
    
    # Extract day of the week (0 = Monday, 6 = Sunday)
    df['dayofweek'] = df.index.dayofweek
    
    # Extract the quarter of the year (1 to 4)
    df['quarter'] = df.index.quarter
    
    # Extract the month of the year (1 to 12)
    df['month'] = df.index.month
    
    # Extract the day of the month
    df['day'] = df.index.day
    
    # Calculate the season (1 = Winter, 2 = Spring, 3 = Summer, 4 = Fall)
    df['season'] = df['month'] % 12 // 3 + 1
    
    # Extract the day of the year (1 to 365/366)
    df['dayofyear'] = df.index.dayofyear
    
    # Extract the day of the month
    df['dayofmonth'] = df.index.day
    
    # Extract the week of the year (ISO week date system)
    df['weekofyear'] = df.index.isocalendar().week
    
    # Additional features
    
    # Check if the day is a weekend (1 for Saturday or Sunday, 0 otherwise)
    df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)

    # Check if the day is the start of the month (1 if it is, 0 otherwise)
    df['is_month_start'] = (df['dayofmonth'] == 1).astype(int)
    
    # Check if the day is the end of the month (1 if it is, 0 otherwise)
    df['is_month_end'] = (df['dayofmonth'] == df.index.days_in_month).astype(int)
    
    # Check if the day is the start of a quarter (1 if it is, 0 otherwise)
    df['is_quarter_start'] = (df['dayofmonth'] == 1) & (df['month'] % 3 == 1).astype(int)
    df['days_since_start_of_year'] = (df.index - pd.to_datetime(df.index.year, format='%Y')).days
    
    # Check if the day is a working day (1 for Monday to Friday, 0 otherwise)
    df['is_working_day'] = df['dayofweek'].isin([0, 1, 2, 3, 4]).astype(int)
    # Sin/Cos transforms for cyclic features like day of week, month, etc.
    
     # Mark holidays
     # Define holiday dates including the Moroccan holidays from the provided image
    holidays = {
        "New Year's Day": "2022-01-01",
        "Labor Day": "2022-05-01",
        "Manifeste de l'Independance": "2022-01-11",
        "Fete du travail": "2022-05-01",
        "Aid Al Fitr": "2022-05-03", 
        "Aid al Adha": "2022-07-10", 
        "Fete du Trone": "2022-07-30",
        "Liberation de Oued Ed-Dahab": "2022-08-14",
        "La revolution du Roi et du peuple": "2022-08-20",
        "Fete de la Jeunesse": "2022-08-21",
        "Marche verte": "2022-11-06",
        "Fete de l'Independance": "2022-11-18",
    }
    
    # Create a column for holidays and initialize with zeros
    df['holiday'] = 0

    
    # Iterate through the holidays and mark them in the DataFrame
    for holiday_date in holidays.values():
           df.loc[df.index == pd.to_datetime(holiday_date), 'holiday'] = 1
            
    
    # Return the DataFrame with added features and holiday columns
    return df.astype(float)

In [14]:
df_new = create_features(df_new)

In [15]:
df_new.head()

Unnamed: 0_level_0,amount_transaction,Id_compte_encoded,dayofweek,quarter,month,day,season,dayofyear,dayofmonth,weekofyear,is_weekend,is_month_start,is_month_end,is_quarter_start,days_since_start_of_year,is_working_day,holiday
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2022-01-01,10.34,0.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,52.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
2022-01-02,31.49,0.0,6.0,1.0,1.0,2.0,1.0,2.0,2.0,52.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2022-01-03,1430.34,0.0,0.0,1.0,1.0,3.0,1.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0
2022-01-04,32.9,0.0,1.0,1.0,1.0,4.0,1.0,4.0,4.0,1.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0
2022-01-05,0.0,0.0,2.0,1.0,1.0,5.0,1.0,5.0,5.0,1.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0


<a id='7'></a>
# <div style="padding: 30px; color:white; margin:10; font-size:75%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>7 |</span></b> <b>Custom Cost Function</b></div>

<div style="background-color: #f7f7f9; padding: 10px 15px; border-radius: 5px; border: 1px solid #e0e0e0; margin: 20px 0;">
    <h3 style="color: #333;">Evaluation Metrics: SMAPE</h3>
    <p style="color: #555;">
        In this section, we focus on the Symmetric Mean Absolute Percentage Error (SMAPE) as our evaluation metric. SMAPE is an adjusted version of the standard MAPE, designed to address some of its shortcomings and provide a symmetrical measure, treating both over-predictions and under-predictions equally.
    </p>
</div>

In [16]:
def smape(preds, target):
    n = len(preds)
    masked_arr = ~((preds == 0) & (target == 0))
    preds, target = preds[masked_arr], target[masked_arr]
    num = np.abs(preds - target)
    denom = np.abs(preds) + np.abs(target)
    smape_val = (200 * np.sum(num / denom)) / n
    return smape_val

<a id='8'></a>
# <div style="padding: 30px; color:white; margin:10; font-size:75%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>8 |</span></b> <b>Time-Based Validation Sets</b></div>

In [17]:
total_days = (df_new.index.max() - df_new.index.min()).days
split_date = df_new.index.min() + pd.Timedelta(days=total_days * 0.8)
train = df_new[df_new.index <= split_date]
test = df_new[df_new.index > split_date]

In [18]:
# Split the data into training and validation sets
X_train = train.drop('amount_transaction', axis=1)
y_train = train['amount_transaction']

X_val = test.drop('amount_transaction', axis=1)
y_val = test['amount_transaction']

In [19]:
# control
y_train.shape, X_train.shape, y_val.shape, X_val.shape

((2988,), (2988, 16), (739,), (739, 16))

In [20]:
X_train_series = X_train.values.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_series = X_val.values.reshape((X_val.shape[0], X_val.shape[1], 1))
print('Train set shape', X_train_series.shape)
print('Validation set shape', X_test_series.shape)

Train set shape (2988, 16, 1)
Validation set shape (739, 16, 1)


<a id='9'></a>
# <div style="padding: 30px; color:white; margin:10; font-size:75%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>9 |</span></b> <b>LSTM Model</b></div>

In [21]:
# The LSTM architecture
regressor = Sequential()
# First LSTM layer with Dropout regularisation
regressor.add(LSTM(units=50, return_sequences=True, input_shape=(X_train_series.shape[1], X_train_series.shape[2])))
regressor.add(Dropout(0.2))
# Second LSTM layer
regressor.add(LSTM(units=50, return_sequences=True))
regressor.add(Dropout(0.2))
# Third LSTM layer
regressor.add(LSTM(units=50, return_sequences=True))
regressor.add(Dropout(0.2))
# Fourth LSTM layer

regressor.add(LSTM(units=50))
regressor.add(Dropout(0.2))

# The output layer
regressor.add(Dense(units=1))

# Compiling the RNN
regressor.compile(optimizer='rmsprop',loss='mean_absolute_error')
# Fitting to the training set
regressor.fit(X_train_series,y_train,epochs=800,batch_size=32, validation_data=(X_test_series, y_val))

Epoch 1/800
Epoch 2/800
Epoch 3/800
Epoch 4/800
Epoch 5/800
Epoch 6/800
Epoch 7/800
Epoch 8/800
Epoch 9/800
Epoch 10/800
Epoch 11/800
Epoch 12/800
Epoch 13/800
Epoch 14/800
Epoch 15/800
Epoch 16/800
Epoch 17/800
Epoch 18/800
Epoch 19/800
Epoch 20/800
Epoch 21/800
Epoch 22/800
Epoch 23/800
Epoch 24/800
Epoch 25/800
Epoch 26/800
Epoch 27/800
Epoch 28/800
Epoch 29/800
Epoch 30/800
Epoch 31/800
Epoch 32/800
Epoch 33/800
Epoch 34/800
Epoch 35/800
Epoch 36/800
Epoch 37/800
Epoch 38/800
Epoch 39/800
Epoch 40/800
Epoch 41/800
Epoch 42/800
Epoch 43/800
Epoch 44/800
Epoch 45/800
Epoch 46/800
Epoch 47/800
Epoch 48/800
Epoch 49/800
Epoch 50/800
Epoch 51/800
Epoch 52/800
Epoch 53/800
Epoch 54/800
Epoch 55/800
Epoch 56/800
Epoch 57/800
Epoch 58/800
Epoch 59/800
Epoch 60/800
Epoch 61/800
Epoch 62/800
Epoch 63/800
Epoch 64/800
Epoch 65/800
Epoch 66/800
Epoch 67/800
Epoch 68/800
Epoch 69/800
Epoch 70/800
Epoch 71/800
Epoch 72/800
Epoch 73/800
Epoch 74/800
Epoch 75/800
Epoch 76/800
Epoch 77/800
Epoch 78

<keras.src.callbacks.History at 0x7fd9a008bd60>

In [22]:
train_predict = regressor.predict(X_train)


# Calculate MSE and MAE as you already did
mse = mean_squared_error(y_train, train_predict)
mae = mean_absolute_error(y_train, train_predict)

# Print the results
print("Mean squared error on train set: {:.4f}".format(mse))
print("Mean absolute error on train set: {:.4f}".format(mae))
print("SPAME", smape(train_predict.reshape(1,train_predict.shape[0])[0], y_train))

Mean squared error on train set: 169132.7054
Mean absolute error on train set: 246.3554
SPAME 96.70076322981886


In [23]:
train_predict = regressor.predict(X_train)
test_predict = regressor.predict(X_val)


# Calculate MSE and MAE as you already did
mse = mean_squared_error(y_val, test_predict)
mae = mean_absolute_error(y_val, test_predict)

# Print the results
print("Mean squared error on test set: {:.4f}".format(mse))
print("Mean absolute error on test set: {:.4f}".format(mae))
print("SPAME", smape(test_predict.reshape(1,test_predict.shape[0])[0], y_val))

Mean squared error on test set: 179191.7291
Mean absolute error on test set: 260.0517
SPAME 100.31686674546518


<a id='10'></a>
# <div style="padding: 30px; color:white; margin:10; font-size:75%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>10 |</span></b> <b>Train an LSTM Model using the entire dataset</b></div>

In [24]:
# Prepare the data for training
X = df_new.drop('amount_transaction', axis=1)
y = df_new['amount_transaction']

In [25]:
X_series = X.values.reshape((X.shape[0], X.shape[1], 1))

In [26]:
regressor.fit(X_series,y,epochs=800,batch_size=32)

Epoch 1/800
Epoch 2/800
Epoch 3/800
Epoch 4/800
Epoch 5/800
Epoch 6/800
Epoch 7/800
Epoch 8/800
Epoch 9/800
Epoch 10/800
Epoch 11/800
Epoch 12/800
Epoch 13/800
Epoch 14/800
Epoch 15/800
Epoch 16/800
Epoch 17/800
Epoch 18/800
Epoch 19/800
Epoch 20/800
Epoch 21/800
Epoch 22/800
Epoch 23/800
Epoch 24/800
Epoch 25/800
Epoch 26/800
Epoch 27/800
Epoch 28/800
Epoch 29/800
Epoch 30/800
Epoch 31/800
Epoch 32/800
Epoch 33/800
Epoch 34/800
Epoch 35/800
Epoch 36/800
Epoch 37/800
Epoch 38/800
Epoch 39/800
Epoch 40/800
Epoch 41/800
Epoch 42/800
Epoch 43/800
Epoch 44/800
Epoch 45/800
Epoch 46/800
Epoch 47/800
Epoch 48/800
Epoch 49/800
Epoch 50/800
Epoch 51/800
Epoch 52/800
Epoch 53/800
Epoch 54/800
Epoch 55/800
Epoch 56/800
Epoch 57/800
Epoch 58/800
Epoch 59/800
Epoch 60/800
Epoch 61/800
Epoch 62/800
Epoch 63/800
Epoch 64/800
Epoch 65/800
Epoch 66/800
Epoch 67/800
Epoch 68/800
Epoch 69/800
Epoch 70/800
Epoch 71/800
Epoch 72/800
Epoch 73/800
Epoch 74/800
Epoch 75/800
Epoch 76/800
Epoch 77/800
Epoch 78

<keras.src.callbacks.History at 0x7fd982924130>

<a id='11'></a>
# <div style="padding: 30px; color:white; margin:10; font-size:75%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>11 |</span></b> <b>Prepare the submission</b></div>

In [27]:
# Prepare the prediction dataset
prediction_dates = pd.date_range(start='2022-11-08', end='2022-12-31')
prediction_data1 = pd.DataFrame({
    'Date': np.repeat(prediction_dates, len(le.classes_)),
    'Id_compte_encoded': np.tile( le.transform(le.classes_), len(prediction_dates))
})

In [28]:
prediction_data1.set_index('Date', inplace=True)
prediction_data1 = create_features(prediction_data1)

In [29]:
X_sub= prediction_data1.values.reshape((prediction_data1.shape[0], prediction_data1.shape[1], 1))

In [30]:
pred_data = pd.DataFrame({})

In [31]:
pred_data['Date'] = prediction_data1.index.date

In [32]:
pred_data['Id_compte'] = le.inverse_transform(prediction_data1['Id_compte_encoded'].to_numpy().astype(int))

In [33]:
pred_data['Predicted_Amount'] = regressor.predict(prediction_data1).reshape(1,prediction_data1.shape[0])[0]



In [34]:
pred_data['Date']=pd.to_datetime(pred_data['Date'])

In [35]:
# Creating an ID column for output
pred_data['ID'] = pred_data['Date'].dt.strftime('%Y-%m-%d') + '_' + pred_data['Id_compte']

# Preparing the final output
final_output = pred_data[['ID', 'Predicted_Amount']]

# Save the final output data to a CSV file
final_output.to_csv('LSTM_predictions.csv', index=False)

In [37]:
from IPython.display import FileLink

# Provide the path to your file
FileLink(r'/kaggle/working/LSTM_predictions.csv')


In [39]:
from IPython.display import FileLink

# Create a download link for the CSV file
FileLink('/kaggle/working/LSTM_predictions.csv')


In [40]:
model.save('lstm_model.h5')  # Save the trained model as 'lstm_model.h5'

NameError: name 'model' is not defined

In [41]:
# Assuming your model is trained like this:
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
model.add(LSTM(units=50, return_sequences=False))
model.add(Dense(units=1))

model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Save the trained model to a file
model.save('lstm_model.h5')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [43]:
import tensorflow as tf


In [44]:
model = tf.keras.models.load_model('lstm_model.h5')

In [45]:
# When saving the model
model.save('lstm_model.keras')