In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

### Input and Output Paths

In [2]:
# Suffix shared by the artefacts this notebook produces.
_run_tag = "CMB_cabang_64"

# NOTE: despite the `_dir` suffix, `input_file_dir` and `train_data_output_dir`
# hold file paths: they are passed directly to `pd.read_csv` and `DataFrame.to_csv`.
input_file_dir = "../dataset/nonfin_2020_2023_striped.csv"
train_data_output_dir = "../train_data/train_data_" + _run_tag
# Append the name of the file to this path.
model_output_dir = "../models/model_LSTM_" + _run_tag

### Input Dataframe

In [3]:
# Load the pipe-separated export. The first line is the header, and every
# column is read as a string; numeric/date casts happen in a later cell.
df = pd.read_csv(
    input_file_dir,
    sep="|",
    header=0,
    dtype=str,
)

In [4]:
# Print the column names, dtypes and memory footprint of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15319761 entries, 0 to 15319760
Data columns (total 12 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   dt                 object
 1   trans_type_key     object
 2   trans_lng_nm       object
 3   branch_key         object
 4   sumtrx             object
 5   branch_address1    object
 6   branch_address2    object
 7   region_key         object
 8   jabo_nonjabotabek  object
 9   city_key           object
 10  geography          object
 11  flag_big_city      object
dtypes: object(12)
memory usage: 1.4+ GB


In [5]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,dt,trans_type_key,trans_lng_nm,branch_key,sumtrx,branch_address1,branch_address2,region_key,jabo_nonjabotabek,city_key,geography,flag_big_city
0,2020-10-20,435,Pemesanan Warkat,838,3,SENTRA NIAGA HARAPAN INDAH KAV. CP2 NO.1 KOTA ...,17214,9,J,24,JAKARTA,B
1,2021-04-19,727,Cetak Mutasi Harian,259,2,JL. MUWARDI 1/44,11450,10,J,24,,B
2,2021-05-03,482,Buka Blokir Kartu,1074,2,"KOMP. PERUM TAMAN PALEM LESTARI C5 NO. 9 -11, ...",11730,12,J,24,,B
3,2021-03-08,383,Registrasi m-BCA,196,22,JL. TRUNOJOYO 145,68137,7,N,20,,K
4,2021-11-09,436,Penawaran Solusi,1336,1,"JL. RADEN SALEH NO. 39B, KEL. KARANG MULYA, KE...",15157,12,J,24,,B


### 1.2 Trim the dataframe to the rows and columns needed for the LSTM model

In [6]:
# Columns kept for the training series.
col_filter = ["dt", "trans_lng_nm", "branch_key", "sumtrx"]

# Rows kept: branch "64" and transaction name "Cetak Mutasi Bulanan".
# Both keys are compared as strings because the CSV was loaded with dtype=str.
row_filter = df["branch_key"].eq("64") & df["trans_lng_nm"].eq("Cetak Mutasi Bulanan")

df = df.loc[row_filter, col_filter]

In [7]:
# Preview the filtered frame; only the four selected columns remain.
df.head()

Unnamed: 0,dt,trans_lng_nm,branch_key,sumtrx
11949,2021-11-29,Cetak Mutasi Bulanan,64,70
12345,2021-11-15,Cetak Mutasi Bulanan,64,81
16903,2020-03-09,Cetak Mutasi Bulanan,64,76
68742,2020-03-23,Cetak Mutasi Bulanan,64,56
81483,2020-02-21,Cetak Mutasi Bulanan,64,85


#### Final Adjustments
- Adjust Datatypes

In [8]:
# Parse the date column, then cast the remaining columns in a single pass:
# the transaction name stays a string, the branch key and count become int32.
df = df.assign(dt=pd.to_datetime(df["dt"])).astype(
    {
        "trans_lng_nm": str,
        "branch_key": np.int32,
        "sumtrx": np.int32,
    }
)

In [9]:
# Display the frame after the dtype conversion.
df

Unnamed: 0,dt,trans_lng_nm,branch_key,sumtrx
11949,2021-11-29,Cetak Mutasi Bulanan,64,70
12345,2021-11-15,Cetak Mutasi Bulanan,64,81
16903,2020-03-09,Cetak Mutasi Bulanan,64,76
68742,2020-03-23,Cetak Mutasi Bulanan,64,56
81483,2020-02-21,Cetak Mutasi Bulanan,64,85
...,...,...,...,...
13351196,2022-03-21,Cetak Mutasi Bulanan,64,1
13485625,2023-04-25,Cetak Mutasi Bulanan,64,1
13602347,2022-02-08,Cetak Mutasi Bulanan,64,1
13629289,2022-05-10,Cetak Mutasi Bulanan,64,1


In [10]:
# Build the daily series: total `sumtrx` per date, ordered chronologically,
# then re-indexed to daily frequency so that dates with no rows get a 0.
train_data = (
    df.groupby("dt")[["sumtrx"]]
    .sum()
    .sort_index()
    .asfreq("D", fill_value=0)
)
train_data

Unnamed: 0_level_0,sumtrx
dt,Unnamed: 1_level_1
2020-01-02,32
2020-01-03,69
2020-01-04,0
2020-01-05,0
2020-01-06,165
...,...
2023-06-12,101
2023-06-13,36
2023-06-14,22
2023-06-15,47


In [11]:
# Persist the daily series as a pipe-separated file.
# `to_csv` does not create missing parent folders, so make sure the
# destination directory exists before writing (no-op if it already does).
Path(train_data_output_dir).parent.mkdir(parents=True, exist_ok=True)
train_data.to_csv(train_data_output_dir, sep='|')