#Import Library

In [1]:
import pandas as pd
import gdown
import zipfile
import os
from sklearn.model_selection import train_test_split
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt

#Load Data

In [2]:
# 1. Unduh file ZIP
file_id = '1yC1VwixLUz8ZtYedFTFbP-Mh7iWLFUmz'
url = f"https://drive.google.com/uc?id={file_id}"
output_zip = 'data.zip'
print(f"Mengunduh dataset dari {url}...")
gdown.download(url, output_zip, quiet=False)

# 2. Ekstrak file ZIP
extract_dir = 'unzipped_data'
with zipfile.ZipFile(output_zip, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# 3. Baca dan proses semua file CSV
csv_files = [f for f in os.listdir(extract_dir) if f.endswith('.csv')]
dataframes = []

for file in csv_files:
    file_path = os.path.join(extract_dir, file)
    main_df = pd.read_csv(file_path)

    # Tentukan user_ID berdasarkan nama file
    if "dataset_6" in file:
        user_id = "6"
    else:
        dataset_number = file.split('_')[1].split('.')[0]  # Ambil angka dari 'dataset_X.csv'
        user_id = f"{dataset_number}"

    main_df["user_id"] = user_id
    dataframes.append(main_df)

# 4. Gabungkan semua DataFrame
df = pd.concat(dataframes, ignore_index=True)

# 5. Cek hasil
print(f"Total baris: {len(main_df)}")
print(df["user_id"].value_counts())
df.head()

Mengunduh dataset dari https://drive.google.com/uc?id=1yC1VwixLUz8ZtYedFTFbP-Mh7iWLFUmz...


Downloading...
From: https://drive.google.com/uc?id=1yC1VwixLUz8ZtYedFTFbP-Mh7iWLFUmz
To: /content/data.zip
100%|██████████| 28.2k/28.2k [00:00<00:00, 34.3MB/s]

Total baris: 1000
user_id
7    1000
6     506
2     499
3     397
5     174
1     154
4      59
Name: count, dtype: int64





Unnamed: 0,id,tanggal,pemasukan,pengeluaran,saldo,kategori,created_at,updated_at,user_id
0,1,2023-01-01,400000.0,0.0,400000.0,Uang Saku,2025-05-17 23:55:35,2025-05-17 23:55:35,2
1,2,2023-01-01,300000.0,0.0,700000.0,Admin,2025-05-17 23:55:35,2025-05-17 23:55:35,2
2,3,2023-01-01,200000.0,0.0,900000.0,Aslab,2025-05-17 23:55:35,2025-05-17 23:55:35,2
3,4,2023-01-02,0.0,28000.0,872000.0,Transport,2025-05-17 23:55:35,2025-05-17 23:55:35,2
4,5,2023-01-03,0.0,26000.0,846000.0,Jajan,2025-05-17 23:55:35,2025-05-17 23:55:35,2


# Data Preprocessing

### **1. Drop kolom id, kategori, created_at, updated_at dan menjadikan user_id kolom paling awal sebelah kiri**

In [3]:
# Dropping unnecessary columns
df = df.drop(['id', 'kategori', 'created_at', 'updated_at'], axis=1)

# Reordering columns to put 'user_ID' at the beginning
cols = df.columns.tolist()
cols.remove('user_id')
new_cols = ['user_id'] + cols
df = df[new_cols]

print("\nDataFrame after dropping columns and reordering 'user_ID':")
df.head()


DataFrame after dropping columns and reordering 'user_ID':


Unnamed: 0,user_id,tanggal,pemasukan,pengeluaran,saldo
0,2,2023-01-01,400000.0,0.0,400000.0
1,2,2023-01-01,300000.0,0.0,700000.0
2,2,2023-01-01,200000.0,0.0,900000.0
3,2,2023-01-02,0.0,28000.0,872000.0
4,2,2023-01-03,0.0,26000.0,846000.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2789 entries, 0 to 2788
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   user_id      2789 non-null   object 
 1   tanggal      2789 non-null   object 
 2   pemasukan    2789 non-null   float64
 3   pengeluaran  2789 non-null   float64
 4   saldo        2789 non-null   float64
dtypes: float64(3), object(2)
memory usage: 109.1+ KB


In [5]:
df['tanggal'] = pd.to_datetime(df['tanggal'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2789 entries, 0 to 2788
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   user_id      2789 non-null   object        
 1   tanggal      2789 non-null   datetime64[ns]
 2   pemasukan    2789 non-null   float64       
 3   pengeluaran  2789 non-null   float64       
 4   saldo        2789 non-null   float64       
dtypes: datetime64[ns](1), float64(3), object(1)
memory usage: 109.1+ KB


In [6]:
''' targets = {
    "1": 500000000,
    "2": 400000000,
    "3": 10000000,
    "4": 20000000,
    "5": 20000000,
    "6": 300000000,
    "7": 25000000
}
df['target'] = df['user_id'].map(targets)
df.head() '''

' targets = {\n    "1": 500000000,\n    "2": 400000000,\n    "3": 10000000,\n    "4": 20000000,\n    "5": 20000000,\n    "6": 300000000,\n    "7": 25000000\n}\ndf[\'target\'] = df[\'user_id\'].map(targets)\ndf.head() '

In [7]:
df['user_id'] = df['user_id'].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2789 entries, 0 to 2788
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   user_id      2789 non-null   int64         
 1   tanggal      2789 non-null   datetime64[ns]
 2   pemasukan    2789 non-null   float64       
 3   pengeluaran  2789 non-null   float64       
 4   saldo        2789 non-null   float64       
dtypes: datetime64[ns](1), float64(3), int64(1)
memory usage: 109.1 KB


### **2. Data Cleaning**

In [8]:
df['tanggal'] = pd.to_datetime(df['tanggal'])
df = df.drop_duplicates().dropna()
df = df.sort_values(by=['user_id', 'tanggal'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2789 entries, 1635 to 2788
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   user_id      2789 non-null   int64         
 1   tanggal      2789 non-null   datetime64[ns]
 2   pemasukan    2789 non-null   float64       
 3   pengeluaran  2789 non-null   float64       
 4   saldo        2789 non-null   float64       
dtypes: datetime64[ns](1), float64(3), int64(1)
memory usage: 130.7 KB


### **3. Feature Engineering**

In [9]:
# Hitung net income
df['net_income'] = df['pemasukan'] - df['pengeluaran']

# Tambah fitur waktu
df['day_of_week'] = df['tanggal'].dt.dayofweek  # 0=Senin
df['month'] = df['tanggal'].dt.month

# Rata2 rolling (rata-rata net income berdasarkan waktu)
df['rolling_net_7'] = df.groupby('user_id')['net_income'].transform(lambda x: x.rolling(7, min_periods=1).mean())
df['rolling_net_30'] = df.groupby('user_id')['net_income'].transform(lambda x: x.rolling(30, min_periods=1).mean())

In [10]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 2789 entries, 1635 to 2788
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   user_id         2789 non-null   int64         
 1   tanggal         2789 non-null   datetime64[ns]
 2   pemasukan       2789 non-null   float64       
 3   pengeluaran     2789 non-null   float64       
 4   saldo           2789 non-null   float64       
 5   net_income      2789 non-null   float64       
 6   day_of_week     2789 non-null   int32         
 7   month           2789 non-null   int32         
 8   rolling_net_7   2789 non-null   float64       
 9   rolling_net_30  2789 non-null   float64       
dtypes: datetime64[ns](1), float64(6), int32(2), int64(1)
memory usage: 217.9 KB


Unnamed: 0,user_id,tanggal,pemasukan,pengeluaran,saldo,net_income,day_of_week,month,rolling_net_7,rolling_net_30
1635,1,2023-01-01,0.0,900000.0,2100000.0,-900000.0,6,1,-900000.0,-900000.0
1636,1,2023-01-05,0.0,542000.0,1558000.0,-542000.0,3,1,-721000.0,-721000.0
1637,1,2023-01-21,0.0,34000.0,1524000.0,-34000.0,5,1,-492000.0,-492000.0
1638,1,2023-01-23,0.0,50000.0,1474000.0,-50000.0,0,1,-381500.0,-381500.0
1639,1,2023-01-25,3000000.0,0.0,4474000.0,3000000.0,2,1,294800.0,294800.0


### **4. Buat Time Series Window (untuk LSTM)**

Misalnya membuat model melihat 7 hari terakhir untuk memprediksi saldo keesokan harinya.

In [11]:
import numpy as np

def create_sequence_data(user_df, window_size=7):
    sequences = []
    targets = []
    for i in range(len(user_df) - window_size):
        window = user_df.iloc[i:i+window_size]
        target = user_df.iloc[i+window_size]['saldo']
        sequences.append(window[['net_income', 'saldo', 'day_of_week', 'month']].values)
        targets.append(target)
    return np.array(sequences), np.array(targets)

# Gabungkan semua user
all_X, all_y = [], []
for uid in df['user_id'].unique():
    user_df = df[df['user_id'] == uid].reset_index(drop=True)
    X, y = create_sequence_data(user_df)
    all_X.append(X)
    all_y.append(y)

X = np.vstack(all_X)
y = np.concatenate(all_y)

print(X.shape)  # (jumlah_window, window_size, jumlah_fitur)
print(y.shape)  # (jumlah_window,)

(2740, 7, 4)
(2740,)


jumlah_window adalah berapa banyak contoh sekuens (jendela waktu) yang bisa dibuat dari seluruh data. Jika kamu punya total n hari data, dan kamu pakai window_size = w, maka: jumlah_window = n - w. Banyaknya potongan data (sequence) yang bisa dibuat dari keseluruhan data

window_size = Jumlah hari yang digunakan sebagai input (urutan waktu untuk prediksi)

#Split Data

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=False  # jangan di-shuffle untuk time series
)

# Normalisasi

In [13]:
from sklearn.preprocessing import StandardScaler

# Reshape X supaya bisa di-scale (samples, window_size * n_features)
n_samples, window_size, n_features = X_train.shape
X_train_reshaped = X_train.reshape((n_samples, window_size * n_features))
X_test_reshaped = X_test.reshape((X_test.shape[0], window_size * n_features))

# Inisialisasi scaler
scaler_X = StandardScaler()
scaler_y = StandardScaler()

# Fit scaler di data training, transform di training dan testing
X_train_scaled = scaler_X.fit_transform(X_train_reshaped)
X_test_scaled = scaler_X.transform(X_test_reshaped)

y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).flatten()
y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1)).flatten()

# Kembalikan bentuk X ke 3D untuk LSTM input
X_train_scaled = X_train_scaled.reshape((n_samples, window_size, n_features))
X_test_scaled = X_test_scaled.reshape((X_test.shape[0], window_size, n_features))

#Modelling

In [14]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Buat model LSTM
model = Sequential([
    LSTM(128, return_sequences=False, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1)  # Output: saldo prediksi hari berikutnya
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])
# Callback untuk early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    X_train_scaled, y_train_scaled,
    validation_data=(X_test_scaled, y_test_scaled),
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 20ms/step - loss: 0.3952 - mae: 0.4311 - val_loss: 0.0035 - val_mae: 0.0504
Epoch 2/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - loss: 0.0244 - mae: 0.1108 - val_loss: 0.0017 - val_mae: 0.0343
Epoch 3/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - loss: 0.0213 - mae: 0.1013 - val_loss: 0.0035 - val_mae: 0.0515
Epoch 4/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 0.0213 - mae: 0.1017 - val_loss: 0.0026 - val_mae: 0.0448
Epoch 5/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 0.0209 - mae: 0.0983 - val_loss: 0.0016 - val_mae: 0.0331
Epoch 6/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 0.0202 - mae: 0.0976 - val_loss: 0.0013 - val_mae: 0.0283
Epoch 7/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step - loss: 0

### **1. Fungsi Prediksi**

In [17]:
def predict_future_balance(user_id, model, original_df, scaler_X, scaler_y, days_to_predict=30, window_size=7):
    """
    Memprediksi saldo harian user tertentu selama beberapa hari ke depan.

    Args:
        user_id (int): ID user.
        model (tf.keras.Model): Model LSTM.
        original_df (pd.DataFrame): DataFrame lengkap yang sudah diproses.
        scaler_X (StandardScaler): Scaler fitur input (fit saat training).
        scaler_y (StandardScaler): Scaler target output (fit saat training).
        days_to_predict (int): Jumlah hari yang ingin diprediksi.
        window_size (int): Jumlah hari sebelumnya yang digunakan untuk memprediksi.

    Returns:
        pd.DataFrame: Tabel tanggal dan prediksi saldo.
    """
    # Filter user yang ingin diprediksi
    user_df = original_df[original_df['user_id'] == user_id].sort_values(by='tanggal').reset_index(drop=True)

    if len(user_df) < window_size:
        print(f"Data user_id {user_id} kurang dari window_size ({window_size}). Tidak dapat memprediksi.")
        return pd.DataFrame(columns=['tanggal', 'saldo_prediksi'])

    # Ambil sequence awal (net_income, saldo, day_of_week, month)
    last_sequence = user_df.iloc[-window_size:][['net_income', 'saldo', 'day_of_week', 'month']].values

    # Simpan tanggal dan saldo awal
    current_date = user_df['tanggal'].iloc[-1]
    predicted_balances = []

    for _ in range(days_to_predict):
        # Reshape dan scaling
        input_sequence = last_sequence.reshape(1, window_size * last_sequence.shape[1])
        input_sequence_scaled = scaler_X.transform(input_sequence).reshape(1, window_size, last_sequence.shape[1])

        # Prediksi saldo hari berikutnya (masih dalam skala terstandardisasi)
        scaled_prediction = model.predict(input_sequence_scaled, verbose=0)[0][0]

        # Kembalikan ke skala asli
        predicted_saldo = scaler_y.inverse_transform([[scaled_prediction]])[0][0]
        predicted_balances.append(predicted_saldo)

        # Siapkan data untuk hari berikutnya
        next_date = current_date + pd.Timedelta(days=1)
        next_day_of_week = next_date.dayofweek
        next_month = next_date.month
        avg_net_income_last_window = last_sequence[:, 0].mean() if not np.isnan(last_sequence[:, 0].mean()) else 0

        # Buat baris fitur baru
        new_row = np.array([[avg_net_income_last_window, predicted_saldo, next_day_of_week, next_month]])

        # Update sequence untuk iterasi berikutnya
        last_sequence = np.vstack([last_sequence[1:], new_row])
        current_date = next_date

    # Buat DataFrame hasil
    future_dates = pd.date_range(start=user_df['tanggal'].iloc[-1] + pd.Timedelta(days=1), periods=days_to_predict)
    result_df = pd.DataFrame({
        'tanggal': future_dates,
        'saldo_prediksi': predicted_balances
    })

    return result_df

### **2. Contoh penggunaan**

In [18]:
user_id_to_predict = 1
days = 30

# Pastikan scaler_X, scaler_y sudah fit ke data training sebelumnya
future_predictions = predict_future_balance(
    user_id=user_id_to_predict,
    model=model,
    original_df=df,
    scaler_X=scaler_X,
    scaler_y=scaler_y,
    days_to_predict=days,
    window_size=7
)

print(future_predictions)

      tanggal  saldo_prediksi
0  2025-12-27    1.437509e+07
1  2025-12-28    1.434085e+07
2  2025-12-29    1.509885e+07
3  2025-12-30    1.461310e+07
4  2025-12-31    1.439597e+07
5  2026-01-01    1.418059e+07
6  2026-01-02    1.489027e+07
7  2026-01-03    1.414947e+07
8  2026-01-04    1.395164e+07
9  2026-01-05    1.370252e+07
10 2026-01-06    1.359174e+07
11 2026-01-07    1.350710e+07
12 2026-01-08    1.347143e+07
13 2026-01-09    1.340092e+07
14 2026-01-10    1.309436e+07
15 2026-01-11    1.286487e+07
16 2026-01-12    1.259404e+07
17 2026-01-13    1.267070e+07
18 2026-01-14    1.261959e+07
19 2026-01-15    1.251684e+07
20 2026-01-16    1.236510e+07
21 2026-01-17    1.216420e+07
22 2026-01-18    1.194903e+07
23 2026-01-19    1.170269e+07
24 2026-01-20    1.180073e+07
25 2026-01-21    1.174692e+07
26 2026-01-22    1.163974e+07
27 2026-01-23    1.149481e+07
28 2026-01-24    1.131889e+07
29 2026-01-25    1.111734e+07


## Deploy ke tensorflow.js

In [19]:
# Simpan model dalam format HDF5 (.h5)
model.save('saldo_prediction_model.h5')

print("Model berhasil disimpan dalam format HDF5 (.h5)")



Model berhasil disimpan dalam format HDF5 (.h5)


: