#Import Library

In [1]:
import pandas as pd
import gdown
import zipfile
import os
from sklearn.model_selection import train_test_split
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import matplotlib.pyplot as plt
from tensorflow.keras.layers import LSTM, Dense

#Load Data

In [2]:
# 1. Unduh file ZIP
file_id = '1yC1VwixLUz8ZtYedFTFbP-Mh7iWLFUmz'
url = f"https://drive.google.com/uc?id={file_id}"
output_zip = 'data.zip'
print(f"Mengunduh dataset dari {url}...")
gdown.download(url, output_zip, quiet=False)

# 2. Ekstrak file ZIP
extract_dir = 'unzipped_data'
with zipfile.ZipFile(output_zip, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# 3. Baca dan proses semua file CSV
csv_files = [f for f in os.listdir(extract_dir) if f.endswith('.csv')]
dataframes = []

for file in csv_files:
    file_path = os.path.join(extract_dir, file)
    main_df = pd.read_csv(file_path)

    # Tentukan user_ID berdasarkan nama file
    if "dataset_6" in file:
        user_id = "6"
    else:
        dataset_number = file.split('_')[1].split('.')[0]  # Ambil angka dari 'dataset_X.csv'
        user_id = f"{dataset_number}"

    main_df["user_id"] = user_id
    dataframes.append(main_df)

# 4. Gabungkan semua DataFrame
df = pd.concat(dataframes, ignore_index=True)

# 5. Cek hasil
print(f"Total baris: {len(main_df)}")
print(df["user_id"].value_counts())
df.head()

Mengunduh dataset dari https://drive.google.com/uc?id=1yC1VwixLUz8ZtYedFTFbP-Mh7iWLFUmz...


Downloading...
From: https://drive.google.com/uc?id=1yC1VwixLUz8ZtYedFTFbP-Mh7iWLFUmz
To: /content/data.zip
100%|██████████| 28.2k/28.2k [00:00<00:00, 56.3MB/s]

Total baris: 1000
user_id
7    1000
6     506
2     499
3     397
5     174
1     154
4      59
Name: count, dtype: int64





Unnamed: 0,id,tanggal,pemasukan,pengeluaran,saldo,kategori,created_at,updated_at,user_id
0,1,2023-01-01,400000.0,0.0,400000.0,Uang Saku,2025-05-17 23:55:35,2025-05-17 23:55:35,2
1,2,2023-01-01,300000.0,0.0,700000.0,Admin,2025-05-17 23:55:35,2025-05-17 23:55:35,2
2,3,2023-01-01,200000.0,0.0,900000.0,Aslab,2025-05-17 23:55:35,2025-05-17 23:55:35,2
3,4,2023-01-02,0.0,28000.0,872000.0,Transport,2025-05-17 23:55:35,2025-05-17 23:55:35,2
4,5,2023-01-03,0.0,26000.0,846000.0,Jajan,2025-05-17 23:55:35,2025-05-17 23:55:35,2


# Data Preprocessing

### **1. Drop kolom id, kategori, created_at, updated_at dan menjadikan user_id kolom paling awal sebelah kiri**

In [3]:
# Dropping unnecessary columns
df = df.drop(['id', 'kategori', 'created_at', 'updated_at'], axis=1)

# Reordering columns to put 'user_ID' at the beginning
cols = df.columns.tolist()
cols.remove('user_id')
new_cols = ['user_id'] + cols
df = df[new_cols]

print("\nDataFrame after dropping columns and reordering 'user_ID':")
df.head()


DataFrame after dropping columns and reordering 'user_ID':


Unnamed: 0,user_id,tanggal,pemasukan,pengeluaran,saldo
0,2,2023-01-01,400000.0,0.0,400000.0
1,2,2023-01-01,300000.0,0.0,700000.0
2,2,2023-01-01,200000.0,0.0,900000.0
3,2,2023-01-02,0.0,28000.0,872000.0
4,2,2023-01-03,0.0,26000.0,846000.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2789 entries, 0 to 2788
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   user_id      2789 non-null   object 
 1   tanggal      2789 non-null   object 
 2   pemasukan    2789 non-null   float64
 3   pengeluaran  2789 non-null   float64
 4   saldo        2789 non-null   float64
dtypes: float64(3), object(2)
memory usage: 109.1+ KB


In [5]:
df['tanggal'] = pd.to_datetime(df['tanggal'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2789 entries, 0 to 2788
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   user_id      2789 non-null   object        
 1   tanggal      2789 non-null   datetime64[ns]
 2   pemasukan    2789 non-null   float64       
 3   pengeluaran  2789 non-null   float64       
 4   saldo        2789 non-null   float64       
dtypes: datetime64[ns](1), float64(3), object(1)
memory usage: 109.1+ KB


In [6]:
df['user_id'] = df['user_id'].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2789 entries, 0 to 2788
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   user_id      2789 non-null   int64         
 1   tanggal      2789 non-null   datetime64[ns]
 2   pemasukan    2789 non-null   float64       
 3   pengeluaran  2789 non-null   float64       
 4   saldo        2789 non-null   float64       
dtypes: datetime64[ns](1), float64(3), int64(1)
memory usage: 109.1 KB


### **2. Data Cleaning**

In [7]:
df['tanggal'] = pd.to_datetime(df['tanggal'])
df = df.drop_duplicates().dropna()
df = df.sort_values(by=['user_id', 'tanggal'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2789 entries, 1576 to 2788
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   user_id      2789 non-null   int64         
 1   tanggal      2789 non-null   datetime64[ns]
 2   pemasukan    2789 non-null   float64       
 3   pengeluaran  2789 non-null   float64       
 4   saldo        2789 non-null   float64       
dtypes: datetime64[ns](1), float64(3), int64(1)
memory usage: 130.7 KB


### **3. Feature Engineering**

In [8]:
# Hitung net income
df['net_income'] = df['pemasukan'] - df['pengeluaran']

# Tambah fitur waktu
df['day_of_week'] = df['tanggal'].dt.dayofweek  # 0=Senin
df['month'] = df['tanggal'].dt.month

# Rata2 rolling (rata-rata net income berdasarkan waktu)
df['rolling_net_7'] = df.groupby('user_id')['net_income'].transform(lambda x: x.rolling(7, min_periods=1).mean())
df['rolling_net_30'] = df.groupby('user_id')['net_income'].transform(lambda x: x.rolling(30, min_periods=1).mean())

In [9]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 2789 entries, 1576 to 2788
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   user_id         2789 non-null   int64         
 1   tanggal         2789 non-null   datetime64[ns]
 2   pemasukan       2789 non-null   float64       
 3   pengeluaran     2789 non-null   float64       
 4   saldo           2789 non-null   float64       
 5   net_income      2789 non-null   float64       
 6   day_of_week     2789 non-null   int32         
 7   month           2789 non-null   int32         
 8   rolling_net_7   2789 non-null   float64       
 9   rolling_net_30  2789 non-null   float64       
dtypes: datetime64[ns](1), float64(6), int32(2), int64(1)
memory usage: 217.9 KB


Unnamed: 0,user_id,tanggal,pemasukan,pengeluaran,saldo,net_income,day_of_week,month,rolling_net_7,rolling_net_30
1576,1,2023-01-01,0.0,900000.0,2100000.0,-900000.0,6,1,-900000.0,-900000.0
1577,1,2023-01-05,0.0,542000.0,1558000.0,-542000.0,3,1,-721000.0,-721000.0
1578,1,2023-01-21,0.0,34000.0,1524000.0,-34000.0,5,1,-492000.0,-492000.0
1579,1,2023-01-23,0.0,50000.0,1474000.0,-50000.0,0,1,-381500.0,-381500.0
1580,1,2023-01-25,3000000.0,0.0,4474000.0,3000000.0,2,1,294800.0,294800.0


### **4. Menentukan fitur dan target**

In [10]:
# Tentukan fitur dan target
features = ['pemasukan', 'pengeluaran', 'day_of_week', 'month', 'rolling_net_7', 'rolling_net_30']
target_col = 'saldo'

#### **5. Mengambil salah satu user dan data setiap kolomnya**

In [11]:
# Pilih satu user (misalnya user pertama)
user_id = df['user_id'].unique()[0]
user_df = df[df['user_id'] == user_id].copy()
user_df = user_df.sort_values('tanggal')

# Drop NA dan pastikan tidak ada duplikat
user_df = user_df.dropna()

# Buat input dan target
X_all = user_df[features].values
y_all = user_df[target_col].values

### **4. Buat Time Series Window (untuk LSTM)**

Misalnya membuat model melihat 7 hari terakhir untuk memprediksi saldo keesokan harinya.

In [12]:

# Buat sequence dengan lookback window 14 hari, prediksi 7 hari ke depan
def create_sequences(X, y, input_window=14, output_window=7):
    Xs, ys = [], []
    for i in range(len(X) - input_window - output_window + 1):
        Xs.append(X[i:(i + input_window)])
        ys.append(y[(i + input_window):(i + input_window + output_window)])
    return np.array(Xs), np.array(ys)

jumlah_window adalah berapa banyak contoh sekuens (jendela waktu) yang bisa dibuat dari seluruh data. Jika kamu punya total n hari data, dan kamu pakai window_size = w, maka: jumlah_window = n - w. Banyaknya potongan data (sequence) yang bisa dibuat dari keseluruhan data

window_size = Jumlah hari yang digunakan sebagai input (urutan waktu untuk prediksi)

# Split Data dan Normalisasi

In [14]:
from sklearn.preprocessing import MinMaxScaler

# Split manual: 80% train, 20% test
split_idx = int(len(X_all) * 0.8)
X_raw_train, X_raw_test = X_all[:split_idx], X_all[split_idx:]
y_raw_train, y_raw_test = y_all[:split_idx], y_all[split_idx:]

# Normalisasi hanya pada data train
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_train_scaled = scaler_X.fit_transform(X_raw_train)
X_test_scaled = scaler_X.transform(X_raw_test)

y_train_scaled = scaler_y.fit_transform(y_raw_train.reshape(-1, 1))
y_test_scaled = scaler_y.transform(y_raw_test.reshape(-1, 1))

# Gabung kembali
X_scaled_all = np.concatenate((X_train_scaled, X_test_scaled), axis=0)
y_scaled_all = np.concatenate((y_train_scaled, y_test_scaled), axis=0).flatten()

# Buat sequences dari data scaled
X_seq, y_seq = create_sequences(X_scaled_all, y_scaled_all)

# Bagi ulang X_seq dan y_seq ke train/test berdasarkan split index
split_seq_idx = int(len(X_seq) * 0.8)
X_train, X_test = X_seq[:split_seq_idx], X_seq[split_seq_idx:]
y_train, y_test = y_seq[:split_seq_idx], y_seq[split_seq_idx:]

# Bentuk y agar 7 output per sample (reshape ke [samples, 7])
y_train = y_train.reshape(-1, 7)
y_test = y_test.reshape(-1, 7)

#Modelling

In [15]:
# Build model LSTM
model = Sequential([
    LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=False),
    Dense(7)  # Output 7 hari ke depan
])

model.compile(optimizer='adam', loss='mse')
model.summary()

# Train
model.fit(X_train, y_train, epochs=30, batch_size=16, validation_split=0.1, verbose=1)

# Predict
y_pred_scaled = model.predict(X_test)

# Inverse transform prediksi dan label
y_pred = scaler_y.inverse_transform(y_pred_scaled)
y_true = scaler_y.inverse_transform(y_test)

Epoch 1/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 320ms/step - loss: 0.4066 - val_loss: 0.4997
Epoch 2/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 0.1884 - val_loss: 0.2152
Epoch 3/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 0.0505 - val_loss: 0.0528
Epoch 4/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.0544 - val_loss: 0.0780
Epoch 5/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.0376 - val_loss: 0.1232
Epoch 6/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 0.0352 - val_loss: 0.1339
Epoch 7/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 0.0350 - val_loss: 0.1155
Epoch 8/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.0343 - val_loss: 0.1081
Epoch 9/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [17]:
# Evaluate
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    # Avoid division by zero
    return np.mean(np.abs((y_true - y_pred) / (y_true + 1e-8))) * 100

mae = mean_absolute_error(y_true.flatten(), y_pred.flatten())
mse = np.mean((y_true.flatten() - y_pred.flatten())**2)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_true.flatten(), y_pred.flatten())

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Absolute Percentage Error (MAPE): {mape}')

Mean Absolute Error (MAE): 5835785.134920635
Root Mean Squared Error (RMSE): 5938889.186731627
Mean Absolute Percentage Error (MAPE): 41.690402956629036


### **1. Fungsi Prediksi**

In [18]:
def predict_future_saldo(user_id, df, model, scaler_X, scaler_y, features, input_window=14, future_days=7):

    # Ambil data user
    user_data = df[df['user_id'] == user_id].sort_values('tanggal').copy()

    if len(user_data) < input_window:
        print(f"Tidak cukup data ({len(user_data)} hari) untuk user {user_id}. Butuh minimal {input_window} hari.")
        return None, None

    # Ambil data terakhir yang cukup untuk window input
    last_window_data = user_data.tail(input_window)

    # Ambil fitur dari data terakhir
    last_features = last_window_data[features].values

    # Normalisasi fitur
    last_features_scaled = scaler_X.transform(last_features)

    # Reshape untuk input model [samples, timesteps, features]
    X_input = last_features_scaled.reshape(1, input_window, len(features))

    # Prediksi saldo 7 hari ke depan (scaled)
    predicted_scaled = model.predict(X_input)

    # Inverse transform prediksi
    predicted_saldo = scaler_y.inverse_transform(predicted_scaled)

    # Flatten the predicted_saldo to a 1D array
    predicted_saldo = predicted_saldo.flatten()

    # Dapatkan tanggal terakhir dari data user
    last_date = user_data['tanggal'].max()

    # Buat daftar tanggal untuk 7 hari ke depan
    future_dates = [last_date + pd.Timedelta(days=i) for i in range(1, future_days + 1)]

    return future_dates, predicted_saldo

### **2. Contoh penggunaan**

In [19]:
# Contoh penggunaan fungsi untuk user_id = 6
user_to_predict = 6
future_dates, predicted_saldo = predict_future_saldo(
    user_to_predict,
    df,
    model,
    scaler_X,
    scaler_y,
    features,
    input_window=X_train.shape[1], # Use the actual input window size from training
    future_days=7 # Predict 7 days
)

if future_dates is not None and predicted_saldo is not None:
    print(f"\nPrediksi saldo untuk user {user_to_predict} dalam 7 hari ke depan:")
    for date, saldo in zip(future_dates, predicted_saldo):
        print(f"{date.strftime('%Y-%m-%d')}: Rp {saldo:,.2f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step

Prediksi saldo untuk user 6 dalam 7 hari ke depan:
2026-01-01: Rp 9,201,398.00
2026-01-02: Rp 8,844,758.00
2026-01-03: Rp 9,068,369.00
2026-01-04: Rp 9,063,854.00
2026-01-05: Rp 9,715,724.00
2026-01-06: Rp 8,466,700.00
2026-01-07: Rp 8,766,879.00


## Deploy ke tensorflow.js

In [20]:
model.save('saldo_prediction_model.h5')



In [21]:
!pip install tensorflowjs

import tensorflowjs as tfjs

# Konversi model Keras ke format TensorFlow.js
tfjs.converters.save_keras_model(model, 'tfjs_model')

print("Model telah berhasil dikonversi dan disimpan dalam folder 'tfjs_model'.")
print("File-file model yang dihasilkan:")
!ls tfjs_model

Collecting tensorflowjs
  Downloading tensorflowjs-4.22.0-py3-none-any.whl.metadata (3.2 kB)
Collecting packaging~=23.1 (from tensorflowjs)
  Downloading packaging-23.2-py3-none-any.whl.metadata (3.2 kB)
Downloading tensorflowjs-4.22.0-py3-none-any.whl (89 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.1/89.1 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading packaging-23.2-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: packaging, tensorflowjs
  Attempting uninstall: packaging
    Found existing installation: packaging 24.2
    Uninstalling packaging-24.2:
      Successfully uninstalled packaging-24.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-cloud-bigquery 3.34.0 requires packa



failed to lookup keras version from the file,
    this is likely a weight only file
Model telah berhasil dikonversi dan disimpan dalam folder 'tfjs_model'.
File-file model yang dihasilkan:
group1-shard1of1.bin  model.json


: