In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import root_mean_squared_error
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from tensorflow.keras import backend as K
from tensorflow.keras.regularizers import l2

# --------------------------
# Custom RMSE loss
def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

# --------------------------
# Load dataset
df = pd.read_csv('final_dataset.csv')

# items = ['WeTV VIP','IQIYI VIP']
# df = df[df["Produk"].isin(items)].copy()
df['Tanggal'] = pd.to_datetime(df['Tanggal'])

# --------------------------
# Select features and target
features = [
  'Transaction_Count', 'month_cos',
  'smoothed_revenue', 'rolling_mean_7', 'rolling_mean_14', 'rolling_mean_30',
  'rolling_std_7', 'rolling_std_14', 'rolling_std_30',
  'revenue_lag_1', 'revenue_lag_7', 'revenue_lag_14', 'revenue_lag_30',
'ema_7', 'ema_14', 'ema_30',
]
target = 'Total_Revenue'

# One-hot product columns
products_columns = [col for col in df.columns if col.startswith('Produk_')]

# Features to normalize
continuous_cols = [ 'Transaction_Count',
    'smoothed_revenue', 'rolling_mean_7', 'rolling_mean_14', 'rolling_mean_30',
    'rolling_std_7', 'rolling_std_14', 'rolling_std_30',
    'revenue_lag_1', 'revenue_lag_7', 'revenue_lag_14', 'revenue_lag_30',
    'ema_7', 'ema_14', 'ema_30',
]

scaler_y = MinMaxScaler()

# --------------------------
# Normalize features and target
for prod_col in products_columns:
    mask = df[prod_col] == 1
    scaler = MinMaxScaler()
    df.loc[mask, continuous_cols] = scaler.fit_transform(df.loc[mask, continuous_cols])
    
df[features] = df[features].astype(float)
# Normalize target
df[target] = scaler_y.fit_transform(df[[target]])


# --------------------------
# Sliding window function
def create_sequences(data, features, target, time_steps=30):
    X, y = [], []
    for i in range(len(data) - time_steps):
        X.append(data[features].iloc[i:i+time_steps].values)
        y.append(data[target].iloc[i+time_steps])
    return np.array(X), np.array(y)

X, y = create_sequences(df, features, target, time_steps=30)

# Apply log1p AFTER slicing for training (optional: can also apply before scaling)
y = np.log1p(y)

# plt.hist(df['Total_Revenue'], bins=50)
# plt.title("Revenue Distribution")
# plt.show()

df['Total_Revenue'] = np.log1p(df['Total_Revenue'])

df.to_csv('final_dataset_scaled.csv', index=False)

# --------------------------
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=False
)

# --------------------------
# Build LSTM model
model = Sequential()
model.add(LSTM(100, return_sequences=True, kernel_regularizer=l2(0.001), input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.3))
model.add(LSTM(50, return_sequences=False,))
model.add(Dense(64,activation="relu"))
model.add(Dense(1))


optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss="mae")
# model.compile(optimizer="Adam", loss="mae")

# --------------------------
# Train model
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

lr_scheduler = ReduceLROnPlateau(monitor='val_loss',
    factor=0.5,
    patience=5,
    verbose=1,
    min_lr=1e-5
)

history = model.fit(X_train, y_train, 
          validation_data=(X_test, y_test),
          epochs=150,
          batch_size=16,
          
        #   callbacks=[early_stop]
        # callbacks=[ lr_scheduler],
        )

# --------------------------
# Predict and inverse scale

y_pred = model.predict(X_test)

y_pred_inv = scaler_y.inverse_transform(y_pred)
y_test_inv = scaler_y.inverse_transform(y_test.reshape(-1, 1))

y_pred_inv = np.expm1(y_pred)
y_test_inv = np.expm1(y_test)

rmse = root_mean_squared_error(y_test_inv, y_pred_inv)
print(f"RMSE: {rmse:.4f}")


# --------------------------
# Plot actual vs predicted
# Predict and inverse scale

plt.figure(figsize=(12, 5))
plt.plot(y_test_inv, label='Actual Revenue')
plt.plot(y_pred_inv, label='Predicted Revenue')
plt.title('LSTM Revenue Prediction')
plt.legend()
plt.show()

# --------------------------
# Plot training history
plt.figure(figsize=(10, 4))
plt.plot(history.history['loss'], label='Training Loss (RMSE)')
plt.plot(history.history['val_loss'], label='Validation Loss (RMSE)')
plt.title('Training vs Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('training_history.png')
plt.show()



Index(['Tanggal', 'Total_Revenue', 'Transaction_Count', 'Avg_Price',
       'is_weekend', 'is_month_start', 'is_month_end', 'dayofweek_sin',
       'dayofweek_cos', 'month_sin', 'month_cos', 'Produk_Netflix Premium',
       'Produk_VIU Premium', 'Produk_Vidio Premier', 'smoothed_revenue',
       'rolling_mean_7', 'rolling_mean_14', 'rolling_mean_30', 'rolling_std_7',
       'rolling_std_14', 'rolling_std_30', 'revenue_trend_7',
       'revenue_trend_14', 'revenue_trend_30', 'revenue_lag_1',
       'revenue_lag_7', 'revenue_lag_14', 'revenue_lag_30', 'ema_7', 'ema_14',
       'ema_30'],
      dtype='object')


Unnamed: 0,Tanggal,Total_Revenue,Transaction_Count,Avg_Price,is_weekend,is_month_start,is_month_end,dayofweek_sin,dayofweek_cos,month_sin,...,revenue_trend_7,revenue_trend_14,revenue_trend_30,revenue_lag_1,revenue_lag_7,revenue_lag_14,revenue_lag_30,ema_7,ema_14,ema_30
0,2024-01-31,16958.0,1,16958.0,0,0,1,0.974928,-0.222521,0.5,...,-3747.0,-4472.206593,-1741.191546,193622.0,86832.0,156706.0,109832.0,89636.281662,95894.994648,95894.994648
1,2024-02-01,176290.0,6,29381.666667,0,1,0,0.433884,-0.900969,0.866025,...,1134.0,2210.786813,-499.401112,16958.0,69874.0,240580.0,224538.0,111299.711247,106614.328695,106614.328695
2,2024-02-02,163706.0,7,23386.571429,0,0,0,-0.433884,-0.900969,0.866025,...,9991.0,3878.698901,77.221357,176290.0,133748.0,90790.0,136748.0,124401.283435,114226.551535,114226.551535
3,2024-02-03,303496.0,12,25291.333333,1,0,0,-0.974928,-0.222521,0.866025,...,30761.571429,10077.243956,1543.720133,163706.0,119790.0,116790.0,136748.0,169174.962576,139462.477997,139462.477997
4,2024-02-04,495034.0,23,21523.217391,1,0,0,-0.781831,0.62349,0.866025,...,60356.285714,20815.002198,4079.520801,303496.0,83832.0,113374.0,123374.0,250639.721932,186872.014264,186872.014264


In [27]:
print("Sample predicted:", y_pred_inv[:10].flatten())
print("Sample actual   :", y_test_inv[:10].flatten())

Sample predicted: [0.23718807 0.18524306 0.2606841  0.20970945 0.17995718 0.19053864
 0.20143455 0.20614715 0.19411561 0.15265505]
Sample actual   : [0.20003566 0.45685623 0.16727086 0.17125182 0.2291689  0.24994043
 0.24550932 0.25075677 0.12334593 0.11420146]
