In [None]:
import pandas as pd
# Read the raw dataset from the CSV file that sits next to the notebook.
csv_path = "POP.csv"
df = pd.read_csv(csv_path)

In [None]:
# Show the frame returned by read_csv as this cell's output (first visual check).
df

In [None]:
# Convert the 'date' column to datetime so it can be sorted and used as a time index.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

In [None]:
# Order rows chronologically and renumber them 0..n-1 in the same step.
df = df.sort_values(by='date', ignore_index=True)


In [None]:
# Promote 'date' to the index; later cells rely on df.index.year / .month and date slicing.
df = df.set_index('date')

In [None]:
# Count null entries per column and print the summary.
missing_per_column = df.isnull().sum()
print("Jumlah missing value tiap kolom:\n", missing_per_column)

In [None]:
# DataFrame.duplicated() compares column values only and ignores the index.
# 'date' has already been moved to the index by the set_index cell, so it is
# brought back as a column for the comparison; otherwise two different dates
# that merely share the same column values would be counted as duplicate rows.
# The result is re-labelled with df.index so `duplicates` stays aligned with df.
duplicates = pd.Series(df.reset_index().duplicated().to_numpy(), index=df.index)
print(f"Jumlah baris duplikat: {duplicates.sum()}")

In [None]:
import numpy as np

In [None]:
# Standardise 'value' into a z-score: (value - mean) / standard deviation.
# `threshold` is the |z| cut-off used by the outlier cell that follows.
threshold = 3
value_series = df['value']
mean_val = value_series.mean()
std_val = value_series.std()
df['z_score'] = value_series.sub(mean_val).div(std_val)

In [None]:
# Mark rows whose absolute z-score exceeds the threshold, then report how many were found.
df['outlier'] = df['z_score'].abs().gt(threshold)
outlier_count = df['outlier'].sum()
print(f"Jumlah outlier terdeteksi: {outlier_count}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Line plot of the series with the flagged outlier points overlaid in red.
fig, ax = plt.subplots(figsize=(12, 5))
outlier_rows = df[df['outlier']]
ax.plot(df.index, df['value'], label='Value')
ax.scatter(outlier_rows.index, outlier_rows['value'], color='red', label='Outlier')
ax.set_title('Outlier Detection')
ax.legend()
plt.show()

In [None]:
# Distribution of 'value' as a 30-bin histogram.
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(df['value'], bins=30, color='skyblue')
ax.set_title('Histogram Value')
ax.set_xlabel('Value')
ax.set_ylabel('Frekuensi')
plt.show()

In [None]:
# Add the calendar year of each observation, then average 'value' within each year.
df['year'] = df.index.year
yearly_mean = df['value'].groupby(df['year']).mean()

# Plot the yearly averages on an explicitly created axes.
fig, ax = plt.subplots(figsize=(12, 4))
yearly_mean.plot(ax=ax)
ax.set_title('Rata-rata Value per Tahun')
ax.set_ylabel('Mean Value')
plt.show()

In [None]:
# 12-observation rolling mean and standard deviation of 'value'.
rolling_window = df['value'].rolling(window=12)
rolling_mean = rolling_window.mean()
rolling_std = rolling_window.std()

# Overlay the original series with both rolling statistics.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(df['value'], color='blue', label='Original')
ax.plot(rolling_mean, color='red', label='Rolling Mean')
ax.plot(rolling_std, color='black', label='Rolling Std')
ax.set_title('Rolling Mean & Std')
ax.legend()
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Build the modelling target in three chained steps:
#   log_value              = log(value)
#   log_diff_seasonal      = log_value minus its value 12 rows earlier
#   log_diff_seasonal_diff = first difference of log_diff_seasonal
df = df.assign(
    log_value=lambda frame: np.log(frame['value']),
    log_diff_seasonal=lambda frame: frame['log_value'].diff(12),
    log_diff_seasonal_diff=lambda frame: frame['log_diff_seasonal'].diff(),
)

In [None]:
# Keep only rows where the differenced target is defined; copy so df itself is not modified later.
has_target = df['log_diff_seasonal_diff'].notna()
df_feat = df.loc[has_target].copy()

In [None]:
# Exogenous features: calendar month, previous target value, and the mean of
# the three target values preceding each row (the shift keeps the current row out).
previous_target = df_feat['log_diff_seasonal_diff'].shift(1)
df_feat['month'] = df_feat.index.month
df_feat['lag_1'] = previous_target
df_feat['rolling_mean_3'] = previous_target.rolling(3).mean()

In [None]:
# Discard the leading rows where the lag / rolling features are still NaN.
features_complete = df_feat[['lag_1', 'rolling_mean_3']].notna().all(axis=1)
df_feat = df_feat[features_complete]

In [None]:
# Chronological split: everything up to and including the train end label is
# training data, everything from the test start label onwards is held out.
train_end, test_start = '2018-12-01', '2019-01-01'
train = df_feat.loc[:train_end]
test = df_feat.loc[test_start:]

In [None]:
# Column the model is fitted on, and the exogenous regressor columns passed with it.
target_col = 'log_diff_seasonal_diff'
exog_cols = [
    'month',
    'lag_1',
    'rolling_mean_3',
]

In [None]:
# Pull the target series out of both partitions.
train_y, test_y = (part[target_col] for part in (train, test))

In [None]:
# Pull the exogenous regressor columns out of both partitions.
train_exog = train.loc[:, exog_cols]
test_exog = test.loc[:, exog_cols]

In [None]:
import itertools
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error

In [None]:
# Grid search over SARIMAX(p, d, q)(P, D, Q, s) with p, q, P, Q in {0, 1} and
# fixed d = D = 1, s = 12. The configuration with the lowest in-sample MAE
# (fitted values vs. train_y) is kept.
# NOTE(review): in-sample MAE tends to favour the larger models; an information
# criterion or a validation split may be a better selector — confirm.
# NOTE(review): train_y is already seasonally and first differenced, and
# d = D = 1 differences it again inside the model — confirm this is intended.
p = q = P = Q = range(0, 2)
d, D, s = 1, 1, 12

best_mae = np.inf
best_order = None
best_seasonal_order = None
failed_configs = []  # (order, seasonal_order, error) for every fit that raised

for order in itertools.product(p, [d], q):
    for seasonal in itertools.product(P, [D], Q):
        # `seasonal` is the tuple (P, D, Q); append the period to get the full
        # seasonal order. Taking seasonal[1] as Q would pick up D instead, so
        # the seasonal MA order would never vary.
        seasonal_order = (*seasonal, s)
        try:
            model = SARIMAX(train_y, exog=train_exog,
                            order=order, seasonal_order=seasonal_order,
                            enforce_stationarity=False, enforce_invertibility=False)
            result = model.fit(disp=False)
            mae = mean_absolute_error(train_y, result.fittedvalues)
        except Exception as exc:
            # Skip configurations that cannot be fitted or scored, but keep a
            # record of them; KeyboardInterrupt/SystemExit are not swallowed.
            failed_configs.append((order, seasonal_order, repr(exc)))
            continue
        if mae < best_mae:
            best_mae = mae
            best_order = order
            best_seasonal_order = seasonal_order

if best_order is None:
    # Fail here with the collected errors instead of passing order=None on to
    # the final model cell.
    raise RuntimeError(f"No SARIMAX configuration could be fitted: {failed_configs}")

if failed_configs:
    print(f"Skipped {len(failed_configs)} configuration(s) that failed to fit.")

print("\n✅ Best params:")
print(f" - order: {best_order}")
print(f" - seasonal_order: {best_seasonal_order}")

In [None]:
# Refit on the training data using the orders chosen by the grid search,
# with the same estimation options as used there.
sarimax_options = dict(
    order=best_order,
    seasonal_order=best_seasonal_order,
    enforce_stationarity=False,
    enforce_invertibility=False,
)
final_model = SARIMAX(train_y, exog=train_exog, **sarimax_options)
final_result = final_model.fit(disp=False)



In [None]:
# Forecast one step per test row, supplying the test-period exogenous regressors.
n_test_steps = len(test)
forecast_log_diff = final_result.forecast(steps=n_test_steps, exog=test_exog)

In [None]:
# Store the forecasts positionally on a copy of the test frame (raw values, so no index alignment).
test = test.assign(forecast_log_diff=forecast_log_diff.to_numpy())

In [None]:
# Invert the double differencing to get forecasts back on the log scale.
#
# With L = log_value, the modelled target is built by .diff(12) followed by
# .diff(), i.e.
#     y_t = (L_t - L_{t-12}) - (L_{t-1} - L_{t-13})
# so each level is recovered as
#     L_t = y_t + L_{t-1} + (L_{t-12} - L_{t-13}).
# That needs the 13 most recent log levels, not 12. The history is rebuilt
# here together with the loop so re-running the cell starts from a clean state
# instead of appending to lists left over from a previous run.
n_history = 13  # seasonal period (12) + 1 for the extra first difference
history_log = list(train['log_value'].iloc[-n_history:])
if len(history_log) < n_history:
    raise ValueError(
        f"Need at least {n_history} training rows to invert the differencing, "
        f"got {len(history_log)}"
    )
predicted_log_values = []

for i in range(len(test)):
    # history_log[-1] is L_{t-1}, [-12] is L_{t-12}, [-13] is L_{t-13}
    seasonal_lag = history_log[-12] - history_log[-13]
    pred = test['forecast_log_diff'].iloc[i] + history_log[-1] + seasonal_lag
    predicted_log_values.append(pred)
    history_log.append(pred)  # later steps build on the predicted levels

In [None]:
# Undo the log transform on the reconstructed forecasts.
predicted_values = np.exp(np.asarray(predicted_log_values))


In [None]:
# Undo the log transform on the observed test values for a like-for-like comparison.
actual_values = np.exp(test['log_value'].to_numpy())


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Error metrics on the original scale: MAE, RMSE, MAPE (in percent) and R².
forecast_error = actual_values - predicted_values
mae = mean_absolute_error(actual_values, predicted_values)
rmse = np.sqrt(mean_squared_error(actual_values, predicted_values))
mape = np.mean(np.abs(forecast_error / actual_values)) * 100
r2 = r2_score(actual_values, predicted_values)

print("\n📊 📍 Evaluasi Test Set (skala asli):")
print(f" - MAE  : {mae:.2f}")
print(f" - RMSE : {rmse:.2f}")
print(f" - MAPE : {mape:.2f}%")
print(f" - R²   : {r2:.4f}")

# Prediction results: actual and forecast side by side, indexed by the test dates.
result_columns = {'actual': actual_values, 'forecast': predicted_values}
df_result = pd.DataFrame(result_columns, index=test.index)
print("\n✅ Hasil prediksi (5 data teratas):")
print(df_result.head())

In [None]:
# Show the frame again; by now the cells above have added derived columns to it
# (z_score, outlier, year, log_value, log_diff_seasonal, log_diff_seasonal_diff).
df