### Import Library

In [None]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

### Load Dataset

In [None]:
# Load the dataset from a CSV file (path is relative to the working directory)
df = pd.read_csv("dataset/dataset_laporan_iklim_harian.csv")
# List the column names found in the file
df.columns

### EDA Dasar

In [None]:
# Check the dataset size as (rows, columns)
df.shape

In [None]:
# General dataset info
print("\n== Info Dataset ==")
df.info()

# Show the first 20 rows of the dataset
print("\n== 20 Baris Pertama Dataset ==")
df.head(20)

In [None]:
# Descriptive statistics of the dataset
print("\n== Statistik Deskriptif ==")
df.describe()

In [None]:
# Count missing values per column
print("\n== Cek Missing Values ==")
df.isnull().sum()

In [None]:
# Count, per column, how many entries are exactly 0
print("\n== Cek Total Data Nol ==")
(df == 0).sum()

In [None]:
# Count fully duplicated rows
df.duplicated().sum()

### Parsing Tanggal & Time Series Index

In [None]:
# Parse the TANGGAL column into datetimes. The dates are written as
# dd-mm-yyyy, so dayfirst=True is required to read the day before the month.
parsed_dates = pd.to_datetime(df["TANGGAL"], dayfirst=True)

# Use the parsed dates as a chronologically sorted index
# (the usual layout for time-series work).
df = df.assign(TANGGAL=parsed_dates).set_index("TANGGAL").sort_index()

### Normalisasi Nilai Tidak Valid BMKG (8888 & 9999)

In [None]:
# Numeric climate columns that are checked for BMKG invalid-value codes
climate_cols = ["TN", "TX", "TAVG", "RH_AVG", "RR", "FF_X", "FF_AVG"]

# 8888 and 9999 are BMKG codes for invalid values, not measurements:
# turn them into NaN so they are treated as missing from here on.
invalid_codes = [8888, 9999]
normalized = df[climate_cols].replace(invalid_codes, np.nan)
df[climate_cols] = normalized

### Penanganan Nilai 0 (BERBASIS DOMAIN)

In [None]:
# For these features a value of 0 is treated as a recording error.
# RR is deliberately not listed, so its zeros are kept.
zero_as_nan_cols = ["TN", "TX", "TAVG", "RH_AVG", "FF_X", "FF_AVG"]

# Replace every exact 0 in the listed columns with NaN in one vectorised step
df[zero_as_nan_cols] = df[zero_as_nan_cols].replace(0, np.nan)

### Validasi Missing Value

In [None]:
# Re-count missing values per column after the invalid codes and zeros
# were converted to NaN
print("\n== Cek Missing Values ==")
df.isnull().sum()

### Interpolasi Time Series

In [None]:
# Interpolate missing values with the "time" method, which weights by the
# spacing of the datetime index.
# NOTE(review): with pandas' default forward direction, NaNs that precede the
# first valid value of a column are presumably left as NaN - confirm with the
# missing-value check that follows.
df_clean = df.interpolate(method="time")
df_clean.shape

### Penanganan Akhir Curah Hujan (RR)

In [None]:
# Set any RR values that are still NaN after interpolation to 0
df_clean["RR"] = df_clean["RR"].fillna(0)

### Final Validation

In [None]:
# Count missing values per column after cleaning
print("\n== Cek Missing Values Setelah Cleaning ==")
df_clean.isna().sum()

In [None]:
# Display the first 10 rows of the cleaned data
df_clean.head(10)

### Plot Suhu Minimum (TN)

In [None]:
# Line chart of the minimum temperature (TN) against the date index
fig, ax = plt.subplots(figsize=(15, 4))
ax.plot(df_clean.index, df_clean['TN'], color='blue')
ax.set_title('Suhu Minimum (TN)')
ax.set_ylabel('Suhu (°C)')
ax.grid(True, alpha=0.3)
plt.show()

### Plot Suhu Maksimum (TX)

In [None]:
# Line chart of the maximum temperature (TX) against the date index
fig, ax = plt.subplots(figsize=(15, 4))
ax.plot(df_clean.index, df_clean['TX'], color='purple')
ax.set_title('Suhu Maksimum (TX)')
ax.set_ylabel('Suhu (°C)')
ax.grid(True, alpha=0.3)
plt.show()

### Tren Temperatur Rata-rata Harian (TAVG)

In [None]:
# Trend of the daily average temperature (TAVG) over time
fig, ax = plt.subplots(figsize=(15, 4))
ax.plot(df_clean.index, df_clean["TAVG"])
ax.set_title("Tren Temperatur Rata-rata Harian (TAVG)")
ax.set_xlabel("Tanggal")
ax.set_ylabel("Temperatur (°C)")
plt.show()

### Distribusi Temperatur Rata-rata (TAVG)

In [None]:
# Histogram (30 bins) of the daily average temperature (TAVG)
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(df_clean["TAVG"], bins=30)
ax.set_title("Distribusi Temperatur Rata-rata Harian")
ax.set_xlabel("Temperatur (°C)")
ax.set_ylabel("Frekuensi")
plt.show()


### Tren TN, TX, dan TAVG (Perbandingan)

In [None]:
# Overlay the minimum, maximum and average temperature series on one chart.
# The drawing order (TN, TX, TAVG) is kept so each line gets the same
# default colour as before.
fig, ax = plt.subplots(figsize=(15, 4))
ax.plot(df_clean.index, df_clean["TN"], label="Tn (Min)")
ax.plot(df_clean.index, df_clean["TX"], label="Tx (Max)")
ax.plot(df_clean.index, df_clean["TAVG"], label="Tavg")
ax.legend()
ax.set_title("Perbandingan Temperatur Minimum, Maksimum, dan Rata-rata")
ax.set_xlabel("Tanggal")
ax.set_ylabel("Temperatur (°C)")
plt.show()


### Pola Musiman (Bulanan)

In [None]:
# Mean TAVG for each calendar month (1-12), pooled over all years
monthly_avg = df_clean["TAVG"].groupby(df_clean.index.month).mean()

# Bar chart of the monthly means
fig, ax = plt.subplots(figsize=(10, 6))
monthly_avg.plot(kind="bar", ax=ax)
ax.set_title("Rata-rata Temperatur per Bulan")
ax.set_xlabel("Bulan")
ax.set_ylabel("Temperatur Rata-rata (°C)")
plt.show()


### Curah Hujan vs Temperatur

In [None]:

# Scatter plot: rainfall (RR) vs average temperature (TAVG)

# Correlation between RR and TAVG, shown in the chart title
corr_val = df_clean['RR'].corr(df_clean['TAVG'])

fig, ax = plt.subplots(figsize=(10, 6))

# seaborn's regplot draws the scatter points and a fitted trend line together
sns.regplot(
    x="RR",
    y="TAVG",
    data=df_clean,
    scatter_kws={'alpha':0.5, 'color':'#1f77b4'},
    line_kws={'color':'red', 'linewidth':2},
    ax=ax,
)

ax.set_xlabel("Curah Hujan (mm)")
ax.set_ylabel("Temperatur Rata-rata (°C)")
ax.set_title(f"Hubungan Curah Hujan vs Temperatur (Korelasi: {corr_val:.2f})", fontsize=14)
ax.grid(True, alpha=0.3)
plt.show()

# Print the 2x2 correlation matrix to confirm the number in the title
print("Correlation Matrix Spesifik:")
print(df_clean[["RR", "TAVG"]].corr())

### Correlation Matrix Variabel Iklim

In [None]:
# Correlation matrix of the climate variables, drawn as an annotated heatmap
climate_features = [
    "TN", "TX", "TAVG", "RH_AVG", "RR", "FF_X", "FF_AVG"
]
corr_matrix = df_clean[climate_features].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    square=True,
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Matrix Variabel Iklim")
fig.tight_layout()
plt.show()

In [None]:
# Feature engineering for TAVG forecasting: work on a copy so df_clean
# stays untouched.
df_fe = df_clean.copy()

# Lagged targets: TAVG of the previous row and of the row before that
df_fe["TAVG_lag1"] = df_fe["TAVG"].shift(1)
df_fe["TAVG_lag2"] = df_fe["TAVG"].shift(2)

# Mean TAVG of the 7 rows *before* the current one. Shifting first keeps the
# current row's TAVG (the prediction target) out of its own feature window;
# a plain rolling(7) would include it and leak the label into the features.
df_fe["TAVG_roll7"] = df_fe["TAVG"].shift(1).rolling(7).mean()

# Calendar month (1-12) taken from the datetime index
df_fe["month"] = df_fe.index.month

# Drop rows with any NaN, which includes the leading rows where the
# lag/rolling features cannot be computed yet
df_fe = df_fe.dropna()

In [None]:
# Chronological split: the first 80% of rows train, the remaining 20% test
# (no shuffling, so the test rows come after the training rows)
train_size = int(len(df_fe) * 0.8)
train, test = df_fe.iloc[:train_size], df_fe.iloc[train_size:]

# Separate the target (TAVG) from the remaining feature columns
x_train, y_train = train.drop(columns=["TAVG"]), train["TAVG"]
x_test, y_test = test.drop(columns=["TAVG"]), test["TAVG"]


In [None]:
# Naive forecasting: the previous row's TAVG is used as the prediction
y_pred_naive = test["TAVG_lag1"]

# MAE, and RMSE as the square root of the mean squared error
naive_mse = mean_squared_error(y_test, y_pred_naive)
mae_naive = mean_absolute_error(y_test, y_pred_naive)
rmse_naive = np.sqrt(naive_mse)

print(f"MAE Naive Forecasting: {mae_naive:.4f}")
print(f"RMSE Naive Forecasting: {rmse_naive:.4f}")


In [None]:
# ==============================================================================
# 5. MODEL COMPARISON: NAIVE VS LINEAR REGRESSION (SEASONAL)
# ==============================================================================


def _mae_rmse(actual, predicted):
    """Return the (MAE, RMSE) pair of `predicted` measured against `actual`."""
    mae = mean_absolute_error(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    return mae, rmse


# 1. Build the evaluation frame: a numeric date for the trend plus a sin/cos
#    encoding of the calendar month for seasonality
df_eval = df_clean.copy()
df_eval['Date_Ordinal'] = df_eval.index.map(pd.Timestamp.toordinal)
df_eval['Month'] = df_eval.index.month
month_angle = 2 * np.pi * df_eval['Month'] / 12
df_eval['sin_month'] = np.sin(month_angle)
df_eval['cos_month'] = np.cos(month_angle)

# 2. Chronological split (80% train, 20% test)
train_size = int(len(df_eval) * 0.8)
train_data, test_data = df_eval.iloc[:train_size], df_eval.iloc[train_size:]

# Features and target
features = ['Date_Ordinal', 'sin_month', 'cos_month']
X_train, y_train = train_data[features], train_data['TAVG']
X_test, y_test = test_data[features], test_data['TAVG']

# 3. Fit the linear regression on the training rows and predict the test rows
model_validator = LinearRegression()
model_validator.fit(X_train, y_train)
y_pred_lr_test = model_validator.predict(X_test)

# 4. Rebuild the naive forecast inside the test window (previous test row)
y_pred_naive_new = test_data['TAVG'].shift(1)

# === ALIGNMENT ===
# shift(1) leaves NaN in the first test row, so that row is removed from the
# actuals, the naive forecast and the regression output alike. All three then
# have the same length and cover the same dates.
y_test_final = y_test.iloc[1:]
y_pred_naive_final = y_pred_naive_new.iloc[1:]
y_pred_lr_final = y_pred_lr_test[1:]

# 5. Error metrics on the aligned data
mae_naive_final, rmse_naive_final = _mae_rmse(y_test_final, y_pred_naive_final)
mae_lr_final, rmse_lr_final = _mae_rmse(y_test_final, y_pred_lr_final)

# 6. Comparison table
comparison_df = pd.DataFrame({
    'Model': ['Naive Forecast', 'Linear Regression (Seasonal)'],
    'MAE': [mae_naive_final, mae_lr_final],
    'RMSE': [rmse_naive_final, rmse_lr_final]
})

print("== TABEL PERBANDINGAN PERFORMA MODEL (DATA TEST) ==")
print(comparison_df)

# 7. Grouped bar chart of the errors (MAE and RMSE side by side per model)
fig, ax = plt.subplots(figsize=(8, 5))
bar_pos = np.arange(len(comparison_df['Model']))
bar_width = 0.35

ax.bar(bar_pos - bar_width/2, comparison_df['MAE'], bar_width, label='MAE', color='skyblue')
ax.bar(bar_pos + bar_width/2, comparison_df['RMSE'], bar_width, label='RMSE', color='orange')
ax.set_title('Perbandingan Error Model')
ax.set_xticks(bar_pos)
ax.set_xticklabels(comparison_df['Model'])
ax.legend()
ax.grid(axis='y', alpha=0.3)
plt.show()

# 8. Head-to-head prediction plot, drawn from the aligned (_final) series
fig, ax = plt.subplots(figsize=(15, 5))
aligned_dates = y_test_final.index
ax.plot(aligned_dates, y_test_final, label='Aktual', color='gray', alpha=0.5)
ax.plot(aligned_dates, y_pred_naive_final, label='Naive Forecast', linestyle='--', color='orange')
ax.plot(aligned_dates, y_pred_lr_final, label='Linear Regression (Seasonal)', linewidth=2, color='blue')

ax.set_title('Head-to-Head: Naive vs Linear Regression pada Data Test')
ax.legend()
plt.show()

In [None]:
# Actual vs naive (previous-row) forecast on the lag-feature test split.
# Both series are read from `test`, the split `y_pred_naive` was built from,
# so they share one index. The global `y_test` is not used here because the
# model-comparison cell reassigns it to a different split, whose length and
# dates need not match `y_pred_naive`.
naive_actual = test["TAVG"]

plt.figure(figsize=(15, 4))
plt.plot(naive_actual.index, naive_actual, label="Aktual")
plt.plot(y_pred_naive.index, y_pred_naive, label="Prediksi (Naive)")
plt.legend()
plt.title("Perbandingan Aktual vs Prediksi Temperatur")
plt.xlabel("Tanggal")
plt.ylabel("Temperatur (°C)")
plt.show()


In [None]:
# ==============================================================================
# 5. MODEL COMPARISON: NAIVE VS LINEAR REGRESSION (SEASONAL)
# ==============================================================================

# 1. Build the evaluation frame with the same features used for forecasting:
#    a numeric date for the trend and a sin/cos encoding of the calendar month
df_eval = df_clean.copy()
df_eval['Date_Ordinal'] = df_eval.index.map(pd.Timestamp.toordinal)
df_eval['Month'] = df_eval.index.month
df_eval['sin_month'] = np.sin(2 * np.pi * df_eval['Month'] / 12)
df_eval['cos_month'] = np.cos(2 * np.pi * df_eval['Month'] / 12)

# 2. Chronological split (80% train, 20% test)
train_size = int(len(df_eval) * 0.8)
train_data = df_eval.iloc[:train_size]
test_data = df_eval.iloc[train_size:]

# Features and target
features = ['Date_Ordinal', 'sin_month', 'cos_month']
X_train = train_data[features]
y_train = train_data['TAVG']
X_test = test_data[features]
y_test = test_data['TAVG']

# 3. Fit the evaluation model on the training rows only
model_validator = LinearRegression()
model_validator.fit(X_train, y_train)

# 4. Predict the test rows
y_pred_lr_test = model_validator.predict(X_test)

# 5. Naive (previous-row) forecast on exactly the same test rows.
#    The shift happens before slicing, so the first test row is predicted
#    from the last training row and no test row has to be dropped.
#    The earlier mae_naive / rmse_naive / y_pred_naive come from a different
#    split (df_fe, which is shorter after dropna), so they are not reused:
#    mixing them in would compare the models on different rows and plot
#    series with mismatched indexes.
y_pred_naive_eval = df_eval['TAVG'].shift(1).iloc[train_size:]

# 6. Error metrics (MAE & RMSE) for both models on the shared test rows
mae_naive_eval = mean_absolute_error(y_test, y_pred_naive_eval)
rmse_naive_eval = np.sqrt(mean_squared_error(y_test, y_pred_naive_eval))
mae_lr = mean_absolute_error(y_test, y_pred_lr_test)
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr_test))

# 7. Comparison table
comparison_df = pd.DataFrame({
    'Model': ['Naive Forecast', 'Linear Regression (Seasonal)'],
    'MAE': [mae_naive_eval, mae_lr],
    'RMSE': [rmse_naive_eval, rmse_lr]
})

print("== TABEL PERBANDINGAN PERFORMA MODEL (DATA TEST) ==")
print(comparison_df)

# 8. Grouped bar chart of the errors (MAE and RMSE side by side per model)
plt.figure(figsize=(8, 5))
x = np.arange(len(comparison_df['Model']))
width = 0.35

plt.bar(x - width/2, comparison_df['MAE'], width, label='MAE', color='skyblue')
plt.bar(x + width/2, comparison_df['RMSE'], width, label='RMSE', color='orange')

plt.xlabel('Model')
plt.ylabel('Error (Semakin Kecil Semakin Baik)')
plt.title('Perbandingan Error Model (MAE & RMSE)')
plt.xticks(x, comparison_df['Model'])
plt.legend()
plt.grid(axis='y', alpha=0.3)
plt.show()

# 9. Predictions on the test rows; every series shares y_test's index
plt.figure(figsize=(15, 5))
plt.plot(y_test.index, y_test, label='Aktual', color='gray', alpha=0.5)
plt.plot(y_test.index, y_pred_naive_eval, label='Naive Forecast', linestyle='--', color='orange')
plt.plot(y_test.index, y_pred_lr_test, label='Linear Regression (Seasonal)', linewidth=2, color='blue')
plt.title('Head-to-Head: Naive vs Linear Regression pada Data Test')
plt.legend()
plt.show()

### PREDIKSI MASA DEPAN (SEASONAL LINEAR REGRESSION)
Pada tahap ini, kita melakukan forecasting untuk 20 hari ke depan.
Kita tidak hanya menggunakan tren garis lurus (linear), tetapi menambahkan fitur Seasonality (Musiman) menggunakan transformasi Sinus dan Cosinus pada bulan. Hal ini agar prediksi suhu bisa menangkap pola naik-turun yang alami.

### Feature Engineering

In [None]:
# Feature engineering: add trend and seasonal features
df_forecast = df_clean.copy()

# Global trend feature: the date as an integer day number
# (Timestamp.toordinal, i.e. the proleptic Gregorian ordinal) so the
# regression can use time as a numeric input
df_forecast['Date_Ordinal'] = df_forecast.index.map(pd.Timestamp.toordinal)

# Seasonal (cyclical) features: encode the month on a circle with sin/cos so
# that month 12 (December) sits next to month 1 (January)
df_forecast['Month'] = df_forecast.index.month
month_angle = 2 * np.pi * df_forecast['Month'] / 12
df_forecast['sin_month'] = np.sin(month_angle)
df_forecast['cos_month'] = np.cos(month_angle)

# Preview the first 5 rows to confirm the features were created
print("Fitur Siap:")
preview_cols = ['TAVG', 'Date_Ordinal', 'sin_month', 'cos_month']
df_forecast[preview_cols].head()

### Training Model

In [None]:
# Train the linear regression on ALL available rows (no hold-out here),
# so the fit can capture the long-term trend

# Feature matrix (x) and target (y)
features = ['Date_Ordinal', 'sin_month', 'cos_month']
x, y = df_forecast[features], df_forecast['TAVG']

# Create the model and fit it
model_lr_season = LinearRegression()
model_lr_season.fit(x, y)

### Generate Data Masa Depan

In [None]:
# Prepare the future rows (20 days)

# Forecast horizon, and the last date present in the data
future_days = 20
last_date = df_forecast.index[-1]

# New date range starting the day after the last observation
first_future_day = last_date + pd.Timedelta(days=1)
future_dates = pd.date_range(start=first_future_day, periods=future_days)

# Empty frame indexed by the future dates
df_future = pd.DataFrame(index=future_dates)

# Same feature engineering as in training: the columns must match `features`
df_future['Date_Ordinal'] = df_future.index.map(pd.Timestamp.toordinal)
df_future['Month'] = df_future.index.month
future_angle = 2 * np.pi * df_future['Month'] / 12
df_future['sin_month'] = np.sin(future_angle)
df_future['cos_month'] = np.cos(future_angle)

# Predict TAVG for every future date
future_features = df_future[features]
df_future['Prediksi_TAVG'] = model_lr_season.predict(future_features)

### Visualisasi Hasil

In [None]:
# Visualise the forecasting result

fig, ax = plt.subplots(figsize=(15, 6))

# Only the last 365 rows are drawn so the chart does not get too dense
recent = slice(-365, None)

# Actual data
ax.plot(df_forecast.index[recent], df_forecast['TAVG'].iloc[recent],
        label='Data Aktual (1 Tahun Terakhir)', color='gray', alpha=0.5)

# Model output on the historical rows, to show that the fit follows the
# seasonal wave rather than a straight line
y_pred_history = model_lr_season.predict(x)
ax.plot(df_forecast.index[recent], y_pred_history[recent],
        color='red', linestyle='--', label='Pola Model (Trend + Seasonality)')

# Future predictions (the final result)
ax.plot(df_future.index, df_future['Prediksi_TAVG'],
        color='green', linewidth=3, label=f'Prediksi {future_days} Hari Depan')

# Chart cosmetics
ax.set_title('Forecasting Suhu Rata-rata: Tren Global + Pola Musiman', fontsize=14)
ax.set_xlabel('Tanggal')
ax.set_ylabel('Suhu Rata-rata (°C)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

# Prediction table, rounded to 2 decimals
print(f"\n== Tabel Prediksi Suhu {future_days} Hari Ke Depan ==")
display_df = df_future[['Prediksi_TAVG']].round(2)

# Show the first 10 days
print(display_df.head(10))

In [None]:
# NOTE: every line in this cell is commented out (disabled draft).
# # 1. Data preparation for Linear Regression (needs numeric dates)
# # Uses df_clean, the final output of the cleaning stage
# df_linreg = df_clean.copy()
# df_linreg['Date_Ordinal'] = df_linreg.index.map(pd.Timestamp.toordinal)

# # Define X (feature) and y (target)
# x_trend = df_linreg[['Date_Ordinal']]
# y_trend = df_linreg['TAVG']

# # 2. Train the Linear Regression model (on the full data)
# model_lr = LinearRegression()
# model_lr.fit(x_trend, y_trend)

# # Short evaluation of the linear model (optional, for the report)
# y_pred_history = model_lr.predict(x_trend)
# mae_lr = mean_absolute_error(y_trend, y_pred_history)
# rmse_lr = np.sqrt(mean_squared_error(y_trend, y_pred_history))
# print(f"MAE Linear Regression (Trend Fitting): {mae_lr:.4f}")
# print(f"RMSE Linear Regression (Trend Fitting): {rmse_lr:.4f}")


In [None]:
# NOTE: every line in this cell is commented out (disabled draft).
# # ------------------------------------------------------------------------------
# # 3. GENERATE THE NEXT 20 DAYS
# # ------------------------------------------------------------------------------

# # Number of days to forecast
# future_days = 20
# last_date = df_linreg.index[-1]

# # Future date range (starting the day after the last observation)
# future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=future_days)

# # DataFrame for the future dates
# df_future = pd.DataFrame(index=future_dates)
# df_future['Date_Ordinal'] = df_future.index.map(pd.Timestamp.toordinal)

# # Run the prediction
# future_pred = model_lr.predict(df_future[['Date_Ordinal']])
# df_future['Prediksi_TAVG'] = future_pred


In [None]:
# NOTE: every line in this cell is commented out (disabled draft).
# # ------------------------------------------------------------------------------
# # 4. VISUALISE THE RESULT (STYLED LIKE A FRIEND'S NOTEBOOK)
# # ------------------------------------------------------------------------------

# plt.figure(figsize=(15, 6))

# # Plot A: actual data (blue)
# plt.plot(df_linreg.index, df_linreg['TAVG'], label='Data Aktual (BMKG)', color='blue', alpha=0.5)

# # Plot B: linear trend line over the historical data (red dashed)
# plt.plot(df_linreg.index, y_pred_history, color='red', linestyle='--', linewidth=2, label='Garis Tren Linear')

# # Plot C: future predictions (thick green)
# plt.plot(df_future.index, df_future['Prediksi_TAVG'], color='green', linewidth=4, label=f'Prediksi {future_days} Hari Depan')

# plt.title(f'Prediksi Suhu Rata-rata {future_days} Hari Ke Depan (Linear Trend)', fontsize=14)
# plt.xlabel('Tanggal')
# plt.ylabel('Suhu Rata-rata (°C)')
# plt.legend()
# plt.grid(True, alpha=0.3)
# plt.show()

In [None]:
# NOTE: every line in this cell is commented out (disabled draft).
# # ------------------------------------------------------------------------------
# # 5. SHOW THE PREDICTION TABLE
# # ------------------------------------------------------------------------------
# print(f"\n== Tabel Hasil Prediksi {future_days} Hari Ke Depan ==")
# # Format the output so it reads cleanly
# display_df = df_future[['Prediksi_TAVG']].copy()
# display_df['Prediksi_TAVG'] = display_df['Prediksi_TAVG'].round(2) # round to 2 decimals
# print(display_df)

# # Save the predictions to CSV (optional)
# # display_df.to_csv('hasil_prediksi_20_hari.csv')