# Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Implementation with Non-Machine Learning Methods

### Import Data

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
metered_path = "/content/drive/MyDrive/DeepLearning/GAN/metered_data.xlsx"
df = pd.read_excel(metered_path)

In [3]:
pjm_2024 = df[df['year'] == 2024].copy().reset_index(drop=True)
pjm_2024 = pjm_2024.iloc[:100]

# Extract the 'mw' values as a NumPy array
series = pjm_2024['mw'].values.astype(np.float32)

# Normalize (optional but helps training)
mean, std = np.mean(series), np.std(series)
series_norm = (series - mean) / std

# Function to create overlapping masked sequences
def prepare_pjm_sequences(series, seq_len=32, step=8, missing_rate=0.2, placeholder=-1.0):
    sequences, masked_seqs, masks = [], [], []
    for i in range(0, len(series) - seq_len, step):
        seq = series[i:i+seq_len]
        mask = np.ones(seq_len)
        masked = seq.copy()
        miss_idx = np.random.choice(seq_len, size=int(missing_rate * seq_len), replace=False)
        masked[miss_idx] = placeholder
        mask[miss_idx] = 0
        sequences.append(seq)
        masked_seqs.append(masked)
        masks.append(mask)
    return (
        np.array(masked_seqs)[..., np.newaxis].astype(np.float32),
        np.array(sequences)[..., np.newaxis].astype(np.float32),
        np.array(masks)[..., np.newaxis].astype(np.float32)
    )

# Create overlapping sequences
X_input, Y_true, mask = prepare_pjm_sequences(series_norm, seq_len=32, step=8)

# Sanity check
print("Shape of input:", X_input.shape)
print("Example masked sequence:", X_input[0].flatten())

Shape of input: (9, 32, 1)
Example masked sequence: [-1.2747332  -1.3608261  -1.4758302  -1.4954778  -1.362427   -1.1194233
 -0.84764266 -0.66690636 -0.6161491  -0.6826831  -1.         -0.7524867
 -0.7695219  -0.76666176 -0.72652274 -1.         -1.         -1.
 -1.          0.45319468  0.25726914 -0.02461262 -0.5140798  -1.
 -1.1802894  -1.3332347  -1.3548298  -1.2266588  -0.8606255  -0.16787215
  0.7679667   1.3874413 ]


### Mean / Median / Linear Imputations

In [4]:
missing_rate_list = [0.2, 0.3, 0.4, 0.5]
classical_results = []

for missing_rate in missing_rate_list:
    print(f"\n📌 Running classical imputations with missing rate = {missing_rate:.1f}")

    # === Generate masked sequences for current missing rate ===
    X_input, Y_true, mask = prepare_pjm_sequences(
        series_norm, seq_len=32, step=8, missing_rate=missing_rate
    )

    # === Denormalize for human-readable scale ===
    X_input_orig = X_input * std + mean
    Y_true_orig = Y_true * std + mean

    # === Create placeholders ===
    mean_imputed = X_input_orig.copy()
    median_imputed = X_input_orig.copy()
    linear_imputed = X_input_orig.copy()
    rolling_imputed = X_input_orig.copy()

    for i in range(X_input_orig.shape[0]):
        seq = X_input_orig[i, :, 0]
        mask_seq = mask[i, :, 0].astype(bool)

        # Mean Imputation
        mean_val = seq[mask_seq].mean()
        mean_imputed[i, ~mask_seq, 0] = mean_val

        # Median Imputation
        median_val = np.median(seq[mask_seq])
        median_imputed[i, ~mask_seq, 0] = median_val

        # Linear Interpolation (no padding)
        temp_seq = seq.copy()
        temp_seq[~mask_seq] = np.nan
        linear_filled = pd.Series(temp_seq).interpolate(method='linear', limit_area='inside').values
        linear_imputed[i, :, 0] = linear_filled

        # Rolling Mean (no padding)
        temp_seq = seq.copy()
        temp_seq[~mask_seq] = np.nan
        rolling_series = pd.Series(temp_seq)
        rolling_filled = rolling_series.fillna(
            rolling_series.rolling(window=3, min_periods=1, center=True).mean()
        )
        rolling_imputed[i, :, 0] = rolling_filled.values

    # === Evaluation function ===
    def evaluate_imputation_no_padding(imputed, true, mask):
        condition = (mask == 0) & ~np.isnan(imputed)
        true_vals = true[condition]
        imputed_vals = imputed[condition]
        rmse = np.sqrt(np.mean((true_vals - imputed_vals) ** 2))
        mae = np.mean(np.abs(true_vals - imputed_vals))
        return rmse, mae

    # === Evaluate all methods for this missing rate ===
    for method_name, imputed_array in [
        ("Mean", mean_imputed),
        ("Median", median_imputed),
        ("Linear", linear_imputed),
        ("Rolling", rolling_imputed),
    ]:
        rmse, mae = evaluate_imputation_no_padding(imputed_array, Y_true_orig, mask)
        classical_results.append({
            "Missing Rate": missing_rate,
            "Method": method_name,
            "RMSE": round(float(rmse), 4),
            "MAE": round(float(mae), 4)
        })


📌 Running classical imputations with missing rate = 0.2

📌 Running classical imputations with missing rate = 0.3

📌 Running classical imputations with missing rate = 0.4

📌 Running classical imputations with missing rate = 0.5


# Evaluate

In [7]:
df_classical = pd.DataFrame(classical_results)
df_classical = df_classical.sort_values(by=["Missing Rate", "RMSE"]).reset_index(drop=True)
df_classical.to_csv("Classical_Imputation_Results.csv", index=False)
df_classical.head(15)

Unnamed: 0,Missing Rate,Method,RMSE,MAE
0,0.2,Linear,223.0207,176.3759
1,0.2,Rolling,280.4501,190.6692
2,0.2,Mean,1259.1753,989.9136
3,0.2,Median,1280.4517,982.9071
4,0.3,Linear,221.4252,180.8022
5,0.3,Rolling,424.2827,286.5014
6,0.3,Mean,1181.7408,950.2992
7,0.3,Median,1199.5634,967.9546
8,0.4,Linear,242.461,185.6984
9,0.4,Rolling,381.5441,257.4945
