In [None]:
import pandas as pd

# Load the raw data, parse the timestamp column, and put the rows in
# chronological order; the shift-based lag features below depend on row order.
df = (
    pd.read_csv("./LSTM-Multivariate_pollution.csv")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values("date")
    .reset_index(drop=True)
)

# Calendar features derived from the timestamp.
df["hour"] = df["date"].dt.hour
df["weekday"] = df["date"].dt.weekday  # 0 = Monday ... 6 = Sunday
df["is_weekend"] = (df["weekday"] >= 5).astype(int)  # Saturday/Sunday -> 1

# Encode the `wnd_dir` column as integer category codes
# (pandas uses -1 for missing values).
df["wnd_dir"] = df["wnd_dir"].astype("category").cat.codes

# Lagged copies of `pollution`: the value found `lag` rows earlier. The first
# `lag` rows of each new column are NaN because no earlier row exists.
for lag in (1, 3, 6, 24):
    df[f"pollution_lag{lag}"] = df["pollution"].shift(lag)

# add classification features
def classify_aqi(x):
    """Map a pollution reading to an ordinal level between 0 and 4.

    Levels use inclusive upper bounds:

        0: x <= 50    (very good)
        1: x <= 100   (good)
        2: x <= 150   (lightly polluted)
        3: x <= 200   (moderately polluted)
        4: otherwise  (heavily polluted)

    Note that a NaN input fails every comparison and therefore also ends up
    in level 4.

    Args:
        x: Numeric pollution value.

    Returns:
        int: The level, 0 through 4.
    """
    upper_bounds = (50, 100, 150, 200)
    # The first bound that x does not exceed determines the level.
    for level, bound in enumerate(upper_bounds):
        if x <= bound:
            return level
    # Above every bound (or not comparable, e.g. NaN): highest level.
    return len(upper_bounds)

# Label every row with its pollution level (0-4) using classify_aqi.
df["pollution_level"] = df["pollution"].map(classify_aqi)

# Discard rows that have a missing value in any column (this includes the
# leading rows whose lag features are NaN), then renumber the index from 0.
df = df.dropna()
df = df.reset_index(drop=True)

# Persist the engineered table without the index column.
df.to_csv("processed_pollution_multivariate.csv", index=False)
print("saved as: processed_pollution_multivariate.csv")


✅ 已保存为 processed_pollution_multivariate.csv


In [None]:
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): the first cell writes "processed_pollution_multivariate.csv"
# (no "climate_" prefix) — confirm this input file is produced elsewhere.
df_climate = pd.read_csv("climate_processed_pollution_multivariate.csv")

# Columns to rescale: the pollution value, wind speed, and the lag features.
cols_to_normalize_climate = [
    "pollution",
    "wnd_spd",
    "pollution_lag1",
    "pollution_lag3",
    "pollution_lag6",
    "pollution_lag24",
]

# Min-max scaling with scikit-learn's default settings (each column -> [0, 1]).
# NOTE(review): the scaler is fit on every row of the file; if a train/test
# split is made later, fit on the training portion only to avoid leakage.
scaler_climate = MinMaxScaler()
normalized_values_climate = scaler_climate.fit_transform(
    df_climate[cols_to_normalize_climate]
)

# Wrap the scaled array in a frame whose columns carry a "_normalized" suffix.
normalized_df_climate = pd.DataFrame(
    normalized_values_climate,
    columns=cols_to_normalize_climate,
).add_suffix("_normalized")

# Keep the original columns and append the normalized ones side by side.
df_climate_normalized = pd.concat(
    [df_climate, normalized_df_climate],
    axis="columns",
)

df_climate_normalized.to_csv("climate_normalized_selected.csv", index=False)
print("Saved as climate_normalized_selected.csv")

