In [11]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from bigdl.chronos.autots import AutoTSTrainer
from bigdl.chronos.autots.model import TCN, LSTM, NBeats
from bigdl.chronos.data import TSDataset
import os

# Load and preprocess the air pollution data
def load_and_preprocess_data(file_path, target_features):
    """Read a CSV file and return it as a DataFrame indexed by its 'Date' column.

    The 'Date' column is parsed with the day-first format ``%d-%m-%Y %H:%M``.
    Values that do not parse become NaT and their rows are discarded before
    'Date' is moved into the index.

    Args:
        file_path: Path of the CSV file, passed straight to ``pd.read_csv``.
        target_features: Iterable of column names that must be present in
            the loaded data.

    Returns:
        The loaded DataFrame with 'Date' as its index.

    Raises:
        ValueError: If a name in ``target_features`` is not a column of the
            loaded data (reported for the first missing name encountered).
    """
    frame = pd.read_csv(file_path)

    # Unparseable dates are coerced to NaT so they can be filtered out below.
    parsed_dates = pd.to_datetime(
        frame['Date'],
        format='%d-%m-%Y %H:%M',
        dayfirst=True,
        errors='coerce',
    )
    frame['Date'] = parsed_dates

    # Discard rows without a usable timestamp, then index by timestamp.
    frame = frame.dropna(subset=['Date']).set_index('Date')

    # Every requested target must be an actual column.
    available = frame.columns
    for name in target_features:
        if name in available:
            continue
        raise ValueError(f"Target column '{name}' not found in the dataset.")

    return frame

# Define your features and target
target_features = ['PM2.5', 'PM10', 'RH', 'SR']
seq_len = 12  # You can adjust the sequence length here
train_ratio = 0.8  # Fraction of the rows used for training

# Load the dataset
file_path = r"E:/Q/Q_DATA/pm_sr.csv"
df = load_and_preprocess_data(file_path, target_features)

# Store the date for the test set
dates = df.index

# Scale the features using MinMaxScaler.
# The scaler is fitted on the training rows only, so the min/max of the test
# rows do not leak into the scaling; it is then applied to every row.
# Assumes the rows are in chronological order and that the split below takes
# the first `train_ratio` share of them — TODO confirm.
n_train_rows = max(1, int(len(df) * train_ratio))
scaler = MinMaxScaler()
scaler.fit(df.iloc[:n_train_rows])
scaled_df = pd.DataFrame(scaler.transform(df), index=df.index, columns=df.columns)

# Create Chronos TSDataset.
# 'Date' is the index of scaled_df (load_and_preprocess_data moves it there),
# so it is turned back into a regular column for dt_col='Date' to resolve.
# NOTE(review): the keyword names (feature_col, lookback, horizon) and the
# split() call are kept exactly as written; confirm them against the
# installed BigDL Chronos TSDataset API.
tsdata = TSDataset.from_pandas(df=scaled_df.reset_index(), dt_col='Date', target_col=target_features,
                               feature_col=scaled_df.columns.tolist(),
                               lookback=seq_len, horizon=1)

# Train-Test Split
train_tsdata, test_tsdata = tsdata.split(train_ratio=train_ratio)

# Get the test set dates after splitting.
# Sliced from an explicit start position: dates[-0:] would return every date
# when the test set is empty.
test_dates = dates[max(len(dates) - len(test_tsdata), 0):]

# Initialize AutoTSTrainer with LSTM as an example (can switch to TCN, NBeats, etc.)
# The LSTM's input and output widths are both set to the number of target columns.
n_targets = len(target_features)
lstm_model = LSTM(
    input_feature_num=n_targets,
    output_feature_num=n_targets,
    past_seq_len=seq_len,
    hidden_dim=64,
    layer_num=2,
    dropout=0.2,
)

trainer = AutoTSTrainer(
    model=lstm_model,
    search_alg="random",  # or use "bayesian"
    optimization_metric="mse",  # Metric used to pick the best model
    loss="mse",
    logs_dir="./chronos_logs",
    cpus_per_trial=4,
)

# Train the model; 10% of the training data is held out for validation
best_model = trainer.fit(train_tsdata, val_ratio=0.1, epochs=5)

# Test the model on the test dataset
y_pred = best_model.predict(test_tsdata)

n_targets = len(target_features)

# The scaler was fitted on every column of df, so scaler.inverse_transform()
# only accepts arrays with that many columns. The scaling is therefore undone
# per target column from the fitted parameters (X_scaled = X * scale_ + min_).
target_idx = df.columns.get_indexer(target_features)
target_min = scaler.min_[target_idx]
target_scale = scaler.scale_[target_idx]


def to_original_scale(values):
    """Return scaled target values in original units, one column per target."""
    return (values.reshape(-1, n_targets) - target_min) / target_scale


# Inverse transform to original scale
y_pred_rescaled = to_original_scale(y_pred)

# test_tsdata was built from the scaled frame, so its targets are rescaled as
# well before being compared with, or saved next to, the predictions.
# NOTE(review): get_target() is kept as written; confirm it exists in the
# installed BigDL Chronos version and returns one row per prediction.
y_true_rescaled = to_original_scale(test_tsdata.get_target())

# Evaluate the model (both arrays are now in original units)
mse = ((y_pred_rescaled - y_true_rescaled) ** 2).mean()
print(f"Mean Squared Error on test set: {mse}")

# Keep only as many dates as there are predictions.
# Assumes windowing may yield fewer samples than test rows and that the last
# prediction lines up with the last test date — TODO confirm.
n_predictions = len(y_pred_rescaled)
result_dates = test_dates[max(len(test_dates) - n_predictions, 0):]

# Prepare the actual and predicted values along with the dates
result_columns = {'Date': result_dates}  # Add the test set dates
result_columns.update(
    {f'Predicted {name}': y_pred_rescaled[:, i] for i, name in enumerate(target_features)}
)
result_columns.update(
    {f'Actual {name}': y_true_rescaled[:, i] for i, name in enumerate(target_features)}
)
results_df = pd.DataFrame(result_columns)

# Save results to CSV
results_dir = "E:/Q/RESULTS"
os.makedirs(results_dir, exist_ok=True)  # to_csv does not create missing directories
results_csv_path = os.path.join(results_dir, "predictions_vs_actuals_with_dates.csv")
results_df.to_csv(results_csv_path, index=False)
print(f"Predictions and actuals saved to {results_csv_path}")


ModuleNotFoundError: No module named 'bigdl'