# AutoEncoder Model
This model is not well suited to autocorrelated time-series data with lags, so the PM2.5 dataset may not fit it very well. Autoencoders are more commonly used in the image domain, but the encoder's output can still effectively represent the correlations between variables in a two-dimensional plane. When the encoding has more than two dimensions, it is reduced to 2-D with t-SNE, which is somewhat similar to PCA but has the advantage of capturing nonlinear relationships.

In [None]:
import torch
import numpy as np
import pandas as pd 
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
import torch.nn as nn
import random 
import os 
import torch.optim as optim
from tqdm import tqdm
import torch
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import textwrap
from sklearn.manifold import TSNE
# Run configuration: every path and hyperparameter used below lives in this dict.
args = dict(
    corpusFile="./ground_pm25.csv",  # CSV file holding the dataset
    gpu=0,  # index of the CUDA device to use
    epochs=100,  # number of training epochs
    input_size=8,  # number of input features
    encoding_size=2,  # width of the encoder output
    lr=0.0008,  # optimizer learning rate
    batch_size=64,  # samples per mini-batch
    useGPU=True,  # set to False to force CPU
    batch_first=True,  # whether the batch dimension comes first
    dropout=0.005,  # dropout probability
    model_name="AutoEncoder",  # used to build output file names
    output_size=1,  # width of the model output
)

# The checkpoint path is derived from the lower-cased model name.
args["save_file"] = f"model/{args['model_name'].lower()}.pth"

# Use the requested CUDA device only when CUDA is available and GPU use is enabled.
if torch.cuda.is_available() and args["useGPU"]:
    device = torch.device(f"cuda:{args['gpu']}")
else:
    device = torch.device("cpu")
args["device"] = device

# Echo the effective configuration, one "key: value" line per entry.
print("Configuration Parameters:")
print("\n".join(f"{name}: {value}" for name, value in args.items()))



### Load Data

In [33]:
class AutoEncoderDataset(Dataset):
    """Map-style dataset that pairs each feature row with its target row.

    Both inputs are converted to ``float32`` tensors when the dataset is built.
    """

    def __init__(self, X, Y):
        as_float = torch.float32
        self.X = torch.tensor(X, dtype=as_float)  # feature tensor
        self.Y = torch.tensor(Y, dtype=as_float)  # target tensor

    def __len__(self):
        """Number of samples, taken from the leading dimension of ``X``."""
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        features = self.X[idx]
        target = self.Y[idx]
        return features, target


def get_data(corpusFile, batch_size):
    """Load the CSV, min-max scale it, and build train/val/test DataLoaders.

    The file is read with its first column as the index. The ``pm25`` column
    is the target and every other column is a feature. Rows are split in
    file order: the first 80% for training, the last ``int(10%)`` for testing,
    and whatever lies in between for validation.

    Args:
        corpusFile: Path to the CSV file.
        batch_size: Batch size used for all three loaders.

    Returns:
        ``(train_loader, val_loader, test_loader, df_max, df_min)`` where
        ``df_max`` / ``df_min`` are the per-column maxima / minima of the raw
        (unscaled) frame.
    """
    df = pd.read_csv(corpusFile, index_col=0)

    # Per-column extrema of the raw data, returned so callers can undo scaling.
    df_max = df.max()
    df_min = df.min()

    # Split the frame into the target column and the feature columns.
    Y = df['pm25'].values
    X = df.drop('pm25', axis=1).values

    # NOTE(review): both scalers are fitted on the full dataset, so the
    # validation/test ranges influence the scaling of the training data.
    scaler = MinMaxScaler()
    scaler_Y = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    Y_scaled = scaler_Y.fit_transform(Y.reshape(-1, 1))  # scaler expects 2-D input

    # Split sizes; validation absorbs any rounding remainder.
    total_len = len(X_scaled)
    train_len = int(total_len * 0.8)
    test_size = int(0.1 * total_len)
    val_size = total_len - train_len - test_size
    val_end = train_len + val_size

    def _make_loader(features, targets, shuffle, drop_last):
        """Wrap one split in a DataLoader using the caller-supplied batch size."""
        return DataLoader(
            dataset=AutoEncoderDataset(features, targets),
            batch_size=batch_size,
            shuffle=shuffle,
            drop_last=drop_last,
        )

    # Training drops the trailing partial batch so every batch is full-sized
    # (BatchNorm1d in training mode cannot normalise a single-sample batch).
    train_loader = _make_loader(
        X_scaled[:train_len], Y_scaled[:train_len], shuffle=True, drop_last=True
    )
    # Evaluation loaders keep every sample: dropping the last partial batch
    # would skip data and can leave the loader empty on small splits.
    val_loader = _make_loader(
        X_scaled[train_len:val_end], Y_scaled[train_len:val_end],
        shuffle=False, drop_last=False
    )
    test_loader = _make_loader(
        X_scaled[val_end:], Y_scaled[val_end:], shuffle=False, drop_last=False
    )
    return train_loader, val_loader, test_loader, df_max, df_min


### Define Model

In [34]:
class AutoEncoder(nn.Module):
    """Fully connected encoder/decoder network.

    The encoder maps ``input_size`` features through 128 and 64 hidden units
    down to an ``encoding_size``-wide code (batch-normalised, then ReLU, so the
    code is non-negative). The decoder maps that code back up through 64 and
    128 units to ``output_size`` values passed through a Sigmoid, so outputs
    lie in [0, 1]. ``output_size`` is independent of ``input_size``.

    Args:
        input_size: Number of features per input row.
        output_size: Number of values produced per row by the decoder.
        encoding_size: Width of the encoder output.
        dropout: Dropout probability used once in the encoder and once in
            the decoder.
    """

    def __init__(self, input_size, output_size, encoding_size, dropout):
        super().__init__()
        # Layer order is kept exactly as before so existing checkpoints
        # (state_dict keys such as "encoder.0.weight") still load.
        self.encoder = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(True),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(True),
            nn.Linear(64, encoding_size),
            nn.BatchNorm1d(encoding_size),
            nn.ReLU(True),
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_size, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(True),
            nn.Dropout(dropout),
            nn.Linear(64, 128),
            nn.BatchNorm1d(128),
            nn.Linear(128, output_size),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Run ``x`` through the encoder and decoder.

        Returns:
            ``(encoded, decoded)``: the encoder output and the decoder output
            computed from it.
        """
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

### Train Model

In [None]:
#set random seed 
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also stores the seed in the ``PYTHONHASHSEED`` environment variable and
    switches cuDNN to deterministic mode with benchmarking disabled.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Apply the same seed to each generator in turn.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    # Favour repeatable cuDNN kernels over auto-tuned ones.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


#define train function
def train():
    """Train the AutoEncoder using the module-level ``args`` configuration.

    After every epoch the mean validation loss is computed, the epoch's
    average losses are appended to ``<model_name>_loss.txt``, and the model's
    ``state_dict`` is saved to ``args['save_file']`` whenever the validation
    loss improves on the best value seen so far.

    Raises:
        ValueError: If the training or validation loader yields no batches.
    """
    set_seed()  # fixed seeds so repeated runs are comparable

    # Build the model from the configured sizes and move it to the target device.
    model = AutoEncoder(
        input_size=args['input_size'],
        output_size=args['output_size'],
        encoding_size=args['encoding_size'],
        dropout=args['dropout'],
    )
    model.to(args['device'])

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])

    # get_data returns (train_loader, val_loader, test_loader, df_max, df_min);
    # the names must be unpacked in exactly that order.
    train_loader, val_loader, _test_loader, _df_max, _df_min = get_data(
        args['corpusFile'],
        args['batch_size'],
    )
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError(
            'Training and validation loaders must each yield at least one batch; '
            'the dataset is too small for the configured batch size.'
        )

    # Make sure the checkpoint directory exists before the first save.
    save_dir = os.path.dirname(args['save_file'])
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    log_path = f'{args["model_name"]}_loss.txt'

    best_val_loss = float('inf')
    best_epoch = None  # 1-based index of the epoch that produced the checkpoint

    for epoch in range(args['epochs']):
        model.train()  # enable dropout and batch-statistics in BatchNorm
        epoch_loss = 0
        train_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{args["epochs"]}', leave=False)

        for inputs, targets in train_bar:
            inputs, targets = inputs.to(args['device']), targets.to(args['device'])
            _, decoded = model(inputs)
            loss = criterion(decoded, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            train_bar.set_postfix(loss=loss.item())
        avg_train_loss = epoch_loss / len(train_loader)

        # Validation pass: evaluation mode, no gradient tracking.
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for val_inputs, val_targets in val_loader:
                val_inputs, val_targets = val_inputs.to(args['device']), val_targets.to(args['device'])
                _, val_decoded = model(val_inputs)
                val_loss += criterion(val_decoded, val_targets).item()
        avg_val_loss = val_loss / len(val_loader)

        if (epoch+1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{args["epochs"]}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

        # Log the per-batch average so the file matches the console output.
        with open(log_path, 'a') as f:
            f.write(f'Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}\n')

        # Checkpoint only when validation loss improves.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_epoch = epoch + 1
            torch.save({'state_dict': model.state_dict()}, args['save_file'])

    if best_epoch is not None:
        print(f'Best model saved at epoch {best_epoch}')
    
# Start training as soon as this cell runs; train() reads its settings from `args`.
train()


### Evaluation on the Test Dataset and TSNE

In [None]:
def val_model(args):
    """Load the saved checkpoint, encode the test split, and plot the codes.

    The encoder output for every test batch is collected and drawn as a 2-D
    scatter plot coloured by the (scaled) target value. When the encoding has
    more than two dimensions it is first reduced to two with t-SNE. The figure
    is saved to ``./plot/<model_name>_encoder_output.png`` and then shown.

    Args:
        args: Configuration dict providing ``input_size``, ``output_size``,
            ``encoding_size``, ``dropout``, ``device``, ``save_file``,
            ``corpusFile``, ``batch_size`` and ``model_name``.

    Raises:
        ValueError: If the test loader yields no batches.
    """
    model = AutoEncoder(
        input_size=args['input_size'],
        output_size=args['output_size'],
        encoding_size=args['encoding_size'],
        dropout=args['dropout']
    )
    model.to(args['device'])

    # Restore the weights written by training.
    checkpoint = torch.load(args['save_file'], map_location=args['device'])
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()

    # get_data returns exactly five values; only the test loader is needed here.
    _train_loader, _val_loader, test_loader, _df_max, _df_min = get_data(
        args['corpusFile'],
        args['batch_size']
    )

    all_encoded = []
    all_targets = []

    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs = inputs.to(args['device'])
            encoded, _ = model(inputs)
            all_encoded.append(encoded.cpu().numpy())
            all_targets.append(targets.cpu().numpy())

    if not all_encoded:
        raise ValueError('The test loader produced no batches; nothing to plot.')

    # Stack the per-batch arrays into one (samples, dims) array each.
    encoded_features = np.vstack(all_encoded)
    targets = np.vstack(all_targets)

    # Only project with t-SNE when the code is wider than the 2-D plot.
    if args['encoding_size'] > 2:
        tsne = TSNE(n_components=2, random_state=42)
        encoded_2d = tsne.fit_transform(encoded_features)
    else:
        encoded_2d = encoded_features

    plt.figure(figsize=(8, 6))
    scatter = plt.scatter(encoded_2d[:, 0], encoded_2d[:, 1], c=targets.squeeze(), cmap='viridis', alpha=0.5)
    plt.colorbar(scatter, label='PM2.5')
    plt.xlabel('Dimension 1')
    plt.ylabel('Dimension 2')
    plt.title('Encoder Output in 2D Space')
    plt.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)
    plt.tight_layout()

    save_plot_dir = './plot'
    os.makedirs(save_plot_dir, exist_ok=True)

    # Save before show(): showing may clear the current figure.
    plt.savefig(f'{save_plot_dir}/{args["model_name"]}_encoder_output.png', dpi=300)
    plt.show()


In [None]:

def create_table(data, title):
    """Render ``data`` as a styled matplotlib table and display it.

    ``data`` is turned into a DataFrame and transposed, the rows are
    renumbered from 1 under the index name ``Index``, and every cell is
    converted to text wrapped at 30 characters. The header row is drawn bold
    and blue on grey with top/bottom borders; remaining rows alternate
    between two background shades.

    Args:
        data: Anything accepted by ``pd.DataFrame``.
        title: Title drawn above the table.
    """
    max_width = 30  # wrap cell text at this many characters

    def wrap_text(text, width):
        """Break ``text`` into lines of at most ``width`` characters."""
        return '\n'.join(textwrap.wrap(text, width))

    df = pd.DataFrame(data).T
    df.reset_index(drop=True, inplace=True)
    df.index += 1  # 1-based row numbers
    df.index.name = 'Index'

    for col in df.columns:
        df[col] = df[col].apply(lambda x: wrap_text(str(x), max_width))

    fig, ax = plt.subplots(figsize=(15, 5), dpi=300)
    ax.set_title(title, fontsize=16, fontweight='bold', color='Black')
    ax.axis('tight')
    ax.axis('off')

    table = ax.table(cellText=df.values,
                     rowLabels=df.index,
                     colLabels=df.columns,
                     cellLoc='center',
                     loc='center',
                     rowLoc='center',
                     bbox=[0, 0, 1, 1])

    # Fix the font size instead of letting matplotlib shrink it to fit.
    table.auto_set_font_size(False)
    table.set_fontsize(12)

    for (row, col), cell in table.get_celld().items():
        cell.visible_edges = ''  # start with no borders on any cell
        if row == 0:
            # Header row: bold blue text on grey, with top and bottom rules.
            cell.get_text().set_fontweight('bold')
            cell.set_height(0.05)
            cell.visible_edges = 'BT'
            cell.set_edgecolor('black')
            cell.set_linewidth(1)
            cell.get_text().set_color('blue')
            cell.set_facecolor('#d3d3d3')
        elif col == 0:
            # NOTE(review): matplotlib appears to place row-label cells at
            # column -1, so this branch styles the first data column rather
            # than the index labels - confirm which was intended.
            cell.get_text().set_fontweight('bold')
            cell.set_facecolor('#ecf8fd')
        else:
            # Zebra striping for the remaining cells.
            if row % 2 == 1:
                cell.set_facecolor('#f9f9f9')
            else:
                cell.set_facecolor('#ffffff')

    plt.show()

# NOTE(review): create_table is defined above with two required parameters
# (data, title); this zero-argument call raises TypeError as written.
create_table()