In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
import os


TensorDataset: wraps one or more tensors as a dataset
DataLoader: loads the data in batches, optionally shuffled

The AirQualityUCI.csv dataset is a well-known dataset available from the UCI Machine Learning Repository. It contains sensor readings of air quality collected in an Italian city. Here is a full breakdown of what this dataset includes:

General Description
Dataset Name: AirQualityUCI

Source: UCI Machine Learning Repository

Data Collected: March 2004 – February 2005

Location: A road-level air quality sensor in an Italian city

Type: Multivariate time series

Instances (rows): ~9,358

Features (columns): 15

Data Type: Mainly floats, some date/time strings

Missing Values: Marked as -200, not NaN

In [None]:
import pandas as pd

# AirQualityUCI.csv is ';'-separated and writes decimals with a comma (e.g. "2,6").
# Declaring decimal="," makes those columns load as floats instead of strings.
df = pd.read_csv("/content/AirQualityUCI.csv", sep=";", decimal=",")
df.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,10/03/2004,18.00.00,26,1360.0,150.0,119,1046.0,166.0,1056.0,113.0,1692.0,1268.0,136,489,7578,,
1,10/03/2004,19.00.00,2,1292.0,112.0,94,955.0,103.0,1174.0,92.0,1559.0,972.0,133,477,7255,,
2,10/03/2004,20.00.00,22,1402.0,88.0,90,939.0,131.0,1140.0,114.0,1555.0,1074.0,119,540,7502,,
3,10/03/2004,21.00.00,22,1376.0,80.0,92,948.0,172.0,1092.0,122.0,1584.0,1203.0,110,600,7867,,
4,10/03/2004,22.00.00,16,1272.0,51.0,65,836.0,131.0,1205.0,116.0,1490.0,1110.0,112,596,7888,,


Check whether CUDA is available

In [None]:
# Seed PyTorch's global RNG for reproducibility, then pick the compute device:
# the GPU when CUDA is available, otherwise the CPU.
torch.manual_seed(42)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Show which device was selected ("cuda" or "cpu").
print(device)

cuda


seq_len = length of each time series (number of time steps)



output_dim = number of features generated at each time step

hidden_dim = hidden unit in lstm

input_dim = dimension of input noise

Tanh --> squashes the generator output into [-1, 1], the same range the data is scaled to

noise shape = (batch_size, seq_len, input_dim)

In [None]:
class Generator(nn.Module):
    """Two-layer LSTM generator that maps a noise sequence to a synthetic series.

    Args:
        input_dim: Number of noise features per time step.
        hidden_dim: Number of hidden units in each LSTM layer.
        output_dim: Number of features produced per time step.
        seq_len: Sequence length; stored on the module but not used by forward().
    """

    def __init__(self, input_dim, hidden_dim, output_dim, seq_len):
        super().__init__()
        self.seq_len = seq_len
        self.hidden_dim = hidden_dim

        # Sub-modules are created in the order lstm -> linear -> tanh so that
        # seeded weight initialisation and state_dict keys stay stable.
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
        )
        # Fully connected projection applied independently at every time step.
        self.linear = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        # Bounds every output value to [-1, 1].
        self.tanh = nn.Tanh()

    def forward(self, noise, hidden=None):
        """Generate a sequence from noise.

        Args:
            noise: Tensor of shape (batch_size, seq_len, input_dim).
            hidden: Optional initial LSTM state (h_0, c_0); defaults to zeros.

        Returns:
            Tuple (output, hidden): output has shape
            (batch_size, seq_len, output_dim) with values in [-1, 1];
            hidden is the final LSTM state.
        """
        features, state = self.lstm(noise, hidden)
        return self.tanh(self.linear(features)), state

In [None]:
# Discriminator with LSTM
class Discriminator(nn.Module):
    """Two-layer LSTM critic that scores a whole sequence as real or generated.

    Args:
        input_dim: Number of features per time step of the input sequence.
        hidden_dim: Number of hidden units in each LSTM layer.
        seq_len: Sequence length; stored on the module but not used by forward().
    """

    def __init__(self, input_dim, hidden_dim, seq_len):
        super().__init__()
        self.seq_len = seq_len
        self.hidden_dim = hidden_dim

        # Sub-modules are created in the order lstm -> linear -> sigmoid so that
        # seeded weight initialisation and state_dict keys stay stable.
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
        )
        self.linear = nn.Linear(in_features=hidden_dim, out_features=1)
        # Maps the score to a probability in (0, 1), not a hard 0/1 label.
        self.sigmoid = nn.Sigmoid()

    def forward(self, sequence, hidden=None):
        """Score a batch of sequences.

        Args:
            sequence: Tensor of shape (batch_size, seq_len, input_dim).
            hidden: Optional initial LSTM state (h_0, c_0); defaults to zeros.

        Returns:
            Tuple (output, hidden): output has shape (batch_size, 1) and holds
            the sigmoid score of each sequence; hidden is the final LSTM state.
        """
        per_step, state = self.lstm(sequence, hidden)
        # Only the last time step is scored: it has seen the whole sequence.
        final_step = per_step[:, -1, :]
        return self.sigmoid(self.linear(final_step)), state

In [None]:
# Load and preprocess UCI Air Quality Dataset
def load_air_quality_data(file_path, seq_len, feature_idx=7):  # column 7 is NOx(GT)
    """Load one column of AirQualityUCI.csv as overlapping, scaled windows.

    Steps: read the CSV, drop rows without a timestamp, treat -200 as missing
    and fill the gaps, scale the chosen column to [-1, 1], then cut it into
    sliding windows with stride 1.

    Args:
        file_path: Path to the semicolon-separated AirQualityUCI.csv file.
        seq_len: Number of consecutive readings per window (>= 1).
        feature_idx: Positional index of the column to use. In the standard
            file layout index 7 is ``NOx(GT)`` and index 8 is ``PT08.S3(NOx)``.

    Returns:
        Tuple ``(sequences, scaler)``: ``sequences`` is a float32 tensor of
        shape ``(n_rows - seq_len + 1, seq_len, 1)`` and ``scaler`` is the
        fitted ``MinMaxScaler`` needed to map values back to sensor units.

    Raises:
        FileNotFoundError: ``file_path`` does not exist.
        ValueError: Required columns are missing, timestamps do not match the
            expected format, ``seq_len`` is not positive, the chosen column
            has no valid reading, or there are fewer rows than ``seq_len``.
    """
    # Check if file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}. Please download AirQualityUCI.csv from https://archive.ics.uci.edu/ml/datasets/Air+Quality and upload to /content/.")

    if seq_len < 1:
        raise ValueError(f"seq_len must be a positive integer, got {seq_len}.")

    # The file uses ';' as separator and ',' as decimal mark; without
    # decimal=',' columns such as CO(GT) or T are read as strings.
    df = pd.read_csv(file_path, sep=';', decimal=',')

    # Print column names to diagnose issues
    print("Dataset columns:", df.columns.tolist())

    # Verify required columns
    required_columns = ['Date', 'Time']
    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}. Available columns: {df.columns.tolist()}")

    # The file ends with empty padding rows (";;;;..."). Drop them so they are
    # not forward-filled into fake constant readings.
    df = df.dropna(subset=required_columns).reset_index(drop=True)

    # Parsing the timestamps doubles as a format check (raises ValueError).
    df['DateTime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'], format='%d/%m/%Y %H.%M.%S')

    # -200 marks a missing reading. Forward-fill gaps, then back-fill so a
    # missing value in the very first rows does not survive as NaN.
    feature = df.iloc[:, feature_idx].replace(-200, np.nan).ffill().bfill()
    if len(feature) > 0 and feature.isna().any():
        raise ValueError(f"Column at index {feature_idx} contains no valid readings.")

    data = feature.to_numpy(dtype=float).reshape(-1, 1)
    if len(data) < seq_len:
        raise ValueError(
            f"Not enough rows ({len(data)}) to build a sequence of length {seq_len}."
        )

    # Normalize to [-1, 1] to match the generator's tanh output range
    scaler = MinMaxScaler(feature_range=(-1, 1))
    data = scaler.fit_transform(data)

    # Create sequences; the "+ 1" keeps the final full window as well
    n_windows = len(data) - seq_len + 1
    sequences = np.stack([data[start:start + seq_len] for start in range(n_windows)])

    return torch.tensor(sequences, dtype=torch.float32), scaler




In [None]:
# Hyperparameters
input_dim = 10        # Noise features fed to the generator at each time step
hidden_dim = 64       # LSTM hidden units
output_dim = 1        # Univariate output: one sensor column per time step
seq_len = 24          # Sequence length (24 hourly readings, i.e. one day)
batch_size = 64
num_epochs = 100     # Reduced for faster testing
lr = 0.0002           # Lower learning rate for stability


In [None]:
# Initialize models
# The discriminator's input width is output_dim because it scores the
# sequences the generator emits (and real data of the same width).
generator = Generator(input_dim, hidden_dim, output_dim, seq_len).to(device)
discriminator = Discriminator(output_dim, hidden_dim, seq_len).to(device)

In [None]:
# Optimizers
# One Adam optimizer per network so each is updated only in its own step.
g_optimizer = optim.Adam(generator.parameters(), lr=lr)
d_optimizer = optim.Adam(discriminator.parameters(), lr=lr)


In [None]:
# Loss function
# Binary cross-entropy; it expects probabilities, which the discriminator's
# final sigmoid provides.
criterion = nn.BCELoss()


In [None]:
# Load dataset
file_path = "/content/AirQualityUCI.csv"  # Path for Google Colab
# Loader errors (FileNotFoundError / ValueError) are deliberately not caught.
# The full traceback is shown and "Run All" stops at this cell, instead of
# exit() ending the session and hiding where the failure came from.
data_tensor, scaler = load_air_quality_data(file_path, seq_len)

Dataset columns: ['Date', 'Time', 'CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)', 'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH', 'AH', 'Unnamed: 15', 'Unnamed: 16']


  df.fillna(method='ffill', inplace=True)


In [None]:

# Wrap the windows in a dataset; each item is a 1-tuple holding one sequence.
dataset = TensorDataset(data_tensor)
# Shuffled mini-batches; the last batch may hold fewer than batch_size items.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Training loop
def train_rgan(log_every=50):
    """Adversarially train the module-level generator and discriminator.

    Reads these notebook globals: ``generator``, ``discriminator``,
    ``g_optimizer``, ``d_optimizer``, ``criterion``, ``dataloader``,
    ``device``, ``seq_len``, ``input_dim`` and ``num_epochs``.

    Each batch performs one discriminator step (real sequences labelled 1,
    generated sequences labelled 0) followed by one generator step (generated
    sequences labelled 1).

    Args:
        log_every: Print the losses of the epoch's last batch every
            ``log_every`` epochs; a falsy value disables logging.
    """
    print("Starting training...")

    # generate_samples() switches the generator to eval mode. Training must run
    # in train mode (cuDNN refuses LSTM backward passes in eval mode), so reset
    # both networks explicitly before the loop.
    generator.train()
    discriminator.train()

    for epoch in range(num_epochs):
        # Stay None when the dataloader yields no batches, so logging is skipped.
        d_loss = g_loss = None

        for batch in dataloader:
            # TensorDataset yields a 1-tuple; shape (batch_size, seq_len, output_dim)
            real_data = batch[0].to(device)
            # The last batch may be smaller than batch_size.
            batch_size_actual = real_data.size(0)

            # Create labels and noise directly on the target device.
            real_labels = torch.ones(batch_size_actual, 1, device=device)
            fake_labels = torch.zeros(batch_size_actual, 1, device=device)
            noise = torch.randn(batch_size_actual, seq_len, input_dim, device=device)

            # One generator forward pass serves both steps: the generator's
            # weights do not change until g_optimizer.step() below.
            fake_data, _ = generator(noise)

            # Train Discriminator
            d_optimizer.zero_grad()
            d_real_output, _ = discriminator(real_data)
            d_real_loss = criterion(d_real_output, real_labels)
            # detach(): do not back-propagate the discriminator loss into the generator
            d_fake_output, _ = discriminator(fake_data.detach())
            d_fake_loss = criterion(d_fake_output, fake_labels)

            d_loss = d_real_loss + d_fake_loss
            d_loss.backward()
            d_optimizer.step()

            # Train Generator: it succeeds when the updated discriminator
            # labels its samples as real.
            g_optimizer.zero_grad()
            g_output, _ = discriminator(fake_data)
            g_loss = criterion(g_output, real_labels)
            g_loss.backward()
            g_optimizer.step()

        if log_every and (epoch + 1) % log_every == 0 and d_loss is not None:
            print(f'Epoch [{epoch+1}/{num_epochs}], D Loss: {d_loss.item():.4f}, G Loss: {g_loss.item():.4f}')

num_samples = number of sequences to generate
seq_len = length of each generated time series

input_dim = dimension of the noise vector at each time step

In [None]:

# Generate samples
def generate_samples(generator, num_samples, seq_len, input_dim, scaler):
    """Draw synthetic sequences from the generator in original data units.

    Args:
        generator: Module whose forward pass returns ``(output, hidden)`` for
            noise of shape ``(num_samples, seq_len, input_dim)``.
        num_samples: Number of sequences to generate.
        seq_len: Number of time steps per sequence.
        input_dim: Number of noise features per time step.
        scaler: Fitted scaler exposing ``inverse_transform`` for 2-D arrays.

    Returns:
        NumPy array with the generator's output shape
        ``(num_samples, seq_len, n_features)``, mapped back through
        ``scaler.inverse_transform``.
    """
    # Sample noise on whatever device the generator lives on, rather than
    # relying on a notebook-level `device` global.
    first_param = next(generator.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Switch to eval mode only temporarily. Leaving the generator in eval mode
    # would break a later training run (cuDNN LSTM backward needs train mode).
    was_training = generator.training
    generator.eval()
    try:
        with torch.no_grad():
            noise = torch.randn(num_samples, seq_len, input_dim, device=target_device)
            samples, _ = generator(noise)
    finally:
        generator.train(was_training)

    samples = samples.cpu().numpy()
    out_shape = samples.shape

    # inverse_transform expects (n_rows, n_features). Use the width the scaler
    # was fitted on when it reports one, else the generator's feature width.
    n_features = getattr(scaler, "n_features_in_", out_shape[-1])
    restored = scaler.inverse_transform(samples.reshape(-1, n_features))

    # Restore the generator's own output shape (no `output_dim` global needed).
    return restored.reshape(out_shape)

In [None]:

# Run training
# Train the GAN, then draw 10 synthetic sequences; generate_samples maps them
# back to the original units through the fitted scaler.
if __name__ == "__main__":
    train_rgan()
    samples = generate_samples(generator, num_samples=10, seq_len=seq_len, input_dim=input_dim, scaler=scaler)
    print("Generated samples shape:", samples.shape)
    print("Example generated sequence (first sample):", samples[0])

Starting training...
Epoch [50/100], D Loss: 1.3866, G Loss: 0.6928
Epoch [100/100], D Loss: 1.3857, G Loss: 0.6924
Generated samples shape: (10, 24, 1)
Example generated sequence (first sample): [[ 622.0374   ]
 [ 569.3809   ]
 [ 258.84546  ]
 [ 158.6298   ]
 [ 112.717766 ]
 [ 116.25985  ]
 [ 138.93507  ]
 [ 122.04822  ]
 [ 133.93031  ]
 [ 145.84433  ]
 [ 153.16765  ]
 [ 168.14835  ]
 [ 212.22514  ]
 [ 316.67538  ]
 [ 568.2168   ]
 [ 907.2803   ]
 [1136.35     ]
 [ 758.1364   ]
 [  47.51143  ]
 [   4.286781 ]
 [   2.314729 ]
 [   2.055903 ]
 [   2.0208204]
 [   2.01972  ]]
