In [None]:
import pandas as pd
import numpy as np


# Read 'Air_Quality.csv' (path relative to the working directory) into a DataFrame.
df = pd.read_csv('Air_Quality.csv')

In [None]:
# Bare expression: the notebook renders the DataFrame as this cell's output.
df

Unnamed: 0,Unique ID,Indicator ID,Name,Measure,Measure Info,Geo Type Name,Geo Join ID,Geo Place Name,Time Period,Start_Date,Data Value,Message
0,216498,386,Ozone (O3),Mean,ppb,CD,313,Coney Island (CD13),Summer 2013,06/01/2013,34.64,
1,216499,386,Ozone (O3),Mean,ppb,CD,313,Coney Island (CD13),Summer 2014,06/01/2014,33.22,
2,219969,386,Ozone (O3),Mean,ppb,Borough,1,Bronx,Summer 2013,06/01/2013,31.25,
3,219970,386,Ozone (O3),Mean,ppb,Borough,1,Bronx,Summer 2014,06/01/2014,31.15,
4,164876,383,Sulfur Dioxide (SO2),Mean,ppb,CD,211,Morris Park and Bronxdale (CD11),Winter 2008-09,12/01/2008,5.89,
...,...,...,...,...,...,...,...,...,...,...,...,...
16117,671118,386,Ozone (O3),Mean,ppb,CD,306,Park Slope and Carroll Gardens (CD6),Summer 2020,06/01/2020,28.70,
16118,671119,386,Ozone (O3),Mean,ppb,CD,305,East New York and Starrett City (CD5),Summer 2020,06/01/2020,29.56,
16119,671120,386,Ozone (O3),Mean,ppb,CD,304,Bushwick (CD4),Summer 2020,06/01/2020,29.65,
16120,671121,386,Ozone (O3),Mean,ppb,CD,303,Bedford Stuyvesant (CD3),Summer 2020,06/01/2020,29.28,


In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16122 entries, 0 to 16121
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unique ID       16122 non-null  int64  
 1   Indicator ID    16122 non-null  int64  
 2   Name            16122 non-null  object 
 3   Measure         16122 non-null  object 
 4   Measure Info    16122 non-null  object 
 5   Geo Type Name   16122 non-null  object 
 6   Geo Join ID     16122 non-null  int64  
 7   Geo Place Name  16122 non-null  object 
 8   Time Period     16122 non-null  object 
 9   Start_Date      16122 non-null  object 
 10  Data Value      16122 non-null  float64
 11  Message         0 non-null      float64
dtypes: float64(2), int64(3), object(7)
memory usage: 1.5+ MB


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from datetime import datetime

# Load the dataset
df = pd.read_csv('Air_Quality.csv')

# Parse 'Start_Date' (MM/DD/YYYY text in the file); unparseable values become NaT.
df['Start_Date'] = pd.to_datetime(df['Start_Date'], format='%m/%d/%Y', errors='coerce')

# Keep only rows that have both a target value and a usable date.
df = df.dropna(subset=['Data Value', 'Start_Date'])

# Build one target value per calendar day: average every reading that shares a
# date, then fill days without readings from the nearest observed value
# (forward first, backward for any leading gap).
# NOTE(review): this pools every indicator present in the file into one mean;
# filter on 'Name' first if a single pollutant is intended.
daily_values = df.set_index('Start_Date')['Data Value'].resample('D').mean()
daily_values = daily_values.ffill().bfill()
df = daily_values.reset_index()

# Derive calendar features from each row's own date AFTER resampling.
# Computing them beforehand and forward-filling would give gap days the
# month/day of the last observed date instead of their own.
# No 'hour' feature: the dates carry no time of day, so it would be constant 0.
df['month'] = df['Start_Date'].dt.month
df['day'] = df['Start_Date'].dt.day

# Separate the features from the target ('Start_Date' itself is not a feature)
X = df.drop(columns=['Data Value', 'Start_Date'])
y = df['Data Value']

# Chronological train-test split: the last 20% of days form the test set.
# Shuffling would leak future observations into training and would hand the
# downstream time-series model an unordered series.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Standardize features using statistics from the training portion only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib
from statsmodels.tsa.arima.model import ARIMA

# ARIMA Model (For Univariate Time-Series Forecasting)
# ARIMA treats its input as consecutive observations in time. y_train comes
# out of train_test_split, which may have shuffled the rows; its integer index
# was assigned in date order by the daily resample, so sorting on it restores
# chronology. Resetting to a clean 0..n-1 index gives statsmodels an index it
# supports, so it is not silently ignored.
y_train_chronological = y_train.sort_index().reset_index(drop=True)
arima_spec = ARIMA(y_train_chronological, order=(5, 1, 0))
arima_model = arima_spec.fit()  # fitted results object, not the model spec

# Save the fitted ARIMA results
joblib.dump(arima_model, 'arima_model.pkl')

# LSTM Model (For Multi-Step Predictions)
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: number of features per time step.
        hidden_layer_size: width of the LSTM hidden state.
        output_size: number of values produced per input sequence.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_layer_size, output_size, num_layers):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size
        self.lstm = nn.LSTM(input_size, hidden_layer_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_layer_size, output_size)

    def forward(self, input_seq):
        """Map a batch of sequences to one prediction vector per sequence.

        Args:
            input_seq: tensor laid out as (batch, seq_len, input_size), since
                the LSTM is built with ``batch_first=True``.

        Returns:
            Tensor of shape (batch, output_size).
        """
        # Omitting the initial state makes nn.LSTM start from zeros that match
        # the input's dtype and device; hand-built torch.zeros tensors use the
        # default dtype and break if the model is cast to another precision.
        lstm_out, _ = self.lstm(input_seq)
        # Only the output at the final time step feeds the linear head.
        return self.linear(lstm_out[:, -1, :])

# Seed PyTorch so weight initialisation, and therefore the loss curve, is reproducible.
torch.manual_seed(42)

# Prepare data for LSTM: insert a length-1 sequence axis so the 2-D feature
# matrices become (samples, 1, features); targets become a column vector.
X_train_lstm = torch.tensor(X_train, dtype=torch.float32).unsqueeze(1)
y_train_lstm = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
X_test_lstm = torch.tensor(X_test, dtype=torch.float32).unsqueeze(1)

# Initialize LSTM Model (feature count is read from the data)
lstm_model = LSTMModel(input_size=X_train_lstm.shape[2], hidden_layer_size=50, output_size=1, num_layers=1)
criterion = nn.MSELoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)

# Train LSTM Model: full-batch gradient descent, one optimizer step per epoch
epochs = 100
lstm_model.train()  # set once; nothing inside the loop leaves training mode
for epoch in range(epochs):
    optimizer.zero_grad()
    y_pred = lstm_model(X_train_lstm)
    loss = criterion(y_pred, y_train_lstm)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f'Epoch {epoch} Loss: {loss.item()}')

# Evaluate on the held-out split so the training loss is not the only signal.
lstm_model.eval()
with torch.no_grad():
    y_test_pred = lstm_model(X_test_lstm).squeeze(1).numpy()
print(f'LSTM test MSE: {mean_squared_error(y_test, y_test_pred)}')
print(f'LSTM test MAE: {mean_absolute_error(y_test, y_test_pred)}')

# Save the LSTM model weights
torch.save(lstm_model.state_dict(), 'lstm_model.pth')

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Epoch 0 Loss: 572.4285278320312
Epoch 10 Loss: 570.1659545898438
Epoch 20 Loss: 567.7715454101562
Epoch 30 Loss: 565.1539916992188
Epoch 40 Loss: 562.20556640625
Epoch 50 Loss: 558.8118286132812
Epoch 60 Loss: 554.8638916015625
Epoch 70 Loss: 550.2589721679688
Epoch 80 Loss: 544.9114379882812
Epoch 90 Loss: 538.753662109375


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Define GAN components for data imputation and synthetic data generation
class Generator(nn.Module):
    """Fully connected network: input_dim -> 128 -> 256 -> output_dim.

    ReLU follows each of the two hidden layers; the final layer is linear.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        widths = (input_dim, 128, 256)
        stack = []
        # Hidden layers are created in order so that Sequential indices
        # (0, 2, 4 for the Linear layers) match the saved state_dict keys.
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.append(nn.Linear(widths[-1], output_dim))
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the raw output."""
        generated = self.model(x)
        return generated

class Discriminator(nn.Module):
    """Fully connected network: input_dim -> 256 -> 128 -> 1, ending in a sigmoid.

    ReLU follows each of the two hidden layers, so the output lies in (0, 1).
    """

    def __init__(self, input_dim):
        super().__init__()
        # Layer order fixes the Sequential indices used as state_dict keys.
        layers = [
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return one sigmoid score per row of ``x``."""
        score = self.model(x)
        return score

# Seed PyTorch so weight initialisation, batch shuffling and sampled noise
# (and therefore the synthetic rows) are reproducible.
torch.manual_seed(42)

# Hyperparameters
input_dim = X_train.shape[1]  # number of feature columns
output_dim = 1                # the generator emits one target value per row
lr = 0.0002
num_epochs = 200

# Initialize models. The discriminator scores a (features, target) row, hence
# its input width is input_dim + output_dim.
generator = Generator(input_dim, output_dim)
discriminator = Discriminator(input_dim + output_dim)

# Optimizers
optimizer_G = optim.Adam(generator.parameters(), lr=lr)
optimizer_D = optim.Adam(discriminator.parameters(), lr=lr)

# Loss function: binary cross-entropy on the discriminator's sigmoid output.
# This rebinds `criterion`, replacing any loss stored under that name earlier.
criterion = nn.BCELoss()

# Prepare data for GAN
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Training loop for GAN
for epoch in range(num_epochs):
    for real_data, real_labels in train_loader:
        batch_size = real_data.size(0)  # the final batch may be smaller than 64

        # Real rows: features joined with their true target, labelled 1
        real_labels = real_labels.view(-1, 1)
        real_input = torch.cat((real_data, real_labels), dim=1)
        real_target = torch.ones(batch_size, 1)

        # Fake rows: random-noise "features" joined with the generator's
        # output, labelled 0. The generator runs once per batch; the detached
        # copy feeds the discriminator step and the attached tensor is reused
        # for the generator step (its weights do not change in between).
        noise = torch.randn(batch_size, input_dim)
        generated_labels = generator(noise)
        fake_labels = generated_labels.detach()
        fake_input = torch.cat((noise, fake_labels), dim=1)
        fake_target = torch.zeros(batch_size, 1)

        # Train Discriminator
        optimizer_D.zero_grad()
        real_loss = criterion(discriminator(real_input), real_target)
        fake_loss = criterion(discriminator(fake_input), fake_target)
        d_loss = real_loss + fake_loss
        d_loss.backward()
        optimizer_D.step()

        # Train Generator: push the updated discriminator to score fakes as real
        optimizer_G.zero_grad()
        g_input = torch.cat((noise, generated_labels), dim=1)
        g_loss = criterion(discriminator(g_input), real_target)
        g_loss.backward()
        optimizer_G.step()

    # Losses reported are those of the last batch of the epoch.
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, D Loss: {d_loss.item()}, G Loss: {g_loss.item()}')

# Save the GAN models
torch.save(generator.state_dict(), 'gan_generator.pth')
torch.save(discriminator.state_dict(), 'gan_discriminator.pth')

# Generate synthetic data: one synthetic row per training row
with torch.no_grad():
    synthetic_noise = torch.randn(X_train.shape[0], input_dim)
    synthetic_labels = generator(synthetic_noise).numpy()
    synthetic_data = np.hstack((synthetic_noise.numpy(), synthetic_labels))

# Augment original data with synthetic data (last column is the target)
augmented_X_train = np.vstack((X_train, synthetic_data[:, :-1]))
augmented_y_train = np.hstack((y_train, synthetic_data[:, -1]))


Epoch 0, D Loss: 0.5882652401924133, G Loss: 1.0045017004013062
Epoch 10, D Loss: 0.56004798412323, G Loss: 3.001343011856079
Epoch 20, D Loss: 0.14574433863162994, G Loss: 6.081140518188477
Epoch 30, D Loss: 0.06749594956636429, G Loss: 12.923426628112793
Epoch 40, D Loss: 0.04541280120611191, G Loss: 15.385089874267578
Epoch 50, D Loss: 0.10260885953903198, G Loss: 23.122194290161133
Epoch 60, D Loss: 0.036485981196165085, G Loss: 29.038841247558594
Epoch 70, D Loss: 0.026543045416474342, G Loss: 29.49405860900879
Epoch 80, D Loss: 0.008256061002612114, G Loss: 42.79937744140625
Epoch 90, D Loss: 0.00861719623208046, G Loss: 37.5733528137207
Epoch 100, D Loss: 0.004618980456143618, G Loss: 41.394344329833984
Epoch 110, D Loss: 0.06812510639429092, G Loss: 54.72383499145508
Epoch 120, D Loss: 0.00271749310195446, G Loss: 48.9306526184082
Epoch 130, D Loss: 0.002388925291597843, G Loss: 72.4206314086914
Epoch 140, D Loss: 0.0044220262207090855, G Loss: 80.40792083740234
Epoch 150, D Lo