### Preprocessing


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
from scipy.stats import mstats
from matplotlib import pyplot as plt
import random
import tensorflow as tf
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
# Imports for Random Forest
from sklearn.ensemble import RandomForestRegressor

#Import for box plot
import matplotlib.pyplot as pl

from scipy.stats import pearsonr

# Imports for LSTM
from keras.models import Sequential

from keras.layers import LSTM, Dense

# Importing required libraries for evaluation metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Load the city-level daily readings and preview the first rows
data_path = '/content/city_day.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.0,0.02,0.0,,
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.5,3.77,,
2,Ahmedabad,2015-01-03,,,17.4,19.3,29.7,,17.4,29.07,30.7,6.8,16.4,2.25,,
3,Ahmedabad,2015-01-04,,,1.7,18.48,17.97,,1.7,18.59,36.08,4.43,10.14,1.0,,
4,Ahmedabad,2015-01-05,,,22.1,21.42,37.76,,22.1,39.33,39.31,7.01,18.89,2.78,,


In [None]:
#Removing Outliers
def remove_outliers_iqr(df, columns, multiplier=4):
    """Drop, in place, every row that holds an IQR outlier in any of ``columns``.

    Columns are screened one after another, so the quartiles of each column are
    computed on the rows that survived the previous columns. Missing values are
    never treated as outliers. ``df`` itself is modified; nothing is returned.

    Args:
        df: DataFrame to filter in place.
        columns: Column labels to screen.
        multiplier: Number of IQRs beyond the quartiles a value may lie before
            its row is dropped.
    """
    for name in columns:
        values = df[name]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        margin = (third_quartile - first_quartile) * multiplier

        # Comparisons against NaN are False, so rows with missing values are kept
        too_low = values < first_quartile - margin
        too_high = values > third_quartile + margin

        df.drop(df.index[too_low | too_high], inplace=True)

# Drop rows containing an IQR outlier in any pollutant column (default multiplier)
pollutant_columns = [
    'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3',
    'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene',
]
remove_outliers_iqr(df, columns=pollutant_columns)

In [None]:
# Handling Null values of numerical data using the column mean.
# The filled columns are assigned back to `df` instead of calling
# `df[col].fillna(..., inplace=True)`: that form is chained in-place assignment,
# which pandas warns about and which leaves `df` unchanged under Copy-on-Write.
numeric_columns = [
    'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO',
    'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene', 'AQI',
]
column_means = df[numeric_columns].mean()
df[numeric_columns] = df[numeric_columns].fillna(column_means)
# NOTE(review): AQI is used as the regression target further down; mean-imputing
# it creates synthetic labels. Consider dropping rows with a missing AQI instead.

# Handling Null labels of categorical data with the mode (per column)
df.fillna(df.mode().iloc[0], inplace=True)

# Checking Null values again
print(df.isnull().sum())

City          0
Date          0
PM2.5         0
PM10          0
NO            0
NO2           0
NOx           0
NH3           0
CO            0
SO2           0
O3            0
Benzene       0
Toluene       0
Xylene        0
AQI           0
AQI_Bucket    0
dtype: int64


In [None]:
# AQI_Bucket is not needed for regression; City and Date are removed as well
df.drop(columns=['AQI_Bucket', 'City', 'Date'], inplace=True)

In [None]:
# Model inputs are five pollutant columns; the target is AQI
feature_columns = ['PM2.5', 'NO2', 'CO', 'SO2', 'O3']
X = df[feature_columns]
y = df['AQI']


In [None]:
# Dimensions (rows, columns) of the frame after cleaning and column drops
df.shape

(24579, 13)

In [None]:
# Input features (five pollutant columns) and regression target (AQI)
feature_columns = ['PM2.5', 'NO2', 'CO', 'SO2', 'O3']
target_column = 'AQI'
X = df[feature_columns]
y = df[target_column]

### Without any transformation or standard scaling

In [None]:
# Set seeds for reproducibility.
# Seeding NumPy alone does not make Keras training repeatable: depending on the
# Keras version, weight initialisation and batch shuffling draw from TensorFlow's
# and/or Python's generators, so all three are seeded (best effort; some GPU
# kernels may still be non-deterministic).
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# Define the training function
def train_model(X_train, y_train, X_test, y_test,
                units=50, epochs=160, batch_size=32, validation_split=0.1):
    """Fit a single-layer LSTM regressor and score it on the test split.

    Args:
        X_train: Training windows, shape (samples, timesteps, features).
        y_train: Training targets, one value per window.
        X_test: Test windows, same trailing shape as ``X_train``.
        y_test: Test targets, one value per window.
        units: Number of LSTM units.
        epochs: Number of training epochs.
        batch_size: Mini-batch size used during fitting.
        validation_split: Fraction of the training data Keras holds out for
            validation while fitting.

    Returns:
        Tuple ``(rmse, mse, mae, r2)`` computed on the test split, expressed in
        whatever scale ``y_test`` is given in.
    """
    # Define the LSTM model: one recurrent layer feeding a single linear output
    model = Sequential([
        LSTM(units=units, return_sequences=False,
             input_shape=(X_train.shape[1], X_train.shape[2])),
        Dense(1)
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Train the model
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
              validation_split=validation_split, verbose=0)

    # Make predictions
    predictions = model.predict(X_test)

    # Calculate performance metrics
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    # Return the computed metrics
    return rmse, mse, mae, r2

# Define the create_sequences function
def create_sequences(input_data, target_data, window_size):
    """Slice the data into overlapping windows paired with the next target.

    Window ``k`` holds rows ``k`` .. ``k + window_size - 1`` of ``input_data``;
    its label is ``target_data[k + window_size]``.

    Args:
        input_data: Row-indexable feature data.
        target_data: Target values aligned with the rows of ``input_data``.
        window_size: Number of consecutive rows per window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays holding the windows and their labels.
    """
    n_windows = len(input_data) - window_size
    windows = [input_data[start:start + window_size] for start in range(n_windows)]
    labels = [target_data[start + window_size] for start in range(n_windows)]
    return np.array(windows), np.array(labels)



# Train one model per look-back length on the untransformed data
window_sizes = [3, 5, 7, 10]
performance_metrics = {}
metric_names = ('RMSE', 'MSE', 'MAE', 'R2')

for window_size in window_sizes:
    # Build the windows for this look-back length
    X_seq, y_seq = create_sequences(X.values, y.values, window_size)

    # Random 80/20 split of the windows.
    # NOTE(review): the windows overlap and the split shuffles them, so test
    # windows share rows with training windows; scores may be optimistic.
    X_train, X_test, y_train, y_test = train_test_split(
        X_seq, y_seq, test_size=0.2, random_state=42
    )

    # Fit the LSTM and collect its test-set scores
    rmse, mse, mae, r2 = train_model(X_train, y_train, X_test, y_test)
    performance_metrics[window_size] = dict(zip(metric_names, (rmse, mse, mae, r2)))

# Report the scores obtained for every look-back length
for window_size, metrics in performance_metrics.items():
    print(f"Window Size: {window_size}, Metrics: {metrics}")

Window Size: 3, Metrics: {'RMSE': 33.48439520540997, 'MSE': 1121.204722272082, 'MAE': 22.23968760120633, 'R2': 0.8122541636253643}
Window Size: 5, Metrics: {'RMSE': 34.2998302425538, 'MSE': 1176.4783546680087, 'MAE': 21.43313436841901, 'R2': 0.7985676496241826}
Window Size: 7, Metrics: {'RMSE': 36.404144188545075, 'MSE': 1325.2617141003802, 'MAE': 21.385329030873052, 'R2': 0.7805496028591983}
Window Size: 10, Metrics: {'RMSE': 34.72738384711771, 'MSE': 1205.991188865052, 'MAE': 21.550136576085674, 'R2': 0.797201244356484}


### Using StandardScaler

In [None]:
# Set seeds for reproducibility.
# Seeding NumPy alone does not make Keras training repeatable: depending on the
# Keras version, weight initialisation and batch shuffling draw from TensorFlow's
# and/or Python's generators, so all three are seeded (best effort; some GPU
# kernels may still be non-deterministic).
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# Training function
def train_model(X_train, y_train, X_test, y_test,
                units=50, epochs=160, batch_size=32, validation_split=0.1):
    """Fit a single-layer LSTM regressor and score it on the test split.

    Args:
        X_train: Training windows, shape (samples, timesteps, features).
        y_train: Training targets, one value per window.
        X_test: Test windows, same trailing shape as ``X_train``.
        y_test: Test targets, one value per window.
        units: Number of LSTM units.
        epochs: Number of training epochs.
        batch_size: Mini-batch size used during fitting.
        validation_split: Fraction of the training data Keras holds out for
            validation while fitting.

    Returns:
        Tuple ``(rmse, mse, mae, r2)`` computed on the test split, expressed in
        whatever scale ``y_test`` is given in.
    """
    # Define the LSTM model: one recurrent layer feeding a single linear output
    model = Sequential([
        LSTM(units=units, return_sequences=False,
             input_shape=(X_train.shape[1], X_train.shape[2])),
        Dense(1)
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Train the model
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
              validation_split=validation_split, verbose=0)

    # Make predictions
    predictions = model.predict(X_test)

    # Calculate performance metrics
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    # Return the computed metrics
    return rmse, mse, mae, r2

# Define the create_sequences function with standard scaling
def create_sequences(input_data, target_data, window_size, scale=True):
    """Build sliding windows and, optionally, standardise them.

    Args:
        input_data: 2-D array of features, one row per observation.
        target_data: Target values aligned with the rows of ``input_data``.
        window_size: Number of consecutive rows per window.
        scale: When True (the default, and the original behaviour) X and y are
            standardised with scalers fitted on *all* windows. Pass False to get
            the raw windows so that scalers can be fitted on the training split
            only, which avoids leaking test statistics into training.

    Returns:
        Tuple ``(X, y)``: ``X`` has shape (windows, window_size, features) and
        ``y`` has shape (windows,).
    """
    X, y = [], []
    for i in range(len(input_data) - window_size):
        X.append(input_data[i:(i + window_size)])
        y.append(target_data[i + window_size])
    X = np.array(X)
    y = np.array(y)

    if not scale:
        return X, y

    # Reshape for StandardScaler
    num_samples, window_length, num_features = X.shape
    X = X.reshape((num_samples, -1))  # Flatten to 2D array
    y = y.reshape(-1, 1)  # Convert to 2D array

    # Apply standard scaling to X and y
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y)

    # Reshape back to original shape
    X_scaled = X_scaled.reshape((num_samples, window_length, num_features))
    y_scaled = y_scaled.reshape(-1)

    return X_scaled, y_scaled


def _standardize_split(X_train, X_test, y_train, y_test):
    """Standardise both splits using statistics learned from the training split only."""
    window_shape = X_train.shape[1:]
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()

    # StandardScaler expects 2-D input, so windows are flattened and restored
    X_train_scaled = scaler_X.fit_transform(
        X_train.reshape(len(X_train), -1)).reshape((-1,) + window_shape)
    X_test_scaled = scaler_X.transform(
        X_test.reshape(len(X_test), -1)).reshape((-1,) + window_shape)

    y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).reshape(-1)
    y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1)).reshape(-1)

    return X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled


# Train one model per look-back length on standardised data
window_sizes = [3, 5, 7, 10]
performance_metrics = {}

for window_size in window_sizes:
    # Create the raw windows; scaling happens after the split (see below)
    X_seq, y_seq = create_sequences(X.values, y.values, window_size, scale=False)

    # Split the data into training and testing sets.
    # NOTE(review): the windows overlap and the split shuffles them, so test
    # windows share rows with training windows; scores may be optimistic.
    X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)

    # Fit the scalers on the training split only so that no test-set statistics
    # leak into training, then apply them to both splits
    X_train, X_test, y_train, y_test = _standardize_split(X_train, X_test, y_train, y_test)

    # Train the model and calculate metrics (expressed in standardised AQI units)
    rmse, mse, mae, r2 = train_model(X_train, y_train, X_test, y_test)

    # Store the metrics for this window size
    performance_metrics[window_size] = {'RMSE': rmse, 'MSE': mse, 'MAE': mae, 'R2': r2}

# Display the performance metrics for each window size
for window_size, metrics in performance_metrics.items():
    print(f"Window Size: {window_size}, Metrics: {metrics}")

Window Size: 3, Metrics: {'RMSE': 0.5237585249846527, 'MSE': 0.274322992494099, 'MAE': 0.33736478204797304, 'R2': 0.7171470928107035}
Window Size: 5, Metrics: {'RMSE': 0.5613154074865326, 'MSE': 0.3150749866817722, 'MAE': 0.3610344242916195, 'R2': 0.667793909080328}
Window Size: 7, Metrics: {'RMSE': 0.5527878618546532, 'MSE': 0.3055744202138391, 'MAE': 0.33998712913330525, 'R2': 0.6883727347394778}
Window Size: 10, Metrics: {'RMSE': 0.5231393798080118, 'MSE': 0.2736748107059112, 'MAE': 0.3383151215428487, 'R2': 0.7165862325009428}


### Using Log Transformation

In [None]:
# Set seeds for reproducibility.
# Seeding NumPy alone does not make Keras training repeatable: depending on the
# Keras version, weight initialisation and batch shuffling draw from TensorFlow's
# and/or Python's generators, so all three are seeded (best effort; some GPU
# kernels may still be non-deterministic).
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# Training function
def train_model(X_train, y_train, X_test, y_test,
                units=50, epochs=160, batch_size=32, validation_split=0.1):
    """Fit a single-layer LSTM regressor and score it on the test split.

    Args:
        X_train: Training windows, shape (samples, timesteps, features).
        y_train: Training targets, one value per window.
        X_test: Test windows, same trailing shape as ``X_train``.
        y_test: Test targets, one value per window.
        units: Number of LSTM units.
        epochs: Number of training epochs.
        batch_size: Mini-batch size used during fitting.
        validation_split: Fraction of the training data Keras holds out for
            validation while fitting.

    Returns:
        Tuple ``(rmse, mse, mae, r2)`` computed on the test split, expressed in
        whatever scale ``y_test`` is given in.
    """
    # Define the LSTM model: one recurrent layer feeding a single linear output
    model = Sequential([
        LSTM(units=units, return_sequences=False,
             input_shape=(X_train.shape[1], X_train.shape[2])),
        Dense(1)
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Train the model
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
              validation_split=validation_split, verbose=0)

    # Make predictions
    predictions = model.predict(X_test)

    # Calculate performance metrics
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    # Return the computed metrics
    return rmse, mse, mae, r2

# Define the create_sequences function with log transformation for both X and y
def create_sequences(input_data, target_data, window_size, epsilon=1e-6):
    """Build sliding windows over log-transformed features and targets.

    Both arrays are transformed with ``log(value + epsilon)``. The transform is
    applied once to the whole array rather than once per window, so each row is
    logged a single time even though it appears in several windows.

    Args:
        input_data: Array-like of features, one row per observation.
        target_data: Array-like of targets aligned with the rows of ``input_data``.
        window_size: Number of consecutive rows per window.
        epsilon: Offset added before the logarithm so that zeros stay finite.
            Values at or below ``-epsilon`` still produce NaN / -inf.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: the log-scaled windows and, for each
        window, the log-scaled target of the row that follows it.
    """
    # np.asarray makes the indexing below positional even for pandas inputs
    log_inputs = np.log(np.asarray(input_data) + epsilon)
    log_targets = np.log(np.asarray(target_data) + epsilon)

    n_windows = len(log_inputs) - window_size
    X = [log_inputs[i:(i + window_size)] for i in range(n_windows)]
    y = [log_targets[i + window_size] for i in range(n_windows)]
    return np.array(X), np.array(y)

# Train one model per look-back length on log-transformed features and targets
window_sizes = [3, 5, 7, 10]
performance_metrics = {}
metric_names = ('RMSE', 'MSE', 'MAE', 'R2')

for window_size in window_sizes:
    # Build the log-scaled windows for this look-back length
    X_seq, y_seq = create_sequences(X.values, y.values, window_size)

    # Random 80/20 split of the windows.
    # NOTE(review): the windows overlap and the split shuffles them, so test
    # windows share rows with training windows; scores may be optimistic.
    X_train, X_test, y_train, y_test = train_test_split(
        X_seq, y_seq, test_size=0.2, random_state=42
    )

    # Fit the LSTM and collect its test-set scores (expressed in log-AQI units)
    rmse, mse, mae, r2 = train_model(X_train, y_train, X_test, y_test)
    performance_metrics[window_size] = dict(zip(metric_names, (rmse, mse, mae, r2)))

# Report the scores obtained for every look-back length
for window_size, metrics in performance_metrics.items():
    print(f"Window Size: {window_size}, Metrics: {metrics}")


Window Size: 3, Metrics: {'RMSE': 0.2525993244875409, 'MSE': 0.06380641873156198, 'MAE': 0.17861276858995623, 'R2': 0.7702492646470448}
Window Size: 5, Metrics: {'RMSE': 0.25615365831254105, 'MSE': 0.06561469666689804, 'MAE': 0.1781016781158622, 'R2': 0.7594022385050806}
Window Size: 7, Metrics: {'RMSE': 0.25225029046950104, 'MSE': 0.06363020904194763, 'MAE': 0.17352376870725159, 'R2': 0.7639721100484405}
Window Size: 10, Metrics: {'RMSE': 0.2591196346493323, 'MSE': 0.06714298506080345, 'MAE': 0.17930690534960975, 'R2': 0.7547558093636446}
