In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Load the dataset to examine its contents
file_path = './Data/Pollutants_Daily_Averages_with_AQI.csv'
data = pd.read_csv(file_path)

# DataFrame.info() prints its report and returns None, so it is called only
# for the printed output; storing its result would just put None in the display.
data.info()

# First rows and summary statistics, shown together as the cell output
data_head = data.head()
data_description = data.describe()

(data_head, data_description)


In [None]:
# Function to find outliers using IQR method
def find_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where ``Q1`` and
    ``Q3`` are the 25th and 75th percentiles of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    column : label
        Column whose values are tested.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` strictly below the lower fence or strictly above
        the upper fence. Rows whose value is NaN are never selected because
        both comparisons evaluate to False for NaN.
    """
    values = df[column]
    first_quartile, third_quartile = values.quantile([0.25, 0.75])
    spread = third_quartile - first_quartile
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread

    is_outlier = values.lt(low_fence) | values.gt(high_fence)
    return df[is_outlier]

# Collect the outlier rows of every pollutant column, keyed by column name
outliers_dict = {
    pollutant: find_outliers(data, pollutant)
    for pollutant in ['CO', 'NO2', 'NOX', 'O3', 'PM10', 'PM25', 'SO2']
}

# Number of outlier rows per pollutant; the bare name on the last line makes
# the dictionary the cell output
outliers_count = {name: len(rows) for name, rows in outliers_dict.items()}
outliers_count


In [None]:
# Function to cap the outliers with the IQR method boundaries
def cap_outliers(df, column):
    """Return a copy of ``df`` with ``column`` clipped to its IQR fences.

    Values below ``Q1 - 1.5 * IQR`` are raised to that bound and values above
    ``Q3 + 1.5 * IQR`` are lowered to it, where ``Q1``/``Q3`` are the 25th and
    75th percentiles of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is left untouched; the capping is applied to a copy so
        that callers (and earlier notebook cells) keep their original values.
    column : label
        Column to cap.

    Returns
    -------
    pandas.DataFrame
        A new frame identical to ``df`` except for the capped ``column``.
    """
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Work on a copy: assigning into ``df`` directly would silently change the
    # caller's frame in addition to returning it.
    capped = df.copy()
    capped[column] = capped[column].clip(lower=lower_bound, upper=upper_bound)
    return capped

# Pollutant columns that are capped and then Min-Max scaled
pollutants = ['CO', 'NO2', 'NOX', 'O3', 'PM10', 'PM25', 'SO2']

# Cap every pollutant column at its IQR boundaries
for pollutant in pollutants:
    data = cap_outliers(data, pollutant)

# Rescale the capped columns to the [0, 1] range with Min-Max scaling
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# fit_transform learns each column's min/max and writes the scaled values
# back into the same columns of `data`
data[pollutants] = scaler.fit_transform(data[pollutants])

# Inspect the frame after capping and normalization
data_describe_after_processing = data.describe()
data_head_after_processing = data.head()

(data_describe_after_processing, data_head_after_processing)


In [None]:
# Apply seaborn's "whitegrid" theme. set_theme changes the global plotting
# defaults, so every figure created after this call uses it.
sns.set_theme(style="whitegrid")

# Function to plot histograms and kernel density plots for each pollutant
def plot_histograms(data, pollutants, ncols=2):
    """Plot a histogram with a KDE overlay for each requested column.

    The subplot grid is sized from the number of columns, so no empty panel is
    drawn for an odd count and more than eight columns can be plotted.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to plot.
    pollutants : iterable of labels
        Column names to plot, one panel each, in the given order.
    ncols : int, optional
        Number of panels per row (default 2, matching the original layout).

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``. Nothing is drawn when
        ``pollutants`` is empty.
    """
    pollutants = list(pollutants)
    if not pollutants:
        return

    # Ceiling division: just enough rows to hold every requested panel
    nrows = (len(pollutants) + ncols - 1) // ncols

    # squeeze=False always yields a 2-D array of axes, so flatten() also works
    # for a single panel. 7 x 4 inches per panel reproduces the original
    # 14 x 16 figure for the default two-column, four-row layout.
    fig, axes = plt.subplots(
        nrows=nrows, ncols=ncols, figsize=(7 * ncols, 4 * nrows), squeeze=False
    )
    axes = axes.flatten()

    for ax, pollutant in zip(axes, pollutants):
        sns.histplot(data[pollutant], kde=True, ax=ax, color='skyblue')
        ax.set_title(f'Distribution of {pollutant}')
        ax.set_xlabel('Normalized Value')
        ax.set_ylabel('Frequency')

    # Grid cells without a column would otherwise show up as blank axes
    for unused_ax in axes[len(pollutants):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Draw one histogram + KDE panel for each column listed in `pollutants`
plot_histograms(data, pollutants)

# Summary statistics of `data` as it stands at this point. describe() is called
# on the whole frame, not only on the pollutant columns; the bare name on the
# last line renders the table as the cell output.
summary_statistics = data.describe()
summary_statistics


In [None]:
# Make sure 'DATE' is a datetime index. The guard keeps this cell re-runnable:
# once 'DATE' has been moved into the index it is no longer a column, and
# repeating the conversion would raise KeyError: 'DATE'.
if 'DATE' in data.columns:
    data['DATE'] = pd.to_datetime(data['DATE'])
    data = data.set_index('DATE')

# Monthly means, used for the trend plots below
# NOTE(review): pandas 2.2+ deprecates the 'M' alias in favour of 'ME'; switch
# once older pandas versions no longer need to be supported.
monthly_data = data.resample('M').mean()

numerical_columns = ['CO', 'NO2', 'NOX', 'O3', 'PM10', 'PM25', 'SO2', 'AQI']

# One stacked panel per column, all sharing the same date axis
fig, axes = plt.subplots(len(numerical_columns), 1, figsize=(15, 20), sharex=True)

for ax, col in zip(axes, numerical_columns):
    sns.lineplot(ax=ax, x=monthly_data.index, y=monthly_data[col])
    ax.set_title(f'Monthly Time Series of {col}', fontsize=14)
    # The pollutant columns were Min-Max scaled earlier in the notebook and AQI
    # is an index rather than a concentration, so label each panel accordingly.
    ax.set_ylabel('AQI' if col == 'AQI' else 'Normalized Value')

# Label the shared x-axis on the bottom panel explicitly instead of relying on
# pyplot's implicit "current axes"
axes[-1].set_xlabel('Date')
plt.tight_layout()
plt.show()
