In [1]:
#Import libraries as needed
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [11]:
# Load madrid_2017.csv (comma-separated) into a DataFrame
df = pd.read_csv('madrid_2017.csv', sep=',')

In [12]:
# STEP 1: IMPUTE MISSING DATA
# Time-based interpolation is the imputation method used here.
#
# Pipeline: parse 'date' -> use it as the index (required by method='time')
# -> interpolate gaps in both directions -> restore 'date' as a regular column.
# NOTE(review): method='time' interpolates along the index in row order; this
# presumably expects rows sorted by 'date' — confirm against the CSV.
df = (
    df.assign(date=pd.to_datetime(df['date']))
      .set_index('date')
      .interpolate(method='time', limit_direction='both')
      .reset_index(drop=False)
)

In [None]:
# Columns to plot: everything except the first ('date') and the last column
pollutants = df.columns[1:-1]

# Size the grid from the number of columns (4 per row) so that a different
# column count neither overflows the grid nor leaves blank axes behind
n_cols = 4
n_rows = max(1, -(-len(pollutants) // n_cols))  # ceiling division
fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(16, 4 * n_rows), squeeze=False)
axes = axs.ravel()

# Plot a histogram for each column
for ax, pollutant in zip(axes, pollutants):
    # Keep only finite values so NaN/Inf cannot distort the range, the bin
    # count or the plotted distribution
    data = df[pollutant]
    data = data[np.isfinite(data)]

    ax.set_title(pollutant)
    if data.empty:
        # Nothing to draw for this column; leave the titled axes empty
        continue

    x_min, x_max = data.min(), data.max()

    # Bin width from Scott's rule: 3.5 * sigma / n^(1/3), computed on the same
    # filtered data that is plotted
    bin_width = 3.5 * data.std(ddof=0) / len(data) ** (1 / 3)
    # A constant column gives bin_width == 0 and a narrow range can give fewer
    # than one bin; always request at least one bin
    num_bins = max(1, int((x_max - x_min) / bin_width)) if bin_width > 0 else 1

    # Plot the histogram with kde
    sns.histplot(data, kde=True, bins=num_bins, color='#8C78F0', ax=ax)

    # Add a vertical line for the mean
    mean = data.mean()
    ax.axvline(mean, color='k', linestyle='dashed', linewidth=1)

    # Add vertical lines one standard deviation either side of the mean
    std = data.std()
    ax.axvline(mean + std, color='#8C78F0', linestyle='dashed', linewidth=1)
    ax.axvline(mean - std, color='#8C78F0', linestyle='dashed', linewidth=1)

    # Axis labels; histplot's default statistic is a count, not a density
    ax.set_xlabel('Value')
    ax.set_ylabel('Count')
    # Only pin the x-range when it is non-degenerate (a constant column would
    # otherwise request identical lower and upper limits)
    if x_min < x_max:
        ax.set_xlim([x_min, x_max])

# Hide any grid cells that have no column assigned
for ax in axes[len(pollutants):]:
    ax.set_visible(False)

# Adjust the spacing between subplots
fig.subplots_adjust(hspace=0.4, wspace=0.4)

# Show the plot
plt.show()

In [None]:
# Range (max - min) check

# np.ptp ("peak to peak") returns max - min along the given axis; axis=0
# yields one value per column of df.
# NOTE(review): df still contains the 'date' column at this point, so it is
# part of the computation — confirm that is intended.
ranges = np.ptp(df, axis=0)

# Print the ranges
print(ranges)

In [None]:
# Mean / median / standard deviation check for every column except the first
# ('date') and the last one; the three statistics are computed in a single pass
summary = df[df.columns[1:-1]].agg(['mean', 'median', 'std'])
for pollutant in summary.columns:
    mean, median, std = summary[pollutant]
    print(f"{pollutant}: mean={mean:.2f}, median={median:.2f}, std={std:.2f}")


In [None]:
# STEP 3: DEAL WITH OUTLIERS / NORMALIZE
# Split the 14 pollutant columns into two groups of 7 so each boxplot stays readable
df_top = df.loc[:, ['CO', 'EBE', 'NO_2', 'O_3', 'PM10', 'PM25', 'SO_2']]
df_bottom = df.loc[:, ['BEN', 'CH4', 'NMHC', 'NO', 'NOx', 'TCH', 'TOL']]

fig, axs = plt.subplots(2, figsize=(10, 8))

# One boxplot panel per column group, with tilted tick labels
for ax, frame in zip(axs, (df_top, df_bottom)):
    sns.boxplot(data=frame, palette='PuRd', ax=ax)
    ax.set_xticklabels(frame.columns, rotation=45)

plt.show()

In [None]:
# IQR OUTLIER REMOVAL

# Columns to screen: everything except the first ('date') and the last column
pollutants = df.columns[1:-1]
values = df[pollutants]

# Tukey fences: anything further than 1.5 * IQR outside the quartiles is an outlier
Q1, Q3 = values.quantile(0.25), values.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Show the number of outliers per column
outliers = values.lt(lower_bound) | values.gt(upper_bound)
print(outliers.sum())

# Blank out the outliers (they become NaN) while keeping every row
df[pollutants] = values.where(~outliers)

In [15]:
# Fill NaN values with time-based interpolation: index by 'date' (needed for
# method='time'), interpolate in both directions, then restore 'date' as a column
date_indexed = df.assign(date=pd.to_datetime(df['date'])).set_index('date')
df = date_indexed.interpolate(method='time', limit_direction='both').reset_index(drop=False)

In [None]:
# Count the remaining missing values in each column
nan_counts = df.isna().sum()
print(nan_counts)

In [50]:
# SCALE IF NECESSARY using the interquartile range (IQR) method
#for pollutant in pollutants:
#    col = df[pollutant]
#    scaler = RobustScaler(with_centering=False, quantile_range=(25.0, 75.0), copy=True, with_scaling=True)
#    df[pollutant] = scaler.fit_transform(col.values.reshape(-1,1))

In [None]:
# Preview the first 100 rows
df.iloc[:100]

In [None]:
# ADD AQI columns: a numeric index level plus a categorical value, one of ['Good', 'Fair', 'Moderate', 'Poor', 'Very Poor', 'Extremely Poor']

In [13]:
# Band edges per pollutant: seven ascending breakpoints delimit six bands, and
# band i spans [edge[i], edge[i+1]).
# NOTE(review): values look like concentration thresholds in the dataset's
# units — confirm the units and the source of these thresholds.
breakpoints = dict(
    PM25=[0, 10, 20, 25, 50, 75, 800],
    PM10=[0, 20, 40, 50, 100, 150, 1200],
    NO_2=[0, 40, 90, 120, 230, 340, 1000],
    O_3=[0, 50, 100, 130, 240, 380, 800],
    SO_2=[0, 100, 200, 350, 500, 750, 1250],
)

# Category labels, ordered from best to worst; label k belongs to band k (1-based)
categories = [
    'Good',
    'Fair',
    'Moderate',
    'Poor',
    'Very Poor',
    'Extremely Poor',
]

In [20]:
# Define a function to calculate the AQI index for a given pollutant and concentration
def calculate_index_level(pollutant, concentration):
    """Return the 1-based AQI index level of one pollutant reading.

    The level is the position of the band ``[edge[i], edge[i+1])`` of
    ``breakpoints[pollutant]`` that contains ``concentration``.

    Args:
        pollutant: Key into the module-level ``breakpoints`` dict.
        concentration: Measured value for that pollutant.

    Returns:
        int: 1 for the lowest band up to ``len(edges) - 1`` for the highest.
        Readings below the lowest edge map to 1; readings at or above the
        highest edge map to the highest level.

    Raises:
        KeyError: If ``pollutant`` has no entry in ``breakpoints``.
        ValueError: If ``concentration`` is missing (NaN/None). A missing
            reading previously fell through every band and was reported as
            the worst level.
    """
    if pd.isna(concentration):
        raise ValueError(
            f"Missing concentration for {pollutant!r}; impute or drop it before computing the AQI level"
        )

    breakpoints_list = breakpoints[pollutant]
    highest_level = len(breakpoints_list) - 1

    # A reading below the lowest edge (e.g. a slightly negative sensor value)
    # belongs to the best band, not the worst one
    if concentration < breakpoints_list[0]:
        return 1

    for i in range(highest_level):
        if concentration < breakpoints_list[i + 1]:
            return i + 1

    # At or beyond the highest edge: clamp to the highest level
    return highest_level

In [26]:
# Define a function to calculate the overall AQI index for a row of data
def calculate_aqi_index(row):
    """Return the overall AQI index level for one row.

    The overall level is the worst (largest) of the per-pollutant levels
    computed by ``calculate_index_level`` for PM25, PM10, NO_2, O_3 and SO_2.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by those
            five pollutant names.

    Returns:
        The maximum per-pollutant index level.
    """
    return max(
        calculate_index_level(name, row[name])
        for name in ('PM25', 'PM10', 'NO_2', 'O_3', 'SO_2')
    )

In [27]:
# Compute the overall AQI index level row by row, then store it as a new column
aqi_index = df.apply(calculate_aqi_index, axis=1)
df['AQI_Index'] = aqi_index

In [28]:
# Define a function to calculate the AQI category for a given index level
def calculate_aqi_category(index_level):
    """Return the category label for a 1-based AQI index level.

    Args:
        index_level: Level in ``1..len(categories)``.

    Returns:
        The entry of the module-level ``categories`` list at position
        ``index_level - 1``.

    Raises:
        IndexError: If ``index_level`` is outside ``1..len(categories)``.
            Without this check, level 0 or a negative level would silently
            wrap around to the end of the list via negative indexing.
    """
    if not 1 <= index_level <= len(categories):
        raise IndexError(
            f"AQI index level {index_level!r} is outside 1..{len(categories)}"
        )
    return categories[index_level - 1]

# Translate each row's AQI index level into its category label
df['AQI_Category'] = df['AQI_Index'].map(calculate_aqi_category)

In [30]:
# Drop the per-pollutant AQI index columns if they are present.
# errors='ignore' keeps this cell runnable when those columns do not exist;
# a plain drop would raise KeyError and stop a top-to-bottom run.
df = df.drop(
    columns=['PM25_AQI_Index', 'PM10_AQI_Index', 'NO_2_AQI_Index', 'O_3_AQI_Index', 'SO_2_AQI_Index'],
    errors='ignore',
)

In [None]:
# Preview the last 100 rows
df.iloc[-100:]

In [19]:

# Save the DataFrame without the AQI values to a CSV file in the current working directory.
# The AQI columns are dropped explicitly (and ignored if absent) so this file
# stays AQI-free no matter which cells ran before it; the relative path keeps
# the notebook runnable on machines other than the author's.
df.drop(columns=['AQI_Index', 'AQI_Category'], errors='ignore').to_csv('clean_iqr.csv', index=False)

In [34]:

# Save the DataFrame with AQI to a CSV file in the current working directory.
# A relative path is used so the notebook does not depend on one user's
# absolute Windows directory.
df.to_csv('clean_iqr_with_AQI.csv', index=False)