In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Data Cleaning

##### Reading the csv file

In [2]:
# Reading a csv file
Geelong=pd.read_csv('Geelong.csv')
Geelong.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Geelong.csv'

##### Here we are dropping the columns that we don't need for the analysis

In [None]:
# Dropping unnecessary columns
Geelong=Geelong.drop(["hour","day","year","hod","dow","measurement_count","displayName","first_datetime","last_datetime"], axis=1)
# Checking the first few rows
Geelong.head()

Unnamed: 0,id,name,month,average,parameter,parameterId,unit
0,10869,Geelong South,2020-01-01,44.870594,pm10,1,µg/m³
1,10869,Geelong South,2020-01-01,24.428586,pm25,2,µg/m³
2,10869,Geelong South,2020-01-01,0.005555,no2,7,ppm
3,10869,Geelong South,2020-01-01,0.305779,co,8,ppm
4,10869,Geelong South,2020-01-01,0.000632,so2,9,ppm


##### Checking for duplicate values

In [None]:
# Check for duplicated rows
duplicates = Geelong[Geelong.duplicated(keep=False)]

# Display the duplicated rows
print(duplicates)

Empty DataFrame
Columns: [id, name, month, average, parameter, parameterId, unit]
Index: []


##### Modifying the months column

In [None]:
# Creating a dataframe
Geelong = pd.DataFrame(Geelong)

# Convert the 'month' column to datetime
Geelong['month']=pd.to_datetime(Geelong['month'])

# Apply strftime to the 'month' column to format it as "Month Year"
Geelong['formatted_month'] = Geelong['month'].dt.strftime('%B %Y')

# Checking on few rows
Geelong.head()

Unnamed: 0,id,name,month,average,parameter,parameterId,unit,formatted_month
0,10869,Geelong South,2020-01-01,44.870594,pm10,1,µg/m³,January 2020
1,10869,Geelong South,2020-01-01,24.428586,pm25,2,µg/m³,January 2020
2,10869,Geelong South,2020-01-01,0.005555,no2,7,ppm,January 2020
3,10869,Geelong South,2020-01-01,0.305779,co,8,ppm,January 2020
4,10869,Geelong South,2020-01-01,0.000632,so2,9,ppm,January 2020


In [None]:
# Removing the previous 'month' column
Geelong=Geelong.drop('month',axis=1)
Geelong.head()
  

Unnamed: 0,id,name,average,parameter,parameterId,unit,formatted_month
0,10869,Geelong South,44.870594,pm10,1,µg/m³,January 2020
1,10869,Geelong South,24.428586,pm25,2,µg/m³,January 2020
2,10869,Geelong South,0.005555,no2,7,ppm,January 2020
3,10869,Geelong South,0.305779,co,8,ppm,January 2020
4,10869,Geelong South,0.000632,so2,9,ppm,January 2020


In [None]:
# Splitting the formatted_month column to month and year
Geelong[['months','year']] = Geelong['formatted_month'].str.split(' ',expand=True)
Geelong 

# Removing the formatted_month column
Geelong = Geelong.drop('formatted_month',axis=1)
Geelong.head()

# Export the DataFrame to a CSV file
csv_file_path = 'Geelong_modified.csv'
Geelong.to_csv(csv_file_path, index=False)

#### Origanizing the dataframe in a way that the parameters are the columns

In [None]:
# Step 1: Assess data completeness
completeness = Geelong.groupby(['name', 'parameter'])['average'].count().unstack()
print("Data completeness (number of measurements):")
print(completeness)

# Calculate the percentage of missing data for each city-parameter combination
missing_percentage = (1 - completeness.divide(completeness.max())) * 100
print("\nPercentage of missing data:")
print(missing_percentage)

# Step 2: Filter out parameters or Geelong with too much missing data
threshold = 50
valid_parameters = missing_percentage.columns[missing_percentage.mean() < threshold]
valid_Geelong = missing_percentage.index[missing_percentage.mean(axis=1) < threshold]

# Filter the original dataset
Geelong_filtered = Geelong[
    (Geelong['name'].isin(valid_Geelong)) & 
    (Geelong['parameter'].isin(valid_parameters))
]

# Step 3: Handle remaining missing values
Geelong_pivoted = Geelong_filtered.pivot_table(
    index=['name', 'months', 'year'],
    columns='parameter',
    values='average'
).reset_index()

Geelong_cleaned = Geelong_pivoted.groupby('name').apply(
    lambda x: x.sort_values(['year', 'months']).ffill()
).reset_index(drop=True)

# Convert 'months' to datetime for proper sorting
Geelong_cleaned['date'] = pd.to_datetime(Geelong_cleaned['year'].astype(str) + ' ' + Geelong_cleaned['months'], format='%Y %B')
Geelong_cleaned = Geelong_cleaned.sort_values(['name', 'date'])

print("\nCleaned data shape:", Geelong_cleaned.shape)
print(Geelong_cleaned.head())

# Get the list of parameters
parameters = Geelong_cleaned.columns.drop(['name', 'months', 'year', 'date'])

# Export the resulting DataFrame to a CSV file
Geelong_cleaned.to_csv('cleaned_Geelong_data.csv', index=False)

Data completeness (number of measurements):
parameter      co  no2  o3  pm10  pm25  so2
name                                       
Geelong South  17   17  16    17    17   16

Percentage of missing data:
parameter       co  no2   o3  pm10  pm25  so2
name                                         
Geelong South  0.0  0.0  0.0   0.0   0.0  0.0

Cleaned data shape: (17, 10)
parameter           name    months  year        co       no2        o3  \
4          Geelong South   January  2020  0.305779  0.005555  0.023987   
3          Geelong South  February  2020  0.109537  0.003271  0.015526   
7          Geelong South     March  2020  0.110571  0.005414  0.017661   
0          Geelong South     April  2020  0.129982  0.005697  0.017437   
8          Geelong South       May  2020  0.182470  0.007664  0.015403   

parameter       pm10       pm25       so2       date  
4          44.870594  24.428586  0.000632 2020-01-01  
3          16.761081   5.659619  0.000282 2020-02-01  
7          19.994

  Geelong_cleaned = Geelong_pivoted.groupby('name').apply(


In [None]:
# Load the dataset
file_path = 'cleaned_Geelong_data.csv'  
df = pd.read_csv(file_path)

# Conversion factors (from ppm to µg/m³)
molar_masses = {
    'no2': 46.01,  # g/mol for NO2
    'so2': 64.07,  # g/mol for SO2
    'co': 28.01,   # g/mol for CO
    'o3': 48.00    # g/mol for O3
}

# Standard volume at STP in liters
V_m = 22.414

# Convert from ppm to µg/m³
for gas in molar_masses.keys():
    df[gas] = df[gas] * molar_masses[gas] * 1000 / V_m

# Save the converted dataset
df.to_csv('converted_g_city_data.csv', index=False)

print("Conversion completed and saved as 'converted_g_city_data.csv'")


Conversion completed and saved as 'converted_g_city_data.csv'


##### Calculating AQI

In [None]:
def calculate_aqi(concentration, breakpoints):
    for low, high, i_low, i_high in breakpoints:
        if low <= concentration <= high:
            return ((i_high - i_low) / (high - low)) * (concentration - low) + i_low
    return None

def calculate_overall_aqi(row, breakpoints):
    aqi_values = []
    for pollutant in ['pm25', 'pm10', 'o3', 'no2','so2','co']:
        if pd.notnull(row[pollutant]):
            aqi = calculate_aqi(row[pollutant], breakpoints[pollutant])
            if aqi is not None:
                aqi_values.append(aqi)
    return max(aqi_values) if aqi_values else None

# Load your CSV file
realD = pd.read_csv('converted_g_city_data.csv') 

# Define the breakpoints
# Define the breakpoints
breakpoints = {
    'pm10': [
        (0, 54, 0, 50),(55, 154, 51, 100),(155, 254, 101, 150),(255, 354, 151, 200),
        (355, 424, 201, 300),(425, 504, 301, 400),(505, 604, 401, 500)],
    'pm25': [
        (0.0, 12.0, 0, 50),(12.1, 35.4, 51, 100),(35.5, 55.4, 101, 150),(55.5, 150.4, 151, 200),
        (150.5, 250.4, 201, 300), (250.5, 350.4, 301, 400), (350.5, 500.4, 401, 500)],
    'no2': [
        (0, 53, 0, 50),(54, 100, 51, 100), (101, 360, 101, 150), (361, 649, 151, 200),
        (650, 1249, 201, 300),(1250, 1649, 301, 400),(1650, 2049, 401, 500)],
    'co': [ (0.0, 4.4, 0, 50), (4.5, 9.4, 51, 100), (9.5, 12.4, 101, 150),
        (12.5, 15.4, 151, 200),(15.5, 30.4, 201, 300),(30.5, 40.4, 301, 400),(40.5, 50.4, 401, 500)],
    'so2': [ (0, 35, 0, 50),(36, 75, 51, 100),(76, 185, 101, 150),(186, 304, 151, 200),
        (305, 604, 201, 300),(605, 804, 301, 400),(805, 1004, 401, 500)],
    'o3': [
        (0, 54, 0, 50),(55, 70, 51, 100),(71, 85, 101, 150),(86, 105, 151, 200),(106, 200, 201, 300),(201, 504, 301, 500)
    ]}


# Calculate AQI for each pollutant and overall AQI
for pollutant in ['pm25', 'pm10', 'o3', 'no2','so2','co']:
    realD[f'{pollutant}_aqi'] = realD[pollutant].apply(lambda x: calculate_aqi(x, breakpoints[pollutant]))

realD['overall_aqi'] = realD.apply(lambda row: calculate_overall_aqi(row, breakpoints), axis=1)

# Save the results
realD.to_csv('g_aqi_results.csv', index=False)

print(realD.head())  # Display the first few rows of the results

            name    months  year          co        no2         o3       pm10  \
0  Geelong South   January  2020  382.120947  11.403471  51.368301  44.870594   
1  Geelong South  February  2020  136.884835   6.714926  33.249185  16.761081   
2  Geelong South     March  2020  138.176736  11.112518  37.820873  19.994307   
3  Geelong South     April  2020  162.434382  11.694516  37.341007  15.455450   
4  Geelong South       May  2020  228.026930  15.732806  32.986780  14.324800   

        pm25       so2        date   pm25_aqi   pm10_aqi     o3_aqi  \
0  24.428586  1.805970  2020-01-01  76.927069  41.546846  47.563241   
1   5.659619  0.804863  2020-02-01  23.581745  15.519519  30.786282   
2   4.709389  1.677218  2020-03-01  19.622455  18.513248  35.019326   
3   6.191056  0.564660  2020-04-01  25.796066  14.310602  34.575006   
4   6.543255  1.408298  2020-05-01  27.263561  13.263704  30.543315   

     no2_aqi   so2_aqi co_aqi  overall_aqi  
0  10.757991  2.579957   None    76.92706

##### Rounding the values of the AQI

In [None]:
# Drop specified columns
rnew= pd.read_csv('g_aqi_results.csv')
rnew.drop(columns=['pm10', 'pm25','no2','o3','so2','co','co_aqi'], inplace=True, errors='ignore')

# Round the average column
rnew[['pm25_aqi','pm10_aqi','o3_aqi','no2_aqi','so2_aqi','overall_aqi']] = rnew[['pm25_aqi','pm10_aqi','o3_aqi','no2_aqi','so2_aqi','overall_aqi']].round()

# Export the resulting DataFrame to a CSV file
rnew.to_csv('g_aqii_results.csv', index=False)
rnew.head()

Unnamed: 0,name,months,year,date,pm25_aqi,pm10_aqi,o3_aqi,no2_aqi,so2_aqi,overall_aqi
0,Geelong South,January,2020,2020-01-01,77.0,42.0,48.0,11.0,3.0,77.0
1,Geelong South,February,2020,2020-02-01,24.0,16.0,31.0,6.0,1.0,31.0
2,Geelong South,March,2020,2020-03-01,20.0,19.0,35.0,10.0,2.0,35.0
3,Geelong South,April,2020,2020-04-01,26.0,14.0,35.0,11.0,1.0,35.0
4,Geelong South,May,2020,2020-05-01,27.0,13.0,31.0,15.0,2.0,31.0
