In [1]:
import pandas as pd
import json

In [2]:
# Load the raw OpenAQ export. JSON is UTF-8 by specification and the payload
# contains non-ASCII text (e.g. the 'µg/m³' unit), so the encoding is stated
# explicitly instead of being left to the platform default.
with open('openaq_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# The measurement records sit under the top-level 'results' key.
df = pd.DataFrame(data['results'])

In [3]:
# Summarise column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   location     10000 non-null  object 
 1   parameter    10000 non-null  object 
 2   value        10000 non-null  float64
 3   date         10000 non-null  object 
 4   unit         10000 non-null  object 
 5   coordinates  10000 non-null  object 
 6   country      10000 non-null  object 
 7   city         0 non-null      object 
dtypes: float64(1), object(7)
memory usage: 625.1+ KB


In [4]:
# Preview the first rows to see the raw record layout (nested date/coordinates dicts).
df.head()

Unnamed: 0,location,parameter,value,date,unit,coordinates,country,city
0,LANSING,nox,0.0025,"{'utc': '2024-04-25T19:00:00+00:00', 'local': ...",ppm,"{'latitude': 42.76138, 'longitude': -84.562867}",US,
1,Mooroolbark,pm10,6.53,"{'utc': '2024-04-25T19:00:00+00:00', 'local': ...",µg/m³,"{'latitude': -37.77512, 'longitude': 145.3284}",AU,
2,Footscray,pm25,1.22,"{'utc': '2024-04-25T19:00:00+00:00', 'local': ...",µg/m³,"{'latitude': -37.80266, 'longitude': 144.8778}",AU,
3,Footscray,pm10,6.16,"{'utc': '2024-04-25T19:00:00+00:00', 'local': ...",µg/m³,"{'latitude': -37.80266, 'longitude': 144.8778}",AU,
4,Mooroolbark,pm25,5.45,"{'utc': '2024-04-25T19:00:00+00:00', 'local': ...",µg/m³,"{'latitude': -37.77512, 'longitude': 145.3284}",AU,


In [5]:
# The source `city` column is entirely empty (0 non-null in the df.info()
# output), so it is replaced by `location`, which carries names such as
# 'Footscray' in the sample rows.
# The guard keeps the cell safe to re-run: without it, a second execution
# would drop the column that was just renamed to `city`.
if 'location' in df.columns:
    df = df.drop(columns=['city']).rename(columns={'location': 'city'})

In [6]:
# Discard rows whose city name contains a run of uppercase letters immediately
# followed by digits (presumably sensor/station codes rather than place
# names -- confirm against the data).
# `.copy()` detaches the result so that later column assignments on `df`
# operate on an independent frame rather than on a slice of the original.
looks_like_code = df['city'].str.contains(r'[A-Z]+[0-9]+')
df = df[~looks_like_code].copy()

In [7]:
# Pull each coordinate out of the per-row dict with a plain element-wise map.
# Building a pd.Series for every row just to unpack two keys is needlessly
# slow; a missing key still raises KeyError exactly as before.
df['latitude'] = df['coordinates'].map(lambda point: point['latitude'])
df['longitude'] = df['coordinates'].map(lambda point: point['longitude'])

In [8]:
# The nested coordinate dicts are redundant once latitude/longitude exist as
# their own columns. The name is rebound instead of using inplace=True, so
# the cell never mutates a frame that other names may still reference.
df = df.drop(columns=['coordinates'])

In [9]:
# The timestamp dicts are not used by any of the cells shown here, so the
# `date` column is discarded. The name is rebound instead of using
# inplace=True, so no shared frame is mutated.
df = df.drop(columns=['date'])

In [10]:
# Preview the frame after the column clean-up.
df.head()

Unnamed: 0,city,parameter,value,unit,country,latitude,longitude
0,LANSING,nox,0.0025,ppm,US,42.76138,-84.562867
1,Mooroolbark,pm10,6.53,µg/m³,AU,-37.77512,145.3284
2,Footscray,pm25,1.22,µg/m³,AU,-37.80266,144.8778
3,Footscray,pm10,6.16,µg/m³,AU,-37.80266,144.8778
4,Mooroolbark,pm25,5.45,µg/m³,AU,-37.77512,145.3284


In [11]:
# List the distinct measurement parameters present in the data.
df["parameter"].unique()

array(['nox', 'pm10', 'pm25', 'o3', 'so2', 'um003', 'pm1',
       'relativehumidity', 'no', 'no2', 'co', 'bc', 'temperature'],
      dtype=object)

In [12]:
# Count readings per parameter to see which ones are sparsely represented.
df['parameter'].value_counts()

parameter
no2                 1857
pm25                1729
so2                 1286
pm10                1259
o3                   925
nox                  733
no                   722
co                   706
pm1                  145
relativehumidity     144
um003                141
temperature          130
bc                     2
Name: count, dtype: int64

In [13]:
# Filter the DataFrame: remove the parameters that are not given a quality label

In [14]:
# Keep only the parameters that label_parameter knows how to classify.
# Selecting with a boolean mask (rather than dropping by index label) stays
# correct even if index labels repeat, and `.copy()` yields an independent
# frame so that adding columns to it later is unambiguous.
EXCLUDED_PARAMETERS = ['pm1', 'relativehumidity', 'um003', 'temperature', 'bc']
filtered_df = df[~df['parameter'].isin(EXCLUDED_PARAMETERS)].copy()

In [15]:
# Confirm that only the retained parameters remain after filtering.
filtered_df['parameter'].value_counts()

parameter
no2     1857
pm25    1729
so2     1286
pm10    1259
o3       925
nox      733
no       722
co       706
Name: count, dtype: int64

In [16]:
# Transformation: derive a categorical air-quality label for each reading

In [17]:
# Exclusive upper bounds of the 'Good' and 'Moderate' bands for every
# parameter that receives a label; a reading at or above the second bound is
# 'Unhealthy'.
# NOTE(review): the bounds are compared against the raw `value` without
# consulting the `unit` column, yet the sample rows show nox reported in ppm
# and the particulates in µg/m³ -- confirm each bound matches the unit the
# data is actually delivered in.
_QUALITY_THRESHOLDS = {
    'nox': (40, 80),
    'pm25': (12, 35.4),
    'pm10': (50, 150),
    'no2': (40, 80),
    'so2': (40, 80),
    'o3': (50, 100),
    'no': (40, 100),
    'co': (5, 10),
}


def label_parameter(row):
    """Classify one measurement as 'Good', 'Moderate' or 'Unhealthy'.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must provide ``row['value']`` (the numeric reading) and
        ``row['parameter']`` (the parameter name).

    Returns
    -------
    str or None
        The quality label, or ``None`` when the parameter has no thresholds
        defined or when the reading is missing (None/NaN).
    """
    value = row['value']
    parameter = row['parameter']

    bounds = _QUALITY_THRESHOLDS.get(parameter)
    if bounds is None:
        # Unknown parameter: no label, matching the implicit None of the
        # original if/elif chain.
        return None

    # A NaN reading fails every `<` comparison and would otherwise fall
    # through to 'Unhealthy'; a missing measurement must stay unlabelled.
    if pd.isna(value):
        return None

    good_below, moderate_below = bounds
    if value < good_below:
        return 'Good'
    if value < moderate_below:
        return 'Moderate'
    return 'Unhealthy'

In [18]:
# Classify every row with label_parameter, then attach the resulting labels
# as a new column.
quality_labels = filtered_df.apply(label_parameter, axis=1)
filtered_df['quality_label'] = quality_labels

In [19]:
# Preview the labelled frame, including the new quality_label column.
filtered_df.head()

Unnamed: 0,city,parameter,value,unit,country,latitude,longitude,quality_label
0,LANSING,nox,0.0025,ppm,US,42.76138,-84.562867,Good
1,Mooroolbark,pm10,6.53,µg/m³,AU,-37.77512,145.3284,Good
2,Footscray,pm25,1.22,µg/m³,AU,-37.80266,144.8778,Good
3,Footscray,pm10,6.16,µg/m³,AU,-37.80266,144.8778,Good
4,Mooroolbark,pm25,5.45,µg/m³,AU,-37.77512,145.3284,Good


In [20]:
# Check the row count, dtypes and non-null counts of the final frame.
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9217 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   city           9217 non-null   object 
 1   parameter      9217 non-null   object 
 2   value          9217 non-null   float64
 3   unit           9217 non-null   object 
 4   country        9217 non-null   object 
 5   latitude       9217 non-null   float64
 6   longitude      9217 non-null   float64
 7   quality_label  9217 non-null   object 
dtypes: float64(3), object(5)
memory usage: 648.1+ KB
