#### Import Libraries

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("China_Air_Quality.csv")
df.head()

Unnamed: 0,location,coordinates.latitude,coordinates.longitude,unit,co,humidity,no2,o3,pm1,pm10,pm25,pressure,so2,temperature,um003,um005,um010,um025,um050,um100
0,"Kunshan, Suzhou, China",31.40484,120.90408,particles/cm³,,55.0,,,10.8,20.4,18.5,1004.43,,98.0,20.09,5.72,1.34,0.13,0.02,0.02
1,atelierYVF,22.290922,114.19941,mb,,60.0,,,3.3,5.7,4.8,1001.94,,93.0,11.94,2.54,0.33,0.03,0.01,0.0
2,TVA Connected Communities #3,35.991707,-83.89293,particles/cm³,,69.0,,,7.1,10.2,9.8,983.91,,75.0,13.27,3.7,0.62,0.03,0.0,0.0
3,Juarez casa,33.7174,117.90313,µg/m³,,61.0,,,10.3,17.0,15.3,1010.7,,79.0,18.49,5.34,1.08,0.06,0.02,0.01
4,Science Park Shenzhen,22.528564,113.94328,mb,,57.0,,,6.7,7.6,7.5,999.63,,100.0,44.74,4.4,0.16,0.01,0.0,0.0


In [3]:
df.isnull().sum()

location                  0
coordinates.latitude      0
coordinates.longitude     0
unit                      0
co                       26
humidity                 91
no2                      19
o3                       17
pm1                      88
pm10                     10
pm25                      1
pressure                 91
so2                      18
temperature              91
um003                    91
um005                    91
um010                    88
um025                    88
um050                    91
um100                    88
dtype: int64

In [4]:
df.columns

Index(['location', 'coordinates.latitude', 'coordinates.longitude', 'unit',
       'co', 'humidity', 'no2', 'o3', 'pm1', 'pm10', 'pm25', 'pressure', 'so2',
       'temperature', 'um003', 'um005', 'um010', 'um025', 'um050', 'um100'],
      dtype='object')

We can assume that cities without enough informations are far from areas where the air is so poluant.
So we can focus just on the cities that we have enough informations about them.

In [8]:
reduced_df = df[df['co'].notnull() & df['no2'].notnull() & df['o3'].notnull() & df['pm10'].notnull() & df['pm25'].notnull() & df['so2'].notnull()]

In [9]:
reduced_df

Unnamed: 0,location,coordinates.latitude,coordinates.longitude,unit,co,humidity,no2,o3,pm1,pm10,pm25,pressure,so2,temperature,um003,um005,um010,um025,um050,um100
5,Mong Kok,22.322500,114.168333,µg/m³,646.8,,62.9,18.9,,14.7,8.3,,5.0,,,,,,,
6,North,22.496710,114.128240,µg/m³,247.4,,13.2,49.3,,11.3,3.8,,0.7,,,,,,,
8,Central,22.281944,114.158056,µg/m³,274.9,,44.7,26.6,,11.0,8.2,,2.6,,,,,,,
9,Tseung Kwan O,22.317778,114.259444,µg/m³,145.6,,9.3,51.2,,10.0,2.4,,4.6,,,,,,,
11,Tuen Mun,22.391111,113.976667,µg/m³,510.1,,37.1,43.1,,13.0,9.0,,2.9,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,唐尧大酒店,36.101400,111.505000,µg/m³,800.0,,13.0,214.0,,32.0,40.0,,5.0,,,,,,,
93,城南,36.041400,111.503600,µg/m³,800.0,,21.0,205.0,,86.0,49.0,,6.0,,,,,,,
94,外国语学校,29.699400,116.025400,µg/m³,600.0,,18.0,162.0,,64.0,38.0,,4.0,,,,,,,
95,宁德一中,26.656200,119.523100,µg/m³,1700.0,,5.0,137.0,,61.0,35.0,,6.0,,,,,,,


We will reduce the dataframe so it will contain just the parameters we need to calculate the AQI : PM10, PM25, NO2, CO and O3. These parameters are also those who don't contain lot of null values

In [10]:
reduced_df = reduced_df.drop(columns=['unit', 'humidity', 'pm1', 'pressure', 'temperature', 'um003', 'um005', 'um010', 'um025', 'um050', 'um100'])

In [11]:
reduced_df

Unnamed: 0,location,coordinates.latitude,coordinates.longitude,co,no2,o3,pm10,pm25,so2
5,Mong Kok,22.322500,114.168333,646.8,62.9,18.9,14.7,8.3,5.0
6,North,22.496710,114.128240,247.4,13.2,49.3,11.3,3.8,0.7
8,Central,22.281944,114.158056,274.9,44.7,26.6,11.0,8.2,2.6
9,Tseung Kwan O,22.317778,114.259444,145.6,9.3,51.2,10.0,2.4,4.6
11,Tuen Mun,22.391111,113.976667,510.1,37.1,43.1,13.0,9.0,2.9
...,...,...,...,...,...,...,...,...,...
92,唐尧大酒店,36.101400,111.505000,800.0,13.0,214.0,32.0,40.0,5.0
93,城南,36.041400,111.503600,800.0,21.0,205.0,86.0,49.0,6.0
94,外国语学校,29.699400,116.025400,600.0,18.0,162.0,64.0,38.0,4.0
95,宁德一中,26.656200,119.523100,1700.0,5.0,137.0,61.0,35.0,6.0


In [12]:
reduced_df.isnull().sum()

location                 0
coordinates.latitude     0
coordinates.longitude    0
co                       0
no2                      0
o3                       0
pm10                     0
pm25                     0
so2                      0
dtype: int64

# Exploratory data analysis

### Visualize the CO level

In [15]:
import folium

# Create a base map centered at an initial location
map_center = [reduced_df['coordinates.latitude'].mean(), reduced_df['coordinates.longitude'].mean()]  # Center the map based on average latitude and longitude
m = folium.Map(location=map_center, zoom_start=4)

# Add markers to the map
for index, row in reduced_df.iterrows():
    folium.Marker(
        location=[row['coordinates.latitude'], row['coordinates.longitude']],
        popup=f"City: {row['location']}<br>Parameter Value: {row['co']}",
        icon=folium.Icon(color='blue')
    ).add_to(m)

# Display the map
m.save('map_with_markers.html')  # Save the map as an HTML file


In [22]:
import plotly.express as px

# Create a bubble map visualization
fig = px.scatter_mapbox(
    reduced_df,
    lat='coordinates.latitude',
    lon='coordinates.longitude',
    hover_name='location',
    size='co',  # Size of bubbles based on 'value' column
    color='co',  # Color of bubbles based on 'value' column
    color_continuous_scale='Viridis',  # Color scale
    size_max=50,  # Maximum size of bubbles
)

# Update the map layout
fig.update_layout(
    mapbox_style='carto-positron',  # Map style
    mapbox_zoom=4,  # Initial zoom level
    mapbox_center={'lat': reduced_df['coordinates.latitude'].mean(), 'lon': reduced_df['coordinates.longitude'].mean()},  # Map center
)

# Display the plot
fig.show()


In [21]:
# Specify the CO value threshold (above which cities will be shown)
co_threshold = 1000

# Filter the DataFrame to include only cities above the CO threshold
filtered_df = reduced_df[reduced_df['co'] > co_threshold]

# Create a bubble map visualization
fig = px.scatter_mapbox(
    filtered_df,
    lat='coordinates.latitude',
    lon='coordinates.longitude',
    hover_name='location',
    size='co',  # Size of bubbles based on 'value' column
    color='co',  # Color of bubbles based on 'value' column
    color_continuous_scale='Viridis',  # Color scale
    size_max=50,  # Maximum size of bubbles
)

# Update the map layout
fig.update_layout(
    mapbox_style='carto-positron',  # Map style
    mapbox_zoom=4,  # Initial zoom level
    mapbox_center={'lat': filtered_df['coordinates.latitude'].mean(), 'lon': filtered_df['coordinates.longitude'].mean()},  # Map center
)

# Display the plot
fig.show()


We can see clearly that the biggest CO levels are located in the southern east of the country (China), which is normal because it's the industrial area of China where the most factories are set.

I suggest for the next step of the project buildings some models to modelize the air quality of the most poluant city in China

### Visualize the NO2 level

In [30]:
import plotly.express as px

# Create a bubble map visualization
fig = px.scatter_mapbox(
    reduced_df,
    lat='coordinates.latitude',
    lon='coordinates.longitude',
    hover_name='location',
    size='no2',  # Size of bubbles based on 'value' column
    color='no2',  # Color of bubbles based on 'value' column
    color_continuous_scale='Viridis',  # Color scale
    size_max=50,  # Maximum size of bubbles
)

# Update the map layout
fig.update_layout(
    mapbox_style='carto-positron',  # Map style
    mapbox_zoom=4,  # Initial zoom level
    mapbox_center={'lat': reduced_df['coordinates.latitude'].mean(), 'lon': reduced_df['coordinates.longitude'].mean()},  # Map center
)

# Display the plot
fig.show()


For the case of NO2, the high levels are located in southern east provinces. These provinces have many energy intensive heavy industries and rely heavily on coal for heat and power. Some export electricity to other provinces.

### Visulize O3

In [24]:
# Create a bubble map visualization
fig = px.scatter_mapbox(
    reduced_df,
    lat='coordinates.latitude',
    lon='coordinates.longitude',
    hover_name='location',
    size='o3',  # Size of bubbles based on 'value' column
    color='o3',  # Color of bubbles based on 'value' column
    color_continuous_scale='Viridis',  # Color scale
    size_max=50,  # Maximum size of bubbles
)

# Update the map layout
fig.update_layout(
    mapbox_style='carto-positron',  # Map style
    mapbox_zoom=4,  # Initial zoom level
    mapbox_center={'lat': reduced_df['coordinates.latitude'].mean(), 'lon': reduced_df['coordinates.longitude'].mean()},  # Map center
)

# Display the plot
fig.show()

Unlike the CO, the high O3 levels are located in the northern provinces where is less factories and industries.

### Visualize the Particular matter pm10

In [29]:
# Create a bubble map visualization
fig = px.scatter_mapbox(
    reduced_df,
    lat='coordinates.latitude',
    lon='coordinates.longitude',
    hover_name='location',
    size='pm10',  # Size of bubbles based on 'value' column
    color='pm10',  # Color of bubbles based on 'value' column
    color_continuous_scale='Viridis',  # Color scale
    size_max=50,  # Maximum size of bubbles
)

# Update the map layout
fig.update_layout(
    mapbox_style='carto-positron',  # Map style
    mapbox_zoom=4,  # Initial zoom level
    mapbox_center={'lat': reduced_df['coordinates.latitude'].mean(), 'lon': reduced_df['coordinates.longitude'].mean()},  # Map center
)

# Display the plot
fig.show()

### Visualize the PM25

In [31]:
# Create a bubble map visualization
fig = px.scatter_mapbox(
    reduced_df,
    lat='coordinates.latitude',
    lon='coordinates.longitude',
    hover_name='location',
    size='pm25',  # Size of bubbles based on 'value' column
    color='pm25',  # Color of bubbles based on 'value' column
    color_continuous_scale='Viridis',  # Color scale
    size_max=50,  # Maximum size of bubbles
)

# Update the map layout
fig.update_layout(
    mapbox_style='carto-positron',  # Map style
    mapbox_zoom=4,  # Initial zoom level
    mapbox_center={'lat': reduced_df['coordinates.latitude'].mean(), 'lon': reduced_df['coordinates.longitude'].mean()},  # Map center
)

# Display the plot
fig.show()

# Calculate the air quality index (AQI)

The formula for calculating the AQI is the following : 

<p align = 'center'>AQI = [ Σ (Pollutant Data Reading) / Σ (Standards) ] * 100</p>

![Local Image](AQI.png)

Now we calculate the AQI

In [None]:
# Standards for each pollutant
standards = {
    'pm25': 25,
    'pm10': 50,
    'co': 9.0,
    'no2': 0.08,
    'o3': 0.065
}

# Calculate AQI for each city
def calculate_aqi(row):
    pollutants = ['pm25', 'pm10', 'co', 'no2', 'o3']
    total_aqi = []
    for pollutant in pollutants:
        total_aqi.append( (row[pollutant] / standards[pollutant]) * 100 )
    return max(total_aqi)

reduced_df['AQI'] = reduced_df.apply(calculate_aqi, axis=1)

In [38]:
reduced_df

Unnamed: 0,location,coordinates.latitude,coordinates.longitude,co,no2,o3,pm10,pm25,so2,AQI
5,Mong Kok,22.322500,114.168333,646.8,62.9,18.9,14.7,8.3,5.0,78625.000000
6,North,22.496710,114.128240,247.4,13.2,49.3,11.3,3.8,0.7,75846.153846
8,Central,22.281944,114.158056,274.9,44.7,26.6,11.0,8.2,2.6,55875.000000
9,Tseung Kwan O,22.317778,114.259444,145.6,9.3,51.2,10.0,2.4,4.6,78769.230769
11,Tuen Mun,22.391111,113.976667,510.1,37.1,43.1,13.0,9.0,2.9,66307.692308
...,...,...,...,...,...,...,...,...,...,...
92,唐尧大酒店,36.101400,111.505000,800.0,13.0,214.0,32.0,40.0,5.0,329230.769231
93,城南,36.041400,111.503600,800.0,21.0,205.0,86.0,49.0,6.0,315384.615385
94,外国语学校,29.699400,116.025400,600.0,18.0,162.0,64.0,38.0,4.0,249230.769231
95,宁德一中,26.656200,119.523100,1700.0,5.0,137.0,61.0,35.0,6.0,210769.230769


### Visualize the AQI

In [40]:
# Create a bubble map visualization
fig = px.scatter_mapbox(
    reduced_df,
    lat='coordinates.latitude',
    lon='coordinates.longitude',
    hover_name='location',
    size='AQI',  # Size of bubbles based on 'value' column
    color='AQI',  # Color of bubbles based on 'value' column
    color_continuous_scale='Viridis',  # Color scale
    size_max=50,  # Maximum size of bubbles
)

# Update the map layout
fig.update_layout(
    mapbox_style='carto-positron',  # Map style
    mapbox_zoom=4,  # Initial zoom level
    mapbox_center={'lat': reduced_df['coordinates.latitude'].mean(), 'lon': reduced_df['coordinates.longitude'].mean()},  # Map center
)

# Display the plot
fig.show()

For the rest of our project we are going to do some building models to predict the air quality. For that we are going to choose the most pollutant city in china, and we are going to work on it and do some time series on it.

In [42]:
# Find the index of the row with the highest AQI value
highest_aqi_index = reduced_df['AQI'].idxmax()

# Retrieve the entire row with the highest AQI value
row_with_highest_aqi = reduced_df.loc[highest_aqi_index]

print("Row with the highest AQI value:")
print(row_with_highest_aqi)

Row with the highest AQI value:
location                            金胜
coordinates.latitude            37.782
coordinates.longitude         112.4701
co                               600.0
no2                               28.0
o3                               266.0
pm10                              50.0
pm25                              13.0
so2                               13.0
AQI                      409230.769231
Name: 84, dtype: object


So as we can see Jin Sheng is the most pollutant city in china.