<a href="https://colab.research.google.com/github/Kanyarat-Jirarat/DADS6003_ML/blob/main/PM2_5_Pollution_Levels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Load the raw CSV file from GitHub

In [1]:
import pandas as pd
import requests
from io import StringIO

# URL of the raw CSV file on GitHub
url = "https://raw.githubusercontent.com/Kanyarat-Jirarat/DADS6003_ML/main/PM2.5_Global_Air_Pollution.csv"

# Fetch the content of the CSV file. The timeout keeps a stalled connection
# from hanging the notebook indefinitely.
response = requests.get(url, timeout=30)

# Fail loudly on an HTTP error status. Merely printing a message would leave
# `df` undefined, and every later cell would then die with a confusing
# NameError instead of pointing at the failed download.
response.raise_for_status()

# Read the CSV data into a DataFrame
df = pd.read_csv(StringIO(response.text))

# Display the first rows of the DataFrame
print(df.head())

  Country Name Country Code       2010       2011       2012       2013  \
0  Afghanistan          AFG  65.245592  66.835727  66.023174  61.366745   
1       Angola          AGO  33.787247  33.104195  33.415495  34.663923   
2      Albania          ALB  21.277828  22.772537  20.578259  19.938517   
3      Andorra          AND  12.807197  13.273506  12.407053  11.813673   
4   Arab World          ARB  53.787001  52.652279  53.297270  54.053822   

        2014       2015       2016       2017  
0  59.010330  61.252656  56.287047  56.910808  
1  32.974025  32.729873  31.785389  32.388505  
2  18.883955  19.512540  18.189934  18.200603  
3  10.830418  11.462178  10.255834  10.307621  
4  52.583603  60.406813  58.764905  58.689259  


In [2]:
# Print a structural summary of the loaded frame: index range, column names,
# non-null counts, dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Country Name  240 non-null    object 
 1   Country Code  240 non-null    object 
 2   2010          240 non-null    float64
 3   2011          240 non-null    float64
 4   2012          240 non-null    float64
 5   2013          240 non-null    float64
 6   2014          240 non-null    float64
 7   2015          240 non-null    float64
 8   2016          240 non-null    float64
 9   2017          240 non-null    float64
dtypes: float64(8), object(2)
memory usage: 18.9+ KB


In [3]:
import pandas as pd
import plotly.graph_objects as go

In [4]:
# Stacked bar chart of yearly PM2.5 levels for the 20 entries of
# 'Country Name' with the highest total over 2010-2017.

# Single source of truth for the year range; the column labels, the colour
# map, and the bar traces are all derived from it.
years = range(2010, 2018)
numeric_columns = [str(year) for year in years]

# Replace NaNs with 0 and make sure the year columns are floats.
df.fillna(0, inplace=True)
df[numeric_columns] = df[numeric_columns].astype(float)

# Total PM2.5 per country across all years, largest first. Only the 'value'
# column is summed: the melted frame also carries a string 'variable' column
# (the year label) that must not take part in the aggregation.
ttl = (
    df.melt(id_vars='Country Name', value_vars=numeric_columns)
    .groupby('Country Name')[['value']]
    .sum()
    .sort_values(by='value', ascending=False)
)

# Get the top 20 countries
# NOTE(review): the df.head() output includes a regional aggregate
# ("Arab World", code ARB), so this ranking may contain non-country
# groupings -- confirm whether such rows should be filtered out first.
top20_countries = ttl.head(20).index

# Filter the DataFrame to include only the top 20 countries
is_top20 = df['Country Name'].isin(top20_countries)
top20 = df[is_top20]

# One shade of black per year: opacity grows from 0.1 (2010) to 0.8 (2017),
# so later years render darker in the stack.
colors = {year: 'rgba(0, 0, 0, {})'.format((year - 2009) / 10) for year in years}

# One bar trace per year; the traces are stacked per country below.
data = [
    go.Bar(
        name=str(year),
        marker={"color": colors[year], "line": {"width": 0}},
        x=top20['Country Name'],
        y=top20[str(year)]
    ) for year in years
]

# Create the figure
fig = go.Figure(data=data)

# Layout: stack the yearly bars, hide grid lines, label the axes, and order
# the countries by their stacked total (largest first), not alphabetically.
fig.update_layout(
    barmode='stack',
    xaxis={
        'showgrid': False,
        'categoryorder': 'total descending',
        'title': {'text': 'Country Name'},
    },
    yaxis={
        'showgrid': False,
        'title': {'text': 'PM2.5 level (stacked over 2010-2017)'},
    },
    title='Top 20 Countries by PM2.5 Pollution Levels'
)

# Show the figure
fig.show()
