# Data Analysis for plotly/Streamlit

In [40]:
import pandas as pd
import plotly.express as px

In [41]:
# Import dataframe
df = pd.read_csv("../data/load/go_capstone_data.csv")
df

Unnamed: 0,country,year,iso_code,population,gdp,co2,co2_per_capita,cumulative_co2,cumulative_oil_co2,oil_co2,oil_co2_per_capita,region,coal_cons_per_capita,coal_consumption,energy_per_capita
0,Afghanistan,1994,AFG,16250799,7919856640,1.454,0.089,64.941,33.346,1.032,0.064,Asia,6114.397,1416.601,484.347
1,Afghanistan,1995,AFG,17065836,12307525632,1.417,0.083,66.358,34.357,1.011,0.059,Asia,6064.712,1459.240,386.491
2,Afghanistan,1996,AFG,17763265,12070125568,1.370,0.077,67.728,35.343,0.986,0.055,Asia,6015.026,1501.880,368.900
3,Afghanistan,1997,AFG,18452100,11850753024,1.304,0.071,69.032,36.292,0.949,0.051,Asia,5965.341,1544.519,341.219
4,Afghanistan,1998,AFG,19159996,11692171264,1.279,0.067,70.311,37.234,0.942,0.049,Asia,5915.656,1587.159,321.279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4901,Zimbabwe,2019,ZWE,15271377,25146417152,10.263,0.672,774.802,144.767,3.778,0.247,Africa,4872.263,2482.590,2603.889
4902,Zimbabwe,2020,ZWE,15526888,23178706944,8.495,0.547,783.296,147.830,3.063,0.197,Africa,4822.578,2525.230,2163.075
4903,Zimbabwe,2021,ZWE,15797220,25140088832,10.204,0.646,793.500,151.565,3.735,0.236,Africa,4772.892,2567.869,2382.443
4904,Zimbabwe,2022,ZWE,16069061,25901586432,10.425,0.649,803.925,154.944,3.379,0.210,Africa,4723.207,2610.509,3633.079


In [42]:
# df_regions_mean: grouped by region and year with aggregate set as mean
df_regions_mean = df.groupby(['region', 'year'], as_index=False).mean(numeric_only=True)

# df_regions_sum: grouped by region and year with aggregate set as sum
df_regions_sum = df.groupby(['region', 'year'], as_index=False).sum(numeric_only=True)

# df_regions_median: grouped by region and year with aggregate set as median
df_regions_median = df.groupby(['region', 'year'], as_index=False).median(numeric_only=True)

# df_regions_mode: grouped by region and year with aggregate set as mode
df_regions_mode = df.groupby(['region', 'year'], as_index=False).agg(lambda x: x.mode().iloc[0] if not x.mode().empty else pd.NA)

# df_regions_stddev: grouped by region and year with aggregate set as standard deviation
df_regions_stddev = df.groupby(['region', 'year'], as_index=False).std(numeric_only=True)

# df_regions_min: grouped by region and year with aggregate set as minimum
df_regions_min = df.groupby(['region', 'year'], as_index=False).min(numeric_only=True)

# df_regions_max: grouped by region and year with aggregate set as maximum
df_regions_max = df.groupby(['region', 'year'], as_index=False).max(numeric_only=True)

# list of each df_regions
df_regions = [
    df_regions_mean,
    df_regions_sum,
    df_regions_median,
    df_regions_mode,
    df_regions_stddev,
    df_regions_min,
    df_regions_max
]

# list of countries in each region
region_countries = df.groupby('region')['country'].unique()
region_countries

region
Africa           [Algeria, Angola, Benin, Botswana, Burkina Fas...
Asia             [Afghanistan, Armenia, Azerbaijan, Bahrain, Ba...
Europe           [Albania, Austria, Belarus, Belgium, Bosnia an...
North America    [Barbados, Canada, Costa Rica, Cuba, Dominica,...
Oceania                                   [Australia, New Zealand]
Other                                      [Cape Verde, Hong Kong]
South America    [Argentina, Bolivia, Brazil, Chile, Colombia, ...
Name: country, dtype: object

In [43]:
# Track co2_per_capita by year, with a line chart
for df_agg in df_regions:
    fig = px.line(
        df_agg,
        x='year',
        y='co2_per_capita',
        title='CO2 Emissions per Capita Over Time',
        labels={'co2_per_capita': 'CO2 Emissions per Capita', 'year': 'Year'},
        color='region'
    )
    fig.show()