# ***Question 1 - Biggest predictor of a large CO<sub>2</sub> output***

First, I import the packages required for this analysis.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import duckdb as ddb

In [None]:
# Notebook-wide display settings, one option per call:
# no limit on the number of columns shown, at most 1000 rows printed,
# and floats rendered with three decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.float_format', lambda value: '%.3f' % value)

Here I load the required data directly from the raw data source and check that it loads as expected.

I've chosen this data source because these are relations that are commonly announced in media and other information sources.

In [None]:
# Download the three Our World in Data grapher exports (full CSV, long column
# names). The charts are requested in the order CO2, GDP, fossil fuels and
# unpacked into one DataFrame each.
df_co2, df_gdp, df_ff = (
    pd.read_csv(
        f"https://ourworldindata.org/grapher/{chart}.csv"
        "?v=1&csvType=full&useColumnShortNames=false"
    )
    for chart in (
        "co2-emissions-per-capita",
        "gdp-per-capita-worldbank",
        "fossil-fuel-cons-per-capita",
    )
)

# Quick sanity checks (uncomment to inspect the first rows):
# df_co2.head()
# df_gdp.head()
# df_ff.head()


In this cell I rename the measurement columns to shorter names, which makes them easier to work with in the rest of the analysis.

In [None]:
# Give the measurement columns short names for the rest of the analysis.
#
# The CO2 header contains a subscript two (U+2082). It is written as an escape
# sequence so the key stays correct even if this file is re-saved with a
# different encoding. DataFrame.rename ignores keys that match no column, so a
# garbled key would silently leave the column un-renamed and the later
# df_merged['CO2'] lookups would fail.
df_co2.rename(columns={'Annual CO\u2082 emissions (per capita)': "CO2"}, inplace=True)
df_gdp.rename(columns={'GDP per capita, PPP (constant 2021 international $)': "GDP"}, inplace=True)
# All three fuel columns live in df_ff, so they are renamed in a single call.
df_ff.rename(
    columns={
        'Coal per capita (kWh)': "Coal",
        'Oil per capita (kWh)': "Oil",
        'Gas per capita (kWh)': "Gas",
    },
    inplace=True,
)


Next I will drop a column that is not required for my analysis.

Now I can work with data that has no redundant information.

In [None]:
# The OWID world-region label is not used in this analysis, so it is removed
# from the GDP frame in place.
df_gdp.drop('World regions according to OWID', axis='columns', inplace=True)

Here I merge the other dataframes onto the most important data (CO<sub>2</sub>) and drop the duplicated column names. I do this so I can work with a single dataframe, which is needed for the correlation and regression steps.

Also I do a left join because I only need the data on gdp and fossil fuels where there is data on CO<sub>2</sub>.

In [None]:
# Left-join GDP onto the CO2 rows by country code and year, so GDP values are
# kept only where CO2 data exists. 'Entity' is dropped from the right-hand
# side so the merged frame has a single Entity column.
#
# Rows without a country code are removed from the right-hand side first:
# pandas matches null join keys against each other, so code-less rows that
# share a year would otherwise be paired with unrelated code-less CO2 rows.
# NOTE(review): presumably these are OWID aggregates (continents, income
# groups), which carry no Code - confirm against the raw CSVs.
df_merged = pd.merge(
    df_co2,
    df_gdp.drop(columns=['Entity']).dropna(subset=['Code']),
    how='left',
    on=['Code', 'Year'],
)


In [None]:
# Left-join the fossil-fuel columns onto the merged CO2 + GDP frame by country
# code and year. 'Entity' is dropped from the right-hand side so the merged
# frame keeps a single Entity column.
#
# Rows without a country code are removed from the right-hand side first:
# pandas matches null join keys against each other, so code-less rows that
# share a year would otherwise be paired with unrelated code-less rows.
# NOTE(review): presumably these are OWID aggregates (continents, income
# groups), which carry no Code - confirm against the raw CSVs.
df_merged = pd.merge(
    df_merged,
    df_ff.drop(columns=['Entity']).dropna(subset=['Code']),
    how='left',
    on=['Code', 'Year'],
)


It can be that the influence of each separate fossil fuel source will be less than the combined sources because not every country uses every source.

That's why I've made a combined fuel source column to also be able to see the combined effect on CO<sub>2</sub>

In [None]:
# Combined fossil-fuel consumption: coal + oil + gas.
# A missing value in any of the three columns makes the total missing too.
df_merged['Total ff'] = df_merged['Coal'].add(df_merged['Oil']).add(df_merged['Gas'])


In [None]:
# Express the combined total in MWh (1 MWh = 1000 kWh).
df_merged['Total ff MWh'] = df_merged['Total ff'].div(1000)

I noticed some missing values, so I checked whether there was any data on "Andorra" in the fossil fuel dataframe.

There was no data, so I will drop the rows with missing values later on.

In [None]:
# Show every fossil-fuel row whose entity name contains "andorra"; ILIKE makes
# the match case-insensitive. The query refers to the DataFrame by its Python
# variable name, df_ff. The result is shown as the cell output.
ddb.query("""
    select *
    from df_ff
    where entity ilike '%andorra%'""")

In [None]:
# Recompute the combined fossil-fuel column (coal + oil + gas).
df_merged['Total ff'] = df_merged['Coal'].add(df_merged['Oil']).add(df_merged['Gas'])

# Keep only the rows in which every column has a value. dropna returns a new
# frame, so df_merged itself is left untouched.
df_final = df_merged.dropna(how='any')
df_final  # shown in full; append .head() for a short preview

I started with filtering out the non country specific data (World), because World is not a country.

Next, I noticed an outlier while plotting, so I checked the underlying data (Kuwait 1991). I decided to remove it because it was caused by a one-time event during the Gulf War.

In [None]:
def clean_data(df_final):
    """Return the country-level observations used for the analysis.

    Two kinds of rows are removed:

    * the "World" aggregate, which is not a country;
    * the Kuwait 1991 observation, an outlier caused by a one-off event
      during the Gulf War.

    The outlier is matched explicitly by entity and year rather than by its
    position after sorting, so no other observation is dropped when the
    Kuwait 1991 row is absent or is not the largest CO2 value.

    Args:
        df_final: DataFrame with 'Entity', 'Year' and 'CO2' columns.

    Returns:
        A new DataFrame without the rows above, sorted by 'CO2' in
        descending order. The input frame is not modified.
    """
    is_world = df_final['Entity'] == "World"
    is_kuwait_1991 = (df_final['Entity'] == "Kuwait") & (df_final['Year'] == 1991)
    kept = df_final[~(is_world | is_kuwait_1991)]
    # Keep the descending CO2 order that callers of this function received before.
    return kept.sort_values('CO2', ascending=False)

# Run the cleaning step on a copy so df_final itself stays as it is.
df_final_clean = df_final.copy().pipe(clean_data)

In [None]:
# Display the cleaned data set as the cell output.
df_final_clean

Here I'm checking what the correlation is between CO<sub>2</sub> and each of the independent variables.

With this outcome I'm creating a graph of the highest correlation.

In [None]:
# Correlation of CO2 with GDP, the combined fossil-fuel total and each
# individual fuel (DataFrame.corr, default method).
#
# The cleaned frame is used so these figures are based on the same
# observations as the regression plot, i.e. without the "World" aggregate and
# the outlier row that clean_data removes.

corr_from_df_gdp = df_final_clean[['GDP', 'CO2']].corr(numeric_only=True)
corr_from_df_Total = df_final_clean[['Total ff', 'CO2']].corr(numeric_only=True)
corr_from_df_Coal = df_final_clean[['Coal', 'CO2']].corr(numeric_only=True)
corr_from_df_Oil = df_final_clean[['Oil', 'CO2']].corr(numeric_only=True)
corr_from_df_Gas = df_final_clean[['Gas', 'CO2']].corr(numeric_only=True)

# Print each 2x2 correlation matrix, separated by blank lines.
for corr_matrix in (
    corr_from_df_gdp,
    corr_from_df_Total,
    corr_from_df_Coal,
    corr_from_df_Oil,
    corr_from_df_Gas,
):
    print('\n')
    print("correlation with corr = ", corr_matrix)


Correlation is a relationship between two variables, if a correlation is stronger it will be either closer to 1 or -1, if there is no correlation it will be 0

As you can see above, the correlation of 'Total ff' is the highest so this is the biggest predictor of CO<sub>2</sub>.

Since the correlation between total fossil fuels and CO<sub>2</sub> is the highest I will choose this one for a regression, to better understand the relationship.

I've converted the values from kWh to MWh because the axis scale was hard to read when plotting in kWh.

In [None]:
# Correlation between CO2 and total fossil-fuel use expressed in MWh.
# The cleaned frame is used so the figure is based on the same observations
# as the regression plot.

corr_from_df_Total_MWh = df_final_clean[['Total ff MWh', 'CO2']].corr(numeric_only=True)

print("correlation with corr = ", corr_from_df_Total_MWh)


In [None]:
# Scatter plot of CO2 against total fossil-fuel use (MWh) with a fitted
# straight line, annotated with R-squared and the line's equation.

plt.figure(figsize=(10, 6))
x = df_final_clean['Total ff MWh'].values
y = df_final_clean['CO2'].values

# With a single predictor, R-squared equals the squared Pearson correlation.
r_squared = np.corrcoef(x, y)[0, 1] ** 2
r2_box = dict(boxstyle="round", fc="white", alpha=0.7)
plt.annotate(
    f"R\u00b2 = {r_squared:.3f}",
    xy=(0.95, 0.80),
    xycoords='axes fraction',
    ha='right',
    fontsize=12,
    bbox=r2_box,
)

# Degree-1 least-squares fit: polyfit returns the slope first, then the intercept.
slope, intercept = np.polyfit(x, y, 1)
plt.plot(x, slope * x + intercept, color='green')

plt.scatter(x=x, y=y, alpha=0.25)

# Show the fitted equation in a box near the top-right corner of the axes.
equation = f"y = {slope:.2f}x + {intercept:.2f}"
equation_box = dict(boxstyle="round", ec=(1., 0.5, 0.5), fc=(1., 0.8, 0.8))
plt.text(
    0.79,
    0.95,
    equation,
    transform=plt.gca().transAxes,
    fontsize=12,
    va='center',
    bbox=equation_box,
)

plt.xlabel('Total fossil fuel usage per Capita (MWh)')
plt.ylabel('CO2 emissions per Capita')
plt.title('Correlation between\n Total fossil fuel (MWh) & CO2\n per capita', fontweight='bold')
# Dashed, semi-transparent grid lines on both axes.
plt.grid(axis='both', linestyle='--', alpha=0.7)
plt.show()

I have added an R<sup>2</sup> value to the plot to indicate the strength of the predictive relationship derived from the model.
With a value of approximately 0.84, it indicates a strong predictive relationship between total fossil fuel usage and CO<sub>2</sub> emissions per capita.

# Conclusion

- As shown in the code above, I checked multiple possible predictors of CO<sub>2</sub> output and assessed the correlation between each of them and CO<sub>2</sub> emissions per capita.
- The highest correlation was between total fossil fuel usage and CO<sub>2</sub>, so from the correlation perspective this seems to be the biggest predictor.