# Data Exploration

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

## CO2 Total by Country 1970-2021


### Raw Data / Pathway

In [2]:
# Relative path to the yearly CO2 Excel workbook (1970-2021); it is resolved
# against the directory the notebook kernel is running in.
CO2_yearly_path = "../raw_data/CO2_YEARLY_DATA_1970-2021.xlsx"

Because this data is already very clean, the loading and cleaning steps below should serve as a good template for the other datasets, which are structured very similarly.

In [None]:
# Load the per-country totals sheet. Rows 0-8 are skipped outright, and
# header=1 then selects the column-name row from what remains.
co2_year = pd.read_excel(
    CO2_yearly_path,
    sheet_name="TOTALS BY COUNTRY",
    skiprows=list(range(9)),
    header=1,
)
co2_year.head()


In [None]:
# Reshape the raw sheet into a long (country, year, CO2) table in a single
# chain, so every step reads top-down and `co2_year_df` is assigned only once.
co2_year_df = (
    co2_year
    # Drop the descriptive columns that are not used in the analysis
    # ('IPCC_annex' was previously listed twice; once is enough).
    .drop(columns=['IPCC_annex', 'Name', 'C_group_IM24_sh', 'Substance'])
    # Wide -> long: one row per (country code, year column).
    .melt(id_vars=['Country_code_A3'])
    .rename(columns={'Country_code_A3': 'country', 'variable': 'year', 'value': 'CO2'})
    # The year labels carry a "Y_" prefix; strip it, then store the year as an integer.
    .assign(year=lambda long_df: long_df['year'].str.removeprefix("Y_"))
    .astype({"year": "int"})
)

co2_year_df

In [None]:
# Check the column types after cleaning; the cell above casts `year` to an integer type.
co2_year_df.dtypes

In [None]:
# Collapse the table to one row per (country, year) by summing CO2;
# reset_index() turns the two group keys back into ordinary columns.
per_country_year = co2_year_df.groupby(["country", "year"])
co2_year_df = per_country_year.sum().reset_index()
co2_year_df


In [None]:
# Interactive line chart of the cleaned table: one coloured trace per country.
fig = px.line(
    data_frame=co2_year_df,
    x="year",
    y="CO2",
    color="country",
    width=1200,
    height=800,
)
fig.show()

## Meryem - Data Analysis

## EDA

In [None]:
# Histogram of every CO2 value in the table. The upper y-limit is fixed at 900,
# so bars taller than that are clipped.
plt.figure(figsize=(10, 6))
co2_hist_ax = sns.histplot(x=co2_year_df['CO2'])
co2_hist_ax.set_ylim(top=900)

In [None]:
# Box plot of the CO2 column across all countries and years.
sns.boxplot(data=co2_year_df, x='CO2')

In [None]:
# Rank countries by their mean CO2 (highest first) once, then take the
# first 100 and the first 10 rows of that ranking.
country_mean_ranked = (
    co2_year_df.groupby('country')['CO2'].mean().sort_values(ascending=False)
)
Total_mean = country_mean_ranked.iloc[:100].reset_index()
top_10_countries_mean = country_mean_ranked.iloc[:10].reset_index()

# Same idea per year: the ten years with the highest mean CO2.
year_mean_ranked = (
    co2_year_df.groupby('year')['CO2'].mean().sort_values(ascending=False)
)
year_mean_10_total = year_mean_ranked.iloc[:10].reset_index()

In [None]:
# Pie chart of the ten countries in `top_10_countries_mean`.
# NOTE(review): the title says "Total", but the plotted values are per-country means.
fig1 = px.pie(
    data_frame=top_10_countries_mean,
    names='country',
    values='CO2',
    title="Top 10 Countries by Total Emission",
)
fig1.show()

In [None]:
# Pie chart of the ten years in `year_mean_10_total`.
# NOTE(review): the title says "Total", but the plotted values are per-year means.
fig2 = px.pie(
    data_frame=year_mean_10_total,
    names='year',
    values='CO2',
    title="Top 10 Years of Highest Total Emission",
)
fig2.show()

In [None]:
def chlorepath(data, color):
    """Show a choropleth world map of ``data`` and return the figure.

    Parameters
    ----------
    data
        Table (e.g. a DataFrame) with a ``country`` column. That column is
        used both to place each region on the map and as the hover label.
        NOTE(review): presumably holds three-letter ISO codes, which is what
        Plotly's default location mode expects - confirm.
    color
        Name of the column in ``data`` that drives the colour scale.

    Returns
    -------
    The Plotly figure that was displayed. The function used to return
    nothing, so ``fig3 = chlorepath(...)`` stored ``None``. In a notebook,
    assign the result or end the call with ``;`` to avoid the figure being
    rendered a second time as the cell output.
    """
    fig = px.choropleth(
        data,
        locations="country",
        color=color,
        hover_name="country",
        color_continuous_scale=px.colors.sequential.Plasma,
    )
    fig.show()
    return fig

In [None]:
# World map coloured by the mean CO2 of each country in `Total_mean`.
fig3 = chlorepath(data=Total_mean, color='CO2')

## Trend Analysis


In [None]:
# Distinct country codes, in order of first appearance in the table.
countries = co2_year_df.loc[:, 'country'].unique()

In [None]:
# One separate figure per country: CO2 against year.
for country in countries:
    is_current_country = co2_year_df['country'].eq(country)
    new_df = co2_year_df.loc[is_current_country]
    years, emissions = new_df['year'], new_df['CO2']
    plt.plot(years, emissions)
    plt.title('CO2 Emissions Trend for ' + country)
    plt.xlabel('Year')
    plt.ylabel('CO2 Emissions')
    plt.show()

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose


In [None]:
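# Additive Decomposition (y = Trend + Seasonal + Residuals)
# NOTE(review): draft only - `co2_yea` is not defined in the cells shown here and the
# melted value column is named 'CO2', so this presumably should read co2_year_df['CO2'];
# confirm before enabling.
#result_add = seasonal_decompose(co2_yea['value'], model='additive')
#result_add.plot();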

In [None]:
# TODO: multiplicative decomposition (y = Trend * Seasonal * Residuals)

In [None]:
# Display the current state of the table before the moving-average step below changes it.
co2_year_df

In [None]:
# Moving-average baseline: 10-year rolling mean of CO2, computed per country.

# Start from a flat copy so the cell can be re-run: after a previous run
# 'year' is the index of `co2_year_df` rather than a column.
if 'year' in co2_year_df.columns:
    co2_ma = co2_year_df.copy()
else:
    co2_ma = co2_year_df.reset_index()

co2_ma['year'] = pd.to_datetime(co2_ma['year'], format='%Y')
co2_ma = co2_ma.sort_values(['country', 'year'])

# Roll within each country. A plain .rolling(10) over the stacked frame would
# average the first nine years of a country with the last years of the
# country stored just before it.
co2_ma['MA_10'] = (
    co2_ma.groupby('country')['CO2']
    .transform(lambda series: series.rolling(10).mean())
)

# Keep the layout this cell has always produced: 'year' as the index and
# 'MA_10' as an extra column on `co2_year_df`.
co2_year_df = co2_ma.set_index('year')

# Draw each country as its own pair of lines so nothing connects the last year
# of one country to the first year of the next. Only the first pair is
# labelled, which keeps the legend at two entries.
for position, (country_code, country_rows) in enumerate(co2_year_df.groupby('country')):
    is_first = position == 0
    plt.plot(
        country_rows.index,
        country_rows['CO2'],
        color='tab:blue',
        alpha=0.3,
        linewidth=0.8,
        label='CO2 Emissions' if is_first else None,
    )
    plt.plot(
        country_rows.index,
        country_rows['MA_10'],
        color='tab:orange',
        alpha=0.6,
        linewidth=0.8,
        label='10-Year Moving Average' if is_first else None,
    )
plt.xlabel('Year')
plt.ylabel('CO2 Emissions')
plt.title('CO2 Emissions Trend with 10-Year Moving Average')
plt.legend()
plt.show()

In [None]:
# TODO: plot actual values versus predictions

In [None]:
# TODO: compute MAPE (or MdAPE) per country