# CO2 Emissions - Exploratory Data Analysis
## Tracking Global Carbon Footprints

This notebook explores global CO2 emissions data to answer key questions:
1. Which countries emit the most CO2?
2. What are per capita emission patterns? (Not answered in this notebook yet — it requires population data; see Next Steps.)
3. How have emissions changed over time?
4. What sectors contribute most to emissions?

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# NOTE(review): this silences every warning category for the whole kernel session,
# including deprecation warnings raised by pandas/plotly in later cells.
warnings.filterwarnings('ignore')

# Set style
# Seaborn grid theme and a wide default figure size for matplotlib/seaborn figures.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Import custom module
# '../src' is a relative entry, so it is resolved against the kernel's working
# directory — presumably the folder containing this notebook; confirm when running
# from elsewhere. The append has to happen before the import on the last line so
# that `data_processing` can be located there.
import sys
sys.path.append('../src')
from data_processing import CO2DataProcessor

## 1. Load and Explore Data

In [None]:
# Point the project's processor at the raw CSV file
RAW_DATA_PATH = '../data/raw/dataset.csv'
processor = CO2DataProcessor(RAW_DATA_PATH)

# Read the raw table and preview its first ten rows
df_raw = processor.load_data()
df_raw.head(n=10)

In [None]:
# Structural overview of the raw frame: size, dtypes and null counts
print("Dataset Shape:", df_raw.shape)

print("\nColumn Types:")
column_types = df_raw.dtypes
print(column_types)

print("\nMissing Values:")
missing_per_column = df_raw.isnull().sum()
print(missing_per_column)

# The describe() table is left as the cell's last expression so it renders richly
print("\nBasic Statistics:")
df_raw.describe()

In [None]:
# Cardinality of the country/sector columns and the span of the date column
summary_lines = (
    f"Number of Countries: {df_raw['country'].nunique()}",
    f"Number of Sectors: {df_raw['sector'].nunique()}",
    f"\nUnique Sectors:\n{df_raw['sector'].unique()}",
    f"\nDate Range: {df_raw['date'].min()} to {df_raw['date'].max()}",
)
# One line per entry, exactly as separate print() calls would emit them
print(*summary_lines, sep="\n")

## 2. Data Cleaning

In [None]:
# Run the processor's cleaning step; `df` is the frame every later cell works on
df = processor.clean_data()

# Report the resulting size, then preview the first five rows
print(f"\nCleaned Data Shape: {df.shape}")
df.head(n=5)

In [None]:
# Earliest and latest year present in the cleaned frame
print(f"Year Range: {df['year'].min()} - {df['year'].max()}")

# Number of rows per year, listed in chronological order
print(f"\nRecords per Year:")
year_counts = df['year'].value_counts()
print(year_counts.sort_index())

## 3. Top Emitting Countries (Total Emissions)

In [None]:
# Get top 20 emitters (all time)
# The returned frame is expected to expose 'country' and 'total_emissions'
# columns, which the chart below relies on.
top_emitters = processor.get_top_emitters(n=20)
print("Top 20 CO2 Emitting Countries (All Time):")
print(top_emitters)

# Visualize as a horizontal bar chart, shaded by emission volume
fig = px.bar(top_emitters,
             x='total_emissions',
             y='country',
             orientation='h',
             title='Top 20 CO2 Emitting Countries (Total)',
             labels={'total_emissions': 'Total CO2 Emissions', 'country': 'Country'},
             color='total_emissions',
             color_continuous_scale='Reds')
# Plotly places the first category at the bottom of a horizontal bar chart, so a
# frame listed from largest to smallest would show the biggest emitter last.
# Ordering the y-axis by bar length puts the largest emitter at the top
# regardless of the row order in `top_emitters`.
fig.update_layout(height=600,
                  showlegend=False,
                  yaxis={'categoryorder': 'total ascending'})
fig.show()

In [None]:
# Restrict the ranking to the most recent year found in the cleaned data
latest_year = df['year'].max()
top_emitters_recent = processor.get_top_emitters(n=15, by_year=latest_year)
print(f"\nTop 15 Emitters in {latest_year}:")
print(top_emitters_recent)

# Donut chart: each slice is a country's share among the listed emitters
donut_options = dict(
    values='total_emissions',
    names='country',
    title=f'Top 15 CO2 Emitters - {latest_year}',
    hole=0.4,
)
fig = px.pie(top_emitters_recent, **donut_options)
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.show()

## 4. Temporal Trends - Emissions Over Time

In [None]:
# Sum the 'value' column over every row of each year
global_trend = (
    df.groupby('year')['value']
      .sum()
      .rename('total_emissions')
      .reset_index()
)

# Single red line with a marker per year
trend_labels = {'total_emissions': 'Total CO2 Emissions', 'year': 'Year'}
fig = px.line(
    global_trend,
    x='year',
    y='total_emissions',
    markers=True,
    labels=trend_labels,
    title='Global CO2 Emissions Trend Over Time',
)
fig.update_traces(line=dict(color='#d62728', width=3))
fig.update_layout(hovermode='x unified')
fig.show()

In [None]:
# Take the first ten countries from the all-time ranking computed earlier
top_10_countries = top_emitters['country'].head(10).tolist()

# Keep only their rows, then total emissions per (year, country)
in_top10 = df['country'].isin(top_10_countries)
df_top10 = df[in_top10]
trend_top10 = (
    df_top10.groupby(['year', 'country'])['value']
            .sum()
            .reset_index()
)

# One line per country
fig = px.line(
    trend_top10,
    x='year',
    y='value',
    color='country',
    markers=True,
    labels={'value': 'CO2 Emissions', 'year': 'Year'},
    title='CO2 Emissions Trend - Top 10 Emitting Countries',
)
fig.update_layout(height=600, hovermode='x unified')
fig.show()

## 5. Sectoral Breakdown

In [None]:
# Sector-level totals as returned by the processor; the chart below reads its
# 'sector' and 'total_emissions' columns
sectoral_global = processor.get_sectoral_breakdown()
print("Global Emissions by Sector:")
print(sectoral_global)

# Vertical bars, one per sector, shaded by emission volume
sector_bar_options = dict(
    x='sector',
    y='total_emissions',
    color='total_emissions',
    color_continuous_scale='Viridis',
    labels={'total_emissions': 'Total CO2 Emissions', 'sector': 'Sector'},
    title='Global CO2 Emissions by Sector',
)
fig = px.bar(sectoral_global, **sector_bar_options)
# Tilt the tick labels so sector names do not overlap
fig.update_layout(height=500, xaxis_tickangle=-45)
fig.show()

In [None]:
# Total emissions for each (year, sector) pair
sectoral_trend = (
    df.groupby(['year', 'sector'])['value']
      .sum()
      .reset_index()
)

# Stacked area chart with one band per sector
fig = px.area(
    sectoral_trend,
    x='year',
    y='value',
    color='sector',
    labels={'value': 'CO2 Emissions', 'year': 'Year'},
    title='CO2 Emissions by Sector Over Time (Stacked Area)',
)
fig.update_layout(height=600, hovermode='x unified')
fig.show()

## 6. Growth Rate Analysis

In [None]:
# Growth-rate table as computed by the project's processor
growth_rates = processor.calculate_growth_rates()

# Preview the first 20 rows
growth_rates.iloc[:20]

In [None]:
# Average growth rate by country (last 5 years)
GROWTH_WINDOW_YEARS = 5

# `recent_years` is the FIRST year of the window. The filter below is inclusive
# (`>=`), so subtracting the full window length would select six distinct years
# (max-5 ... max). Subtracting (window - 1) yields exactly five: max-4 ... max.
recent_years = df['year'].max() - (GROWTH_WINDOW_YEARS - 1)
recent_growth = growth_rates[growth_rates['year'] >= recent_years]

# Mean of the yearly growth rates per country, highest first.
# NOTE(review): if `growth_rate` can contain inf (e.g. a percentage change from a
# zero base), the mean for that country will be inf too — confirm how
# CO2DataProcessor.calculate_growth_rates handles that case.
avg_growth = recent_growth.groupby('country')['growth_rate'].mean().reset_index()
avg_growth = avg_growth.sort_values('growth_rate', ascending=False)

print("\nTop 15 Countries by Average Growth Rate (Last 5 Years):")
print(avg_growth.head(15))

print("\nBottom 15 Countries by Average Growth Rate (Last 5 Years):")
print(avg_growth.tail(15))

In [None]:
# High-growth countries (>5% annual growth)
# NOTE(review): the threshold assumes `growth_rate` is expressed in percent
# (the axis label below says "%") — confirm against the processor.
HIGH_GROWTH_THRESHOLD = 5
high_growth = avg_growth[avg_growth['growth_rate'] > HIGH_GROWTH_THRESHOLD]
print(f"\nCountries with >5% Average Annual Growth (Last 5 Years): {len(high_growth)}")
print(high_growth)

# Visualize (skipped when no country crosses the threshold, to avoid an empty figure)
if not high_growth.empty:
    # `avg_growth` is sorted from highest to lowest, so head(20) keeps the 20
    # fastest-growing countries.
    fig = px.bar(high_growth.head(20),
                 x='growth_rate',
                 y='country',
                 orientation='h',
                 title='Countries with >5% Average Annual CO2 Growth Rate',
                 labels={'growth_rate': 'Average Growth Rate (%)', 'country': 'Country'},
                 color='growth_rate',
                 color_continuous_scale='Reds')
    # Plotly places the first category at the bottom of a horizontal bar chart;
    # order the y-axis by bar length so the highest growth rate appears at the top.
    fig.update_layout(height=600,
                      yaxis={'categoryorder': 'total ascending'})
    fig.show()

## 7. Animated Choropleth Map

In [None]:
# Prepare data for animated map: total emissions per (country, year)
yearly_emissions = df.groupby(['country', 'year'])['value'].sum().reset_index()
yearly_emissions.columns = ['country', 'year', 'emissions']

# Plotly Express builds animation frames in order of first appearance in the
# data. The groupby result is ordered country-first, so any year missing for the
# first country would be appended out of sequence. Sorting by year guarantees a
# chronological slider.
yearly_emissions = (
    yearly_emissions.sort_values(['year', 'country'])
                    .reset_index(drop=True)
)

# Pin the colour scale to the full data range so a given shade means the same
# emission level in every frame; otherwise colours are not comparable across years.
emissions_range = (
    yearly_emissions['emissions'].min(),
    yearly_emissions['emissions'].max(),
)

# Create animated choropleth (countries are matched to map shapes by name)
fig = px.choropleth(yearly_emissions,
                    locations='country',
                    locationmode='country names',
                    color='emissions',
                    hover_name='country',
                    animation_frame='year',
                    color_continuous_scale='Reds',
                    range_color=emissions_range,
                    title='Global CO2 Emissions Over Time (Animated Map)',
                    labels={'emissions': 'CO2 Emissions'})

fig.update_layout(height=600)
fig.show()

## 8. Key Insights Summary

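The exploratory analysis above is organized around the following themes; record the concrete findings under each heading after running the notebook: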

### Top Emitters
- Identify the top 10-15 countries responsible for the majority of global emissions
- Analyze concentration of emissions among few countries

### Temporal Trends
- Overall global emission trends (increasing/decreasing)
- Country-specific trajectories
- Sectoral evolution over time

### High-Growth Countries
- Countries whose average annual growth rate exceeded 5% over the last five years need immediate attention
- Potential for clean energy intervention

### Sectoral Insights
- Which sectors contribute most to emissions
- Sector-specific policy recommendations

**Next Steps:**
1. Add population and GDP data for per capita and per-GDP (emissions intensity) analysis
2. Identify low-emission leaders
3. Develop correlation analysis
4. Create policy recommendations

In [None]:
# Save processed data for dashboard
# Delegates entirely to the processor: the output location and file format are
# decided inside CO2DataProcessor.save_processed_data, which is not visible in
# this notebook.
processor.save_processed_data()