<a href="https://colab.research.google.com/github/Isioma04/gittutorial/blob/main/Global_Climate_Data_Explorer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# SETUPS, LOADING & EXPLORATION

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime, timedelta
import requests

# URL for CO2 emissions data from Our World in Data
co2_url = 'https://raw.githubusercontent.com/owid/co2-data/master/owid-co2-data.csv'

# Download the dataset
co2_df = pd.read_csv(co2_url)

# Display the first few rows and information about the dataframe
display(co2_df.head())
co2_df.info()

Unnamed: 0,country,year,iso_code,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,...,share_global_other_co2,share_of_temperature_change_from_ghg,temperature_change_from_ch4,temperature_change_from_co2,temperature_change_from_ghg,temperature_change_from_n2o,total_ghg,total_ghg_excluding_lucf,trade_co2,trade_co2_share
0,Afghanistan,1750,AFG,2802560.0,,0.0,0.0,,,,...,,,,,,,,,,
1,Afghanistan,1751,AFG,,,0.0,,,,,...,,,,,,,,,,
2,Afghanistan,1752,AFG,,,0.0,,,,,...,,,,,,,,,,
3,Afghanistan,1753,AFG,,,0.0,,,,,...,,,,,,,,,,
4,Afghanistan,1754,AFG,,,0.0,,,,,...,,,,,,,,,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50191 entries, 0 to 50190
Data columns (total 79 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   country                                    50191 non-null  object 
 1   year                                       50191 non-null  int64  
 2   iso_code                                   42262 non-null  object 
 3   population                                 41019 non-null  float64
 4   gdp                                        15251 non-null  float64
 5   cement_co2                                 28863 non-null  float64
 6   cement_co2_per_capita                      25358 non-null  float64
 7   co2                                        29137 non-null  float64
 8   co2_growth_abs                             26981 non-null  float64
 9   co2_growth_prct                            26002 non-null  float64
 10  co2_including_luc     

In [3]:
# Clean and preprocess

# Display the number of missing values per column
print("Missing values before cleaning:")
print(co2_df.isnull().sum())

# Impute missing 'population' and 'gdp' using forward fill within each country
# This assumes that population and gdp values are relatively stable year-on-year within a country
co2_df['population'] = co2_df.groupby('country')['population'].ffill()
co2_df['gdp'] = co2_df.groupby('country')['gdp'].ffill()


# For CO2 emission related columns, filling NaN with 0 might be appropriate if NaN indicates no emissions
# However, it's important to be cautious as NaN might also mean missing data.
# Let's fill with 0 for now for selected columns, assuming NaN means no recorded emissions for that year/country
co2_emission_cols = [col for col in co2_df.columns if 'co2' in col or 'ghg' in col]
for col in co2_emission_cols:
    co2_df[col] = co2_df[col].fillna(0)

# For other numerical columns, imputing with the mean or median might be an option,
# but given the time-series and country-specific nature, forward fill might be better
# Let's apply forward fill for other numerical columns as well, excluding those already handled
numerical_cols = co2_df.select_dtypes(include=np.number).columns.tolist()
cols_to_ffill = [col for col in numerical_cols if col not in ['year', 'population', 'gdp'] + co2_emission_cols]

for col in cols_to_ffill:
    co2_df[col] = co2_df.groupby('country')[col].ffill()
    # After forward fill, there might still be NaNs at the beginning of a country's time series,
    # so we can backfill those
    co2_df[col] = co2_df.groupby('country')[col].bfill()


# Convert 'year' to datetime objects for easier time series analysis
co2_df['year'] = pd.to_datetime(co2_df['year'], format='%Y')

# Display the number of missing values after cleaning
print("\nMissing values after cleaning:")
print(co2_df.isnull().sum())

# Display info and head of the cleaned dataframe
co2_df.info()
display(co2_df.head())

Missing values before cleaning:
country                            0
year                               0
iso_code                        7929
population                      9172
gdp                            34940
                               ...  
temperature_change_from_n2o    12131
total_ghg                      12781
total_ghg_excluding_lucf       12955
trade_co2                      45656
trade_co2_share                45656
Length: 79, dtype: int64

Missing values after cleaning:
country                            0
year                               0
iso_code                        7929
population                      5459
gdp                            31572
                               ...  
temperature_change_from_n2o     6070
total_ghg                          0
total_ghg_excluding_lucf           0
trade_co2                          0
trade_co2_share                    0
Length: 79, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50191 entries, 0 to 50

Unnamed: 0,country,year,iso_code,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,...,share_global_other_co2,share_of_temperature_change_from_ghg,temperature_change_from_ch4,temperature_change_from_co2,temperature_change_from_ghg,temperature_change_from_n2o,total_ghg,total_ghg_excluding_lucf,trade_co2,trade_co2_share
0,Afghanistan,1750-01-01,AFG,2802560.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Afghanistan,1751-01-01,AFG,2802560.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Afghanistan,1752-01-01,AFG,2802560.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Afghanistan,1753-01-01,AFG,2802560.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Afghanistan,1754-01-01,AFG,2802560.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Data Integration

# Assuming you have another DataFrame with temperature data, for example:
# temperature_df = pd.read_csv('temperature_data.csv')
# Make sure the 'year' column in temperature_df is also in datetime format
# temperature_df['year'] = pd.to_datetime(temperature_df['year'], format='%Y')

# For demonstration purposes, let's create a dummy temperature DataFrame
data = {'country': ['Afghanistan', 'Afghanistan', 'Albania', 'Albania'],
        'year': [datetime(1750, 1, 1), datetime(1751, 1, 1), datetime(1750, 1, 1), datetime(1751, 1, 1)],
        'average_temperature': [15.0, 15.2, 12.0, 12.5]}
temperature_df = pd.DataFrame(data)

# Merge the CO2 data with the temperature data
# We'll use an outer merge to keep all data from both dataframes
merged_df = pd.merge(co2_df, temperature_df, on=['country', 'year'], how='outer')

# Display the first few rows of the merged dataframe
display(merged_df.head())

# Display information about the merged dataframe to see the new column
merged_df.info()

Unnamed: 0,country,year,iso_code,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,...,share_of_temperature_change_from_ghg,temperature_change_from_ch4,temperature_change_from_co2,temperature_change_from_ghg,temperature_change_from_n2o,total_ghg,total_ghg_excluding_lucf,trade_co2,trade_co2_share,average_temperature
0,Afghanistan,1750-01-01,AFG,2802560.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0
1,Afghanistan,1751-01-01,AFG,2802560.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.2
2,Afghanistan,1752-01-01,AFG,2802560.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,Afghanistan,1753-01-01,AFG,2802560.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,Afghanistan,1754-01-01,AFG,2802560.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50193 entries, 0 to 50192
Data columns (total 80 columns):
 #   Column                                     Non-Null Count  Dtype         
---  ------                                     --------------  -----         
 0   country                                    50193 non-null  object        
 1   year                                       50193 non-null  datetime64[ns]
 2   iso_code                                   42262 non-null  object        
 3   population                                 44732 non-null  float64       
 4   gdp                                        18619 non-null  float64       
 5   cement_co2                                 50191 non-null  float64       
 6   cement_co2_per_capita                      50191 non-null  float64       
 7   co2                                        50191 non-null  float64       
 8   co2_growth_abs                             50191 non-null  float64       
 9   co2_growth_prct  

In [5]:
# Exploratory Data Analysis (EDA)

# Calculate descriptive statistics for numerical columns
display(merged_df.describe())

# Get information about the data types and non-null values
merged_df.info()

# Analyze trends over time for a specific variable (e.g., global CO2 emissions)
# Need to filter for 'World' to get global data
global_co2_trend = merged_df[merged_df['country'] == 'World'][['year', 'co2']]

# Plot global CO2 emissions over time
fig = px.line(global_co2_trend, x='year', y='co2', title='Global CO2 Emissions Over Time')
fig.show()

# Analyze the distribution of CO2 emissions across countries in a recent year
recent_year = merged_df['year'].max()
co2_recent_year = merged_df[(merged_df['year'] == recent_year) & (merged_df['country'] != 'World')]

# Plot a histogram of CO2 emissions in the most recent year
fig = px.histogram(co2_recent_year, x='co2', title=f'Distribution of CO2 Emissions Across Countries in {recent_year.year}')
fig.show()

# Analyze the relationship between CO2 emissions and GDP
# Filter out rows with NaN in 'co2' or 'gdp' for a specific recent year for scatter plot
co2_gdp_recent_year = merged_df[(merged_df['year'] == recent_year) & (merged_df['country'] != 'World')].dropna(subset=['co2', 'gdp'])

# Create a scatter plot of CO2 emissions vs. GDP
fig = px.scatter(co2_gdp_recent_year, x='gdp', y='co2',
                 title=f'CO2 Emissions vs. GDP Across Countries in {recent_year.year}',
                 hover_name='country')
fig.show()

Unnamed: 0,year,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,co2_including_luc,co2_including_luc_growth_abs,...,share_of_temperature_change_from_ghg,temperature_change_from_ch4,temperature_change_from_co2,temperature_change_from_ghg,temperature_change_from_n2o,total_ghg,total_ghg_excluding_lucf,trade_co2,trade_co2_share,average_temperature
count,50193,44732.0,18619.0,50191.0,50191.0,50191.0,50191.0,50191.0,50191.0,50191.0,...,50191.0,44121.0,50191.0,50191.0,44121.0,50191.0,50191.0,50191.0,50191.0,4.0
mean,1919-11-17 15:20:57.235072768,60111110.0,412198500000.0,4.466945,0.029827,241.322106,3.337687,22.330741,251.672265,3.347055,...,1.853778,0.00261,0.006266,0.009005,0.000439,364.136292,234.535038,-0.653482,1.854483,13.675
min,1750-01-01 00:00:00,215.0,49980000.0,0.0,0.0,0.0,-1977.75,-100.0,-99.693,-2325.5,...,-0.81,-0.001,0.0,-0.001,0.0,-14.961,0.0,-2195.952,-98.849,12.0
25%,1875-01-01 00:00:00,246285.2,5196397000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.375
50%,1924-01-01 00:00:00,2080328.0,20517270000.0,0.0,0.0,0.062,0.0,0.0,0.0,0.0,...,0.029,0.0,0.0,0.0,0.0,4.466,0.475,0.0,0.0,13.75
75%,1974-01-01 00:00:00,9724647.0,98615770000.0,0.011,0.001,9.406,0.076,4.1735,23.4175,0.021,...,0.236,0.0,0.001,0.001,0.0,44.3185,10.957,0.0,0.0,15.05
max,2023-01-01 00:00:00,8091735000.0,130112600000000.0,1696.308,2.484,37791.57,1865.208,180870.0,41416.48,2340.184,...,100.0,0.422,1.161,1.668,0.085,53816.852,44114.785,1798.999,568.635,15.2
std,,319136000.0,3740364000000.0,47.622553,0.090478,1496.69204,45.79858,1245.323926,1533.079862,67.762947,...,8.465024,0.015378,0.039603,0.05611,0.002831,2076.535254,1590.523257,75.361062,16.909858,1.66007


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50193 entries, 0 to 50192
Data columns (total 80 columns):
 #   Column                                     Non-Null Count  Dtype         
---  ------                                     --------------  -----         
 0   country                                    50193 non-null  object        
 1   year                                       50193 non-null  datetime64[ns]
 2   iso_code                                   42262 non-null  object        
 3   population                                 44732 non-null  float64       
 4   gdp                                        18619 non-null  float64       
 5   cement_co2                                 50191 non-null  float64       
 6   cement_co2_per_capita                      50191 non-null  float64       
 7   co2                                        50191 non-null  float64       
 8   co2_growth_abs                             50191 non-null  float64       
 9   co2_growth_prct  

In [6]:
# Advanced Analytics

# Example 1: Time Series Analysis (Decomposition of Global CO2 Emissions)
# We'll decompose the global CO2 emissions time series to identify trend, seasonality, and residual components.
# Requires statsmodels library, so let's install it if not already present.
try:
    from statsmodels.tsa.seasonal import seasonal_decompose
except ImportError:
    %pip install statsmodels
    from statsmodels.tsa.seasonal import seasonal_decompose

# Filter for 'World' and set 'year' as the index
global_co2_ts = merged_df[merged_df['country'] == 'World'].set_index('year')['co2']

# Decompose the time series (assuming additive model)
# The frequency needs to be set for seasonal decomposition. Since data is annual, seasonality might not be evident,
# but we can still decompose to see trend and residuals. Setting freq=1 for annual data.
decomposition = seasonal_decompose(global_co2_ts.dropna(), model='additive', period=1)

# Plot the decomposed components
fig = make_subplots(rows=4, cols=1, subplot_titles=['Original', 'Trend', 'Seasonal', 'Residual'])

fig.add_trace(go.Scatter(x=decomposition.seasonal.index, y=decomposition.observed, mode='lines', name='Original'), row=1, col=1)
fig.add_trace(go.Scatter(x=decomposition.trend.index, y=decomposition.trend, mode='lines', name='Trend'), row=2, col=1)
fig.add_trace(go.Scatter(x=decomposition.seasonal.index, y=decomposition.seasonal, mode='lines', name='Seasonal'), row=3, col=1)
fig.add_trace(go.Scatter(x=decomposition.resid.index, y=decomposition.resid, mode='lines', name='Residual'), row=4, col=1)

fig.update_layout(height=900, title='Time Series Decomposition of Global CO2 Emissions')
fig.show()


# Example 2: Correlation Analysis (CO2 vs. GDP for the most recent year)
# We already have the co2_gdp_recent_year dataframe from EDA
correlation = co2_gdp_recent_year[['co2', 'gdp']].corr()
print("\nCorrelation Matrix (CO2 vs. GDP):")
display(correlation)


# Example 3: Regression Analysis (Modeling CO2 Emissions based on GDP for the most recent year)
# We'll use statsmodels to perform a simple linear regression
try:
    import statsmodels.api as sm
except ImportError:
    %pip install statsmodels
    import statsmodels.api as sm

# Define dependent and independent variables
X = co2_gdp_recent_year['gdp']
y = co2_gdp_recent_year['co2']

# Add a constant to the independent variable for the intercept
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Print the regression summary
print("\nRegression Analysis (CO2 vs. GDP):")
print(model.summary())


Correlation Matrix (CO2 vs. GDP):


Unnamed: 0,co2,gdp
co2,1.0,0.960876
gdp,0.960876,1.0



Regression Analysis (CO2 vs. GDP):
                            OLS Regression Results                            
Dep. Variable:                    co2   R-squared:                       0.923
Model:                            OLS   Adj. R-squared:                  0.923
Method:                 Least Squares   F-statistic:                     1950.
Date:                Tue, 09 Sep 2025   Prob (F-statistic):           3.09e-92
Time:                        23:58:58   Log-Likelihood:                -1161.0
No. Observations:                 164   AIC:                             2326.
Df Residuals:                     162   BIC:                             2332.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -62

In [7]:
# Visualization

# Example 1: Interactive Line Plot of CO2 Emissions over Time for Multiple Countries
# Let's select a few countries to compare their CO2 emission trends
selected_countries = ['World', 'United States', 'China', 'India', 'Germany']

# Filter the data for the selected countries
co2_trends_selected = merged_df[merged_df['country'].isin(selected_countries)]

fig = px.line(co2_trends_selected, x='year', y='co2', color='country',
              title='CO2 Emissions Over Time for Selected Countries')
fig.show()

# Example 2: Interactive Bar Chart of CO2 Emissions by Country in a Recent Year
# We already have the co2_recent_year dataframe from EDA
# Let's sort by CO2 emissions to see the top emitters
co2_recent_year_sorted = co2_recent_year.sort_values('co2', ascending=False)

fig = px.bar(co2_recent_year_sorted.head(20), x='country', y='co2',
             title=f'Top 20 Countries by CO2 Emissions in {recent_year.year}')
fig.show()

# Example 3: Interactive Choropleth Map of CO2 Emissions by Country in a Recent Year
# We need iso_code for the map. Let's use the co2_recent_year dataframe and drop rows with missing iso_code
co2_recent_year_map = co2_recent_year.dropna(subset=['iso_code'])

fig = px.choropleth(co2_recent_year_map, locations="iso_code",
                    color="co2",
                    hover_name="country",
                    animation_frame="year", # If you have data for multiple recent years, you can use this
                    title=f'CO2 Emissions by Country in {recent_year.year}',
                    color_continuous_scale=px.colors.sequential.Plasma)
fig.show()

**Setting up a streamlit environment**






In [8]:
# Install Streamlit and other libraries
%pip install streamlit pandas numpy matplotlib plotly statsmodels requests

Collecting streamlit
  Downloading streamlit-1.49.1-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.49.1-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m132.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.49.1


**Structuring the streamlit app using Gemini**  


In [9]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import requests
from datetime import datetime
import statsmodels.api as sm


# Set up the Streamlit app title and description
st.title("Global Climate Explorer")
st.write("Explore global climate data, including CO2 emissions and temperature trends.")

# Data Sourcing & Acquisition and Cleaning & Preprocessing
@st.cache_data
def load_data():
    co2_url = 'https://raw.githubusercontent.com/owid/co2-data/master/owid-co2-data.csv'
    co2_df = pd.read_csv(co2_url)

    # Clean and preprocess data (from notebook cell 4SORNwZEvYzH)
    co2_df['population'] = co2_df.groupby('country')['population'].ffill()
    co2_df['gdp'] = co2_df.groupby('country')['gdp'].ffill()

    co2_emission_cols = [col for col in co2_df.columns if 'co2' in col or 'ghg' in col]
    for col in co2_emission_cols:
        co2_df[col] = co2_df[col].fillna(0)

    numerical_cols = co2_df.select_dtypes(include=np.number).columns.tolist()
    cols_to_ffill = [col for col in numerical_cols if col not in ['year', 'population', 'gdp'] + co2_emission_cols]

    for col in cols_to_ffill:
        co2_df[col] = co2_df.groupby('country')[col].ffill()
        co2_df[col] = co2_df.groupby('country')[col].bfill()

    co2_df['year'] = pd.to_datetime(co2_df['year'], format='%Y')

    # Data Integration (from notebook cell c5c44f91 - using dummy data for now)
    # Replace with actual temperature data loading and merging when available
    data = {'country': ['Afghanistan', 'Afghanistan', 'Albania', 'Albania'],
            'year': [datetime(1750, 1, 1), datetime(1751, 1, 1), datetime(1750, 1, 1), datetime(1751, 1, 1)],
            'average_temperature': [15.0, 15.2, 12.0, 12.5]}
    temperature_df = pd.DataFrame(data)
    merged_df = pd.merge(co2_df, temperature_df, on=['country', 'year'], how='outer')


    return merged_df

merged_df = load_data()


# Add interactive widgets to the sidebar
st.sidebar.header("Settings")

# Get a list of unique countries for the selectbox, sorting them and adding 'World' at the beginning
countries = sorted(merged_df['country'].unique().tolist())
countries.insert(0, 'World')
selected_country = st.sidebar.selectbox("Select a Country", countries)

# Get the minimum and maximum year from the dataset
min_year = merged_df['year'].min().year
max_year = merged_df['year'].max().year

# Create a slider for selecting the year range
selected_year_range = st.sidebar.slider("Select Year Range", min_year, max_year, (min_year, max_year))

# Filter data based on user input
filtered_df = merged_df[
    (merged_df['country'] == selected_country) &
    (merged_df['year'].dt.year >= selected_year_range[0]) &
    (merged_df['year'].dt.year <= selected_year_range[1])
]


st.header("Climate Data Analysis")

# Add storytelling for country-specific analysis
st.markdown(f"""
**Exploring Climate Trends for {selected_country}**

Use the widgets in the sidebar to select a country and a year range to see the climate data and analysis for that specific region.
""")

# Generate dynamic visualizations
if not filtered_df.empty:
    st.subheader(f"Analysis for {selected_country}")

    # Example: Line plot of CO2 emissions over time for the selected country
    st.markdown("""
    **CO2 Emissions Over Time**

    This line chart shows the trend of CO2 emissions for the selected country over the chosen year range. Observe how emissions have changed historically.
    """)
    fig_co2_time = px.line(filtered_df, x='year', y='co2',
                           title=f'CO2 Emissions Over Time for {selected_country}')
    st.plotly_chart(fig_co2_time, use_container_width=True)

    # Example: Scatter plot of CO2 vs GDP for the selected country
    # Filter out rows with NaN in 'co2' or 'gdp' for the scatter plot
    filtered_df_gdp = filtered_df.dropna(subset=['co2', 'gdp'])
    if not filtered_df_gdp.empty:
        st.markdown("""
        **CO2 Emissions vs. GDP**

        This scatter plot illustrates the relationship between CO2 emissions and Gross Domestic Product (GDP) for the selected country. Higher GDP is often associated with higher energy consumption and emissions, but this relationship can vary.
        """)
        fig_co2_gdp = px.scatter(filtered_df_gdp, x='gdp', y='co2',
                                 title=f'CO2 Emissions vs. GDP for {selected_country}')
        st.plotly_chart(fig_co2_gdp, use_container_width=True)

        # Advanced Analytics: Correlation and Regression Analysis
        st.subheader("Advanced Analytics")

        st.markdown("""
        **Correlation and Regression Analysis (CO2 vs. GDP)**

        Below are the results of statistical analyses examining the relationship between CO2 emissions and GDP for the selected country and year range. The correlation matrix shows the strength and direction of the linear relationship, while the regression analysis models how CO2 emissions change with GDP.
        """)

        # Correlation Analysis
        st.write("Correlation Matrix (CO2 vs. GDP):")
        correlation = filtered_df_gdp[['co2', 'gdp']].corr()
        st.write(correlation)

        # Regression Analysis
        st.write("Regression Analysis (CO2 vs. GDP):")
        X = filtered_df_gdp['gdp']
        y = filtered_df_gdp['co2']
        X = sm.add_constant(X)
        model = sm.OLS(y, X).fit()
        st.text(model.summary()) # Use st.text to display the summary

    else:
        st.write(f"No data available for CO2 vs. GDP for {selected_country} in the selected year range.")

    # Add more visualizations here based on available data and user selections
    # Example: Time Series Decomposition (only for 'World' as it was done in notebook)
    if selected_country == 'World':
        st.subheader("Time Series Decomposition (Global CO2 Emissions)")
        st.markdown("""
        **Time Series Decomposition of Global CO2 Emissions**

        This decomposition breaks down the global CO2 emissions time series into its underlying components: the overall trend, seasonality (if any), and the remaining residuals. This helps to understand the long-term patterns and any recurring cycles in global emissions.
        """)
        try:
            from statsmodels.tsa.seasonal import seasonal_decompose
            global_co2_ts = filtered_df.set_index('year')['co2']
            decomposition = seasonal_decompose(global_co2_ts.dropna(), model='additive', period=1)

            fig = make_subplots(rows=4, cols=1, subplot_titles=['Original', 'Trend', 'Seasonal', 'Residual'])
            fig.add_trace(go.Scatter(x=decomposition.seasonal.index, y=decomposition.observed, mode='lines', name='Original'), row=1, col=1)
            fig.add_trace(go.Scatter(x=decomposition.trend.index, y=decomposition.trend, mode='lines', name='Trend'), row=2, col=1)
            fig.add_trace(go.Scatter(x=decomposition.seasonal.index, y=decomposition.seasonal, mode='lines', name='Seasonal'), row=3, col=1)
            fig.add_trace(go.Scatter(x=decomposition.resid.index, y=decomposition.resid, mode='lines', name='Residual'), row=4, col=1)
            fig.update_layout(height=900, title='Time Series Decomposition of Global CO2 Emissions')
            st.plotly_chart(fig, use_container_width=True)
        except ImportError:
            st.write("Statsmodels not installed. Please install it to see time series decomposition (`%pip install statsmodels`).")


else:
    st.write("No data available for the selected country and year range.")

# Example: Global visualizations (these don't depend on the selected country/year range filter above)
st.header("Global Overview")

st.markdown("""
**Global Climate Insights**

Explore visualizations that provide a global perspective on CO2 emissions, highlighting the distribution across countries and the top emitters in a recent year.
""")

# Analyze the distribution of CO2 emissions across countries in a recent year (using the full merged_df)
recent_year_full = merged_df['year'].max()
co2_recent_year_full = merged_df[(merged_df['year'] == recent_year_full) & (merged_df['country'] != 'World')]

if not co2_recent_year_full.empty:
    st.subheader(f"Distribution of CO2 Emissions Across Countries in {recent_year_full.year}")
    st.markdown("""
    This histogram shows how CO2 emissions are distributed among different countries in the most recent year available.
    """)
    fig_hist_global = px.histogram(co2_recent_year_full, x='co2', title=f'Distribution of CO2 Emissions Across Countries in {recent_year_full.year}')
    st.plotly_chart(fig_hist_global, use_container_width=True)

    # Interactive Bar Chart of CO2 Emissions by Country in a Recent Year (Top 20)
    st.subheader(f"Top 20 Countries by CO2 Emissions in {recent_year_full.year}")
    st.markdown("""
    This bar chart highlights the top 20 countries with the highest CO2 emissions in the most recent year.
    """)
    co2_recent_year_sorted_full = co2_recent_year_full.sort_values('co2', ascending=False)
    fig_bar_top20 = px.bar(co2_recent_year_sorted_full.head(20), x='country', y='co2',
                          title=f'Top 20 Countries by CO2 Emissions in {recent_year_full.year}')
    st.plotly_chart(fig_bar_top20, use_container_width=True)

    # Interactive Choropleth Map of CO2 Emissions by Country in a Recent Year
    st.subheader(f"Global CO2 Emissions Map in {recent_year_full.year}")
    st.markdown("""
    This interactive map visualizes CO2 emissions by country in the most recent year, allowing for a geographical comparison of emission levels.
    """)
    co2_recent_year_map_full = co2_recent_year_full.dropna(subset=['iso_code'])
    if not co2_recent_year_map_full.empty:
        fig_map_global = px.choropleth(co2_recent_year_map_full, locations="iso_code",
                                      color="co2",
                                      hover_name="country",
                                      title=f'CO2 Emissions by Country in {recent_year_full.year}',
                                      color_continuous_scale=px.colors.sequential.Plasma)
        st.plotly_chart(fig_map_global, use_container_width=True)
    else:
        st.write("No data available with ISO codes for mapping in the most recent year.")

else:
    st.write(f"No data available for the most recent year ({recent_year_full.year}) for global visualizations.")

Writing app.py


**Running and testing the streamlit app locally**

In [10]:
!streamlit run app.py &>/dev/null&

In [11]:
!pip install ngrok

Collecting ngrok
  Downloading ngrok-1.5.1-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading ngrok-1.5.1-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ngrok
Successfully installed ngrok-1.5.1


In [12]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.3.0-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.3.0


In [13]:
from google.colab import userdata
userdata.get('Ngroktoken')

SecretNotFoundError: Secret Ngroktoken does not exist.

In [None]:
import subprocess
import time
from google.colab import userdata
import os
from pyngrok import ngrok # Import ngrok from pyngrok

# Get ngrok authtoken from Colab Secrets
# Add NGROK_AUTHTOKEN to environment variables for pyngrok to pick it up
# Make sure you have a secret named 'NGROK_AUTHTOKEN' in Colab Secrets
NGROK_AUTHTOKEN = userdata.get('Ngroktoken')
os.environ['Ngroktoken'] = NGROK_AUTHTOKEN


if not NGROK_AUTHTOKEN:
    print("ngrok authtoken not found in Colab Secrets. Please add it.")
else:
    # Set the authtoken for pyngrok
    ngrok.set_auth_token(NGROK_AUTHTOKEN)

    # Start Streamlit in the background
    streamlit_process = subprocess.Popen(['streamlit', 'run', 'app.py'])

    # Give Streamlit a moment to start
    time.sleep(5)

    # Use pyngrok to connect
    try:
        print("Attempting to connect ngrok using pyngrok...")
        # Connect to the Streamlit port (8501) using pyngrok
        public_url = ngrok.connect(8501).public_url
        print(f"Your Streamlit app is available at: {public_url}")
    except Exception as e:
        print(f"Error starting ngrok tunnel using pyngrok: {e}")
        streamlit_process.terminate() # Terminate streamlit if ngrok fails

    # Keep the cell running to keep Streamlit and ngrok alive
    try:
        streamlit_process.wait()
    except KeyboardInterrupt:
        print("Stopping Streamlit and ngrok...")
        ngrok.kill() # Kill ngrok process started by pyngrok
        streamlit_process.terminate()