# In [None]:
"""
Gross Domestic Product (GDP) Analysis - Kenya's Economic Performance
======================================================================

Comprehensive analysis of Kenya's Gross Domestic Product (GDP) including:
- Overall GDP growth trends and cycles
- Sectoral contribution to GDP
- Identifying key growth-driving sectors
- GDP per capita and economic welfare analysis
- Forecasting future GDP growth
"""

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Advanced analytics libraries
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.api import Holt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

class GDPAnalyzer:
    """Advanced GDP Analysis and Forecasting Engine.

    Typical workflow: ``load_gdp_data()`` -> ``clean_gdp_data()`` -> any of
    ``analyze_gdp_growth()``, ``analyze_sectoral_contribution()``,
    ``forecast_gdp()``, ``create_gdp_visualizations()`` and
    ``generate_gdp_insights()``.

    The "main" GDP series is taken to be the first numeric column of
    ``gdp_data``; every numeric column of ``sectoral_data`` other than
    ``Total_GDP`` is treated as a sector.
    """

    # Name of the column holding an explicit GDP total, when one is present.
    _TOTAL_COLUMN = 'Total_GDP'
    # A sector whose latest share of GDP (in percent) exceeds this is "key".
    _KEY_SECTOR_THRESHOLD = 10
    # Number of sectors reported as fastest growing.
    _TOP_GROWTH_COUNT = 5

    def __init__(self):
        # Annual GDP table; None until load_gdp_data() succeeds.
        self.gdp_data = None
        # Sector-level table; currently a copy of gdp_data made at load time.
        self.sectoral_data = None

    def load_gdp_data(self, data_path: str = "data/raw/") -> None:
        """Load all GDP-related datasets.

        Reads ``"Annual GDP.csv"`` (skipping its first two rows) from
        ``data_path`` and stores it in ``gdp_data``; ``sectoral_data`` is set
        to a copy of it. Loading is best effort: a failure is printed, not
        raised, and leaves ``gdp_data`` untouched.

        Args:
            data_path: Path prefix of the data directory. It is concatenated
                directly with the file name, so it must end with a path
                separator.
        """
        try:
            self.gdp_data = pd.read_csv(f"{data_path}Annual GDP.csv", skiprows=2)
            print("✅ Loaded Annual GDP Data")
        except Exception as e:
            # Deliberate best effort: report the problem and carry on.
            print(f"❌ Could not load Annual GDP Data: {e}")

        # Assuming sectoral data is within the same file or another specific file
        # For this example, we'll assume columns in 'Annual GDP.csv' represent sectors
        if self.gdp_data is not None:
            self.sectoral_data = self.gdp_data.copy()
            print("✅ Loaded Sectoral GDP Data")

    @staticmethod
    def _clean_frame(frame):
        """Return a cleaned copy of ``frame``.

        Steps: coerce text columns to numbers (thousands separators removed,
        unparseable cells become NaN), forward-fill gaps, drop columns that
        are entirely empty, and - when a column whose name contains ``Year``
        or ``Period`` exists - turn the first such column into a
        ``DatetimeIndex`` of years. Rows whose year cannot be determined are
        dropped because they cannot be placed on the time axis.
        """
        cleaned = frame.copy()

        for col in cleaned.columns:
            column = cleaned[col]
            # Cover both the classic object dtype and pandas' dedicated
            # string dtype so text columns are coerced on any pandas version.
            if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
                # astype(str) keeps numbers that sit in a mixed-type column;
                # the .str accessor alone would turn them into NaN.
                as_text = column.astype(str).str.replace(',', '', regex=False)
                cleaned[col] = pd.to_numeric(as_text, errors='coerce')

        # Forward fill is common for economic series. `.ffill()` replaces the
        # deprecated `fillna(method='ffill')` spelling.
        cleaned = cleaned.ffill().dropna(how='all', axis=1)

        time_columns = [
            col for col in cleaned.columns
            if isinstance(col, str) and ('Year' in col or 'Period' in col)
        ]
        if time_columns:
            time_col = time_columns[0]
            years = pd.to_numeric(cleaned[time_col], errors='coerce')
            has_year = years.notna()
            cleaned = cleaned.loc[has_year].drop(columns=time_col)
            # Go through int so a float year (2020.0) still matches '%Y'.
            year_labels = years[has_year].astype(int).astype(str)
            cleaned.index = pd.DatetimeIndex(
                pd.to_datetime(year_labels.to_numpy(), format='%Y'),
                name=time_col,
            )

        return cleaned

    def clean_gdp_data(self) -> "GDPAnalyzer":
        """Clean and standardize GDP data.

        Applies the same cleaning to ``gdp_data`` and ``sectoral_data``
        independently, so either may be missing without affecting the other.

        Returns:
            ``self``, to allow call chaining.
        """
        if self.gdp_data is not None:
            self.gdp_data = self._clean_frame(self.gdp_data)

        if self.sectoral_data is not None:
            self.sectoral_data = self._clean_frame(self.sectoral_data)

        return self

    def _primary_gdp_series(self):
        """Return the first numeric column of ``gdp_data``, or None if absent."""
        if self.gdp_data is None:
            return None
        numeric_columns = self.gdp_data.select_dtypes(include=np.number).columns
        if len(numeric_columns) == 0:
            return None
        return self.gdp_data[numeric_columns[0]]

    @staticmethod
    def _label_year(label):
        """Return ``label.year`` for timestamp-like labels, else ``label`` itself."""
        return getattr(label, 'year', label)

    @staticmethod
    def _time_axis(index):
        """Return a numeric time axis: calendar years for a DatetimeIndex,
        otherwise row positions 0..n-1."""
        if isinstance(index, pd.DatetimeIndex):
            return index.year.to_numpy()
        return np.arange(len(index))

    def _sector_shares(self):
        """Split ``sectoral_data`` into sector levels and percentage shares.

        Returns:
            ``(sectors, shares)`` - the numeric sector columns (without
            ``Total_GDP``) and each sector's share of the total in percent -
            or None when there is no usable sector data. The total is the
            ``Total_GDP`` column when present, otherwise the row-wise sum of
            the sector columns.
        """
        if self.sectoral_data is None:
            return None

        numeric = self.sectoral_data.select_dtypes(include=np.number)
        if self._TOTAL_COLUMN in numeric.columns:
            total = numeric[self._TOTAL_COLUMN]
            sectors = numeric.drop(columns=self._TOTAL_COLUMN)
        else:
            sectors = numeric
            total = numeric.sum(axis=1)

        if sectors.empty:
            return None

        # A zero total would yield +/-inf shares; report them as missing.
        shares = sectors.div(total.replace(0, np.nan), axis=0) * 100
        return sectors, shares

    def analyze_gdp_growth(self):
        """Analyze overall GDP growth trends.

        Returns:
            None (after printing a notice) when no GDP series is available,
            otherwise a dict with:

            * ``'annual_growth_rate'`` - period-over-period change in percent.
            * ``'metrics'`` - latest level and growth, average / last-5 growth,
              growth volatility (standard deviation) and the best / worst
              growth years (None when no growth rate can be computed).
            * ``'trend'`` - slope, intercept and R^2 of a linear fit of GDP
              against the time axis (NaN when fewer than two observations
              exist).
        """
        gdp_series = self._primary_gdp_series()
        if gdp_series is None or gdp_series.empty:
            print("GDP data not available")
            return None

        gdp_analysis = {}

        # Calculate growth rates
        growth = gdp_series.pct_change() * 100
        gdp_analysis['annual_growth_rate'] = growth

        # idxmax/idxmin are undefined on an all-NaN series (e.g. one row).
        observed_growth = growth.dropna()
        if observed_growth.empty:
            best_year = worst_year = None
        else:
            best_year = self._label_year(observed_growth.idxmax())
            worst_year = self._label_year(observed_growth.idxmin())

        # Key metrics
        gdp_analysis['metrics'] = {
            'latest_gdp': gdp_series.iloc[-1],
            'latest_growth_rate': growth.iloc[-1],
            'average_growth_rate': growth.mean(),
            '5yr_avg_growth': growth.tail(5).mean(),
            'gdp_volatility': growth.std(),
            'best_growth_year': best_year,
            'worst_growth_year': worst_year
        }

        # Trend analysis using linear regression, on observed points only
        # (the estimator rejects NaN targets).
        observed = gdp_series.notna().to_numpy()
        X = self._time_axis(gdp_series.index)[observed].reshape(-1, 1)
        y = gdp_series.to_numpy()[observed]
        if len(y) >= 2:
            model = LinearRegression()
            model.fit(X, y)
            gdp_analysis['trend'] = {
                'slope': model.coef_[0],
                'intercept': model.intercept_,
                'r_squared': model.score(X, y)
            }
        else:
            gdp_analysis['trend'] = {
                'slope': np.nan,
                'intercept': np.nan,
                'r_squared': np.nan
            }

        return gdp_analysis

    def analyze_sectoral_contribution(self):
        """Analyze the contribution of different sectors to GDP.

        ``Total_GDP`` is used only as the denominator; it is never reported
        as a sector itself.

        Returns:
            None (after printing a notice) when no sector data is available,
            otherwise a dict with:

            * ``'latest_contribution'`` - last row's shares (%), descending.
            * ``'average_contribution'`` - mean share (%) per sector, descending.
            * ``'key_sectors'`` - sectors whose latest share exceeds 10%.
            * ``'sectoral_growth_rates'`` - mean period growth (%), descending.
            * ``'fastest_growing_sectors'`` - up to five fastest-growing sectors.
        """
        breakdown = self._sector_shares()
        if breakdown is None:
            print("Sectoral data not available")
            return None
        sectors, shares = breakdown

        sectoral_analysis = {}

        # Latest contribution
        latest_contribution = shares.iloc[-1].sort_values(ascending=False)
        sectoral_analysis['latest_contribution'] = latest_contribution

        # Average contribution over time
        sectoral_analysis['average_contribution'] = shares.mean().sort_values(ascending=False)

        # Identify key sectors (e.g., >10% contribution)
        is_key = latest_contribution > self._KEY_SECTOR_THRESHOLD
        sectoral_analysis['key_sectors'] = latest_contribution[is_key].index.tolist()

        # Sectoral growth analysis
        sectoral_growth = sectors.pct_change().mean() * 100  # Average annual growth
        sectoral_analysis['sectoral_growth_rates'] = sectoral_growth.sort_values(ascending=False)

        # Identify fastest growing sectors
        sectoral_analysis['fastest_growing_sectors'] = (
            sectoral_growth.nlargest(self._TOP_GROWTH_COUNT).index.tolist()
        )

        return sectoral_analysis

    def forecast_gdp(self, periods: int = 5):
        """Forecast future GDP using time series models.

        Two models are fitted to the observed (non-NaN) GDP series: Holt's
        linear trend and a degree-2 polynomial in the observation number.

        Args:
            periods: Number of future periods to forecast.

        Returns:
            None when there are fewer than two observations, otherwise a
            DataFrame with ``'Holt_Forecast'`` and ``'Polynomial_Forecast'``
            columns. Its index holds the years following the last observation
            when the data has a ``DatetimeIndex``, and continued row
            positions otherwise.
        """
        gdp_series = self._primary_gdp_series()
        if gdp_series is None:
            return None

        # Neither model accepts missing values.
        gdp_series = gdp_series.dropna()
        n_obs = len(gdp_series)
        if n_obs < 2:
            return None

        # Holt's Linear Trend Model
        fit_holt = Holt(gdp_series, initialization_method="estimated").fit()
        forecast_holt = fit_holt.forecast(periods)

        # Polynomial Trend Model
        X = np.arange(n_obs).reshape(-1, 1)
        y = gdp_series.values
        poly_model = make_pipeline(PolynomialFeatures(2), LinearRegression())
        poly_model.fit(X, y)

        future_X = np.arange(n_obs, n_obs + periods).reshape(-1, 1)
        forecast_poly = poly_model.predict(future_X)

        # Create forecast DataFrame
        if isinstance(gdp_series.index, pd.DatetimeIndex):
            last_year = gdp_series.index.max().year
            forecast_index = pd.to_datetime(
                [str(last_year + step) for step in range(1, periods + 1)],
                format='%Y',
            )
        else:
            forecast_index = pd.RangeIndex(n_obs, n_obs + periods)

        forecast_df = pd.DataFrame({
            'Holt_Forecast': np.asarray(forecast_holt),
            'Polynomial_Forecast': forecast_poly
        }, index=forecast_index)

        return forecast_df

    def create_gdp_visualizations(self):
        """Create comprehensive GDP analysis visualizations.

        Returns:
            Dict of Plotly figures. Each entry is present only when its
            underlying data is available: ``'gdp_growth_trend'``,
            ``'sectoral_pie_chart'``, ``'sectoral_area_chart'`` and
            ``'gdp_forecast'``.
        """
        visualizations = {}
        forecast_horizon = 5

        gdp_series = self._primary_gdp_series()

        # 1. GDP Growth Trend
        if gdp_series is not None:
            growth_rate = gdp_series.pct_change() * 100

            fig_gdp = make_subplots(specs=[[{"secondary_y": True}]])

            # Add GDP series
            fig_gdp.add_trace(
                go.Scatter(x=gdp_series.index, y=gdp_series, name='Nominal GDP',
                           line=dict(color='royalblue', width=3)),
                secondary_y=False,
            )

            # Add growth rate series
            fig_gdp.add_trace(
                go.Bar(x=growth_rate.index, y=growth_rate, name='Annual Growth Rate (%)',
                       marker_color='lightcoral'),
                secondary_y=True,
            )

            fig_gdp.update_layout(
                title_text='Kenya Annual GDP and Growth Rate',
                template='plotly_white',
                height=600
            )
            fig_gdp.update_yaxes(title_text="Nominal GDP (KES Billions)", secondary_y=False)
            fig_gdp.update_yaxes(title_text="Annual Growth Rate (%)", secondary_y=True)

            visualizations['gdp_growth_trend'] = fig_gdp

        # 2. Sectoral Contribution to GDP (Pie Chart)
        sectoral_analysis = self.analyze_sectoral_contribution()
        if sectoral_analysis:
            latest_contribution = sectoral_analysis['latest_contribution']
            # The Series name is the index label of the last data row.
            latest_period = self._label_year(latest_contribution.name)

            fig_pie = px.pie(
                values=latest_contribution.values,
                names=latest_contribution.index,
                title=f'Sectoral Contribution to GDP ({latest_period})',
                hole=0.3
            )
            fig_pie.update_traces(textposition='inside', textinfo='percent+label')
            visualizations['sectoral_pie_chart'] = fig_pie

        # 3. Sectoral Contribution Over Time (Area Chart)
        breakdown = self._sector_shares()
        if breakdown is not None:
            contribution_df = breakdown[1]

            fig_area = px.area(
                contribution_df,
                x=contribution_df.index,
                y=contribution_df.columns,
                title='Evolution of Sectoral Contribution to GDP',
                labels={'value': 'Contribution (%)', 'variable': 'Sector'}
            )
            visualizations['sectoral_area_chart'] = fig_area

        # 4. GDP Forecast
        forecast_df = self.forecast_gdp(periods=forecast_horizon)
        if forecast_df is not None:
            fig_forecast = go.Figure()

            # Historical data
            fig_forecast.add_trace(go.Scatter(
                x=gdp_series.index, y=gdp_series,
                mode='lines', name='Historical GDP'
            ))

            # Forecast data
            fig_forecast.add_trace(go.Scatter(
                x=forecast_df.index, y=forecast_df['Holt_Forecast'],
                mode='lines+markers', name="Holt's Forecast", line=dict(dash='dash')
            ))
            fig_forecast.add_trace(go.Scatter(
                x=forecast_df.index, y=forecast_df['Polynomial_Forecast'],
                mode='lines+markers', name="Polynomial Trend Forecast", line=dict(dash='dot')
            ))

            fig_forecast.update_layout(
                title=f'GDP Forecast (Next {forecast_horizon} Years)',
                xaxis_title='Year',
                yaxis_title='Nominal GDP (KES Billions)',
                template='plotly_white'
            )
            visualizations['gdp_forecast'] = fig_forecast

        return visualizations

    def generate_gdp_insights(self) -> list:
        """Generate key insights from GDP analysis.

        Returns:
            List of human-readable sentences. Statements whose inputs are
            unavailable or undefined (NaN) are omitted instead of being
            reported with misleading wording.
        """
        insights = []

        # Growth insights
        gdp_analysis = self.analyze_gdp_growth()
        if gdp_analysis:
            metrics = gdp_analysis['metrics']
            avg_growth = metrics['average_growth_rate']
            latest_growth = metrics['latest_growth_rate']

            # Any comparison with NaN is False, which would otherwise be
            # reported as a slowdown.
            if pd.notna(latest_growth) and pd.notna(avg_growth):
                if latest_growth > avg_growth:
                    insights.append(f"Economic momentum is strong: latest growth ({latest_growth:.1f}%) is above the historical average ({avg_growth:.1f}%).")
                else:
                    insights.append(f"Economic growth has slowed: latest growth ({latest_growth:.1f}%) is below the historical average ({avg_growth:.1f}%).")

            slope = gdp_analysis['trend']['slope']
            if pd.notna(slope):
                if slope > 0:
                    # NOTE(review): the slope is divided by 1e9 here while the
                    # charts label the same series "KES Billions" - confirm
                    # the unit of the source data.
                    insights.append(f"The long-term GDP trend is positive, with an average increase of {slope / 1e9:.2f} B KES per year.")
                else:
                    insights.append("Warning: The long-term GDP trend is negative, indicating structural economic challenges.")

        # Sectoral insights
        sectoral_analysis = self.analyze_sectoral_contribution()
        if sectoral_analysis:
            key_sectors = sectoral_analysis['key_sectors']
            fastest_growing = set(sectoral_analysis['fastest_growing_sectors'])

            if key_sectors:
                insights.append(f"The economy is primarily driven by the following sectors: {', '.join(map(str, key_sectors))}.")

            # Check for overlap between key and fast-growing sectors; iterate
            # key_sectors (not a set) so the wording is deterministic.
            growth_drivers = [sector for sector in key_sectors if sector in fastest_growing]
            if growth_drivers:
                insights.append(f"Key growth engines include: {', '.join(map(str, growth_drivers))}, which are both large and fast-growing.")
            else:
                insights.append("Potential structural shift: The fastest-growing sectors are not yet the largest contributors to GDP.")

        return insights

# --- Notebook Execution ---
# End-to-end driver: load -> clean -> analyze -> forecast -> insights -> plots.
# Result names are kept at module level so later cells can keep using them.
if __name__ == '__main__':
    # Banner
    print("📈 Gross Domestic Product (GDP) Analysis Notebook")
    print("=" * 50)
    print("Analyzing Kenya's economic performance through its GDP data.")
    print()

    # Stage 1: build the analyzer and prepare its data
    analyzer = GDPAnalyzer()

    print("📊 Loading and cleaning GDP data...")
    analyzer.load_gdp_data()
    analyzer.clean_gdp_data()

    # Stage 2: headline growth figures
    print("\n🚀 Analyzing GDP growth trends...")
    gdp_growth_analysis = analyzer.analyze_gdp_growth()
    if gdp_growth_analysis:
        _growth_metrics = gdp_growth_analysis['metrics']
        print(f"  - Latest Annual Growth: {_growth_metrics['latest_growth_rate']:.2f}%")
        print(f"  - Average Annual Growth: {_growth_metrics['average_growth_rate']:.2f}%")

    # Stage 3: which sectors are largest and which grow fastest
    print("\n🏭 Analyzing sectoral contributions...")
    sectoral_analysis = analyzer.analyze_sectoral_contribution()
    if sectoral_analysis:
        _largest = sectoral_analysis['latest_contribution'].head(3).index
        _quickest = sectoral_analysis['fastest_growing_sectors'][:3]
        print(f"  - Top 3 Sectors (Latest): {', '.join(_largest)}")
        print(f"  - Top 3 Fastest Growing: {', '.join(_quickest)}")

    # Stage 4: forecast with the analyzer's default horizon
    print("\n🔮 Forecasting future GDP...")
    forecast = analyzer.forecast_gdp()
    if forecast is not None:
        print("  - 5-Year GDP forecast generated.")
        print(forecast.head())

    # Stage 5: narrative insights, printed as a numbered list
    print("\n💡 Generating key insights...")
    insights = analyzer.generate_gdp_insights()
    for _position, _sentence in enumerate(insights, start=1):
        print(f"  {_position}. {_sentence}")

    # Stage 6: build the figures (they are not displayed here)
    print("\n🎨 Creating visualizations...")
    visualizations = analyzer.create_gdp_visualizations()
    _figure_count = len(visualizations)
    print(f"  - Generated {_figure_count} visualizations.")

    # In a real notebook environment, call .show() on each value of
    # `visualizations` to display the figures.

    print("\n✅ GDP Analysis Complete!")
    print("Key outputs available: growth trends, sectoral analysis, forecasts, and visualizations.")