In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.cm as cm
import os
#step1 data loading
os.makedirs('visualizations', exist_ok=True)

print("Step 1: Loading and preprocessing the data")

renewables_df = pd.read_csv('../datasets/clean_countries_renewables_1985_2021.csv')
gdp_df = pd.read_csv('../datasets/gdp_data_news.csv')

renewable_countries = renewables_df[['Entity', 'Code']].drop_duplicates()
gdp_countries = gdp_df[['Country Name', 'Country Code']].drop_duplicates()

matched_df = pd.merge(
    renewable_countries,
    gdp_countries,
    left_on='Code',
    right_on='Country Code',
    how='inner'
)

matching_codes = matched_df['Code'].tolist()
print(f"Found {len(matching_codes)} countries that match between datasets")

filtered_renewables = renewables_df[renewables_df['Code'].isin(matching_codes)]

pivot_df = filtered_renewables.pivot(
    index='Code',
    columns='Year',
    values='Renewables (% electricity)'
)

# STEP 2: Process GDP Data Comprehensively
print("\nStep 2: Processing GDP Data")

gdp_years = [str(year) for year in range(1985, 2022)]
filtered_gdp = gdp_df[gdp_df['Country Code'].isin(matching_codes)]
filtered_gdp = filtered_gdp[['Country Code', 'Country Name'] + gdp_years]

gdp_pivot = filtered_gdp.set_index('Country Code')
gdp_pivot = gdp_pivot.drop('Country Name', axis=1)

gdp_metrics = pd.DataFrame(index=gdp_pivot.index)

early_years = gdp_years[:5]
gdp_metrics['Early_GDP'] = gdp_pivot[early_years].mean(axis=1)

recent_years = gdp_years[-5:]
gdp_metrics['Recent_GDP'] = gdp_pivot[recent_years].mean(axis=1)

gdp_metrics['GDP_Growth'] = (gdp_metrics['Recent_GDP'] / gdp_metrics['Early_GDP']) ** (1/35) - 1

gdp_metrics['GDP_Volatility'] = gdp_pivot[gdp_years].std(axis=1) / gdp_pivot[gdp_years].mean(axis=1)
sample_countries = gdp_pivot.index[:10]
plt.figure(figsize=(14, 8))

for country in sample_countries:
    plt.plot(
        [int(year) for year in gdp_years],
        gdp_pivot.loc[country, gdp_years],
        marker='.',
        linewidth=2,
        label=country
    )

plt.title('GDP Trajectories Over Time (Sample Countries)')
plt.xlabel('Year')
plt.ylabel('GDP')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig('visualizations/2a_gdp_trajectories.png')
plt.close()

plt.figure(figsize=(12, 8))

renewable_growth = (pivot_df[2021] - pivot_df[1985]) / 36

plt.scatter(
    gdp_metrics['GDP_Growth'] * 100,
    renewable_growth,
    alpha=0.7,
    s=80
)

for code in gdp_metrics.index:
    if code in renewable_growth.index:
        plt.annotate(
            code,
            (gdp_metrics.loc[code, 'GDP_Growth'] * 100, renewable_growth.loc[code]),
            xytext=(5, 5),
            textcoords='offset points'
        )

plt.title('GDP Growth Rate vs Renewable Energy Growth (1985-2021)')
plt.xlabel('GDP CAGR (%)')
plt.ylabel('Annual Increase in Renewables (% electricity)')
plt.grid(True, alpha=0.3)
plt.savefig('visualizations/2b_gdp_vs_renewable_growth.png')
plt.close()

pivot_df_filled = pivot_df.fillna(pivot_df.mean())

# STEP 3: Data Preparation for Clustering
print("\nStep 3: Data Preparation for Clustering")

year_columns = [col for col in pivot_df_filled.columns if isinstance(col, (int, float))]
features_for_clustering = pivot_df_filled[year_columns]

scaler = StandardScaler()
scaled_features = scaler.fit_transform(features_for_clustering)

scaled_df = pd.DataFrame(
    scaled_features,
    index=features_for_clustering.index,
    columns=features_for_clustering.columns
)

plt.figure(figsize=(15, 10))
sns.heatmap(
    scaled_df.iloc[:15],
    cmap='viridis',
    yticklabels=True
)
plt.title('Standardized Renewable Energy Features (Sample Countries)')
plt.xlabel('Year')
plt.ylabel('Country Code')
plt.savefig('visualizations/3_standardized_features.png')
plt.close()

# STEP 4: PCA Analysis
print("\nStep 4: PCA Analysis")

# Apply PCA to reduce dimensions
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_features)

pca_df = pd.DataFrame(
    data=pca_result,
    columns=['PC1', 'PC2'],
    index=features_for_clustering.index
)

pca_df = pca_df.join(gdp_metrics)

explained_variance = pca.explained_variance_ratio_
print(f"Explained variance: PC1={explained_variance[0]:.2%}, PC2={explained_variance[1]:.2%}")
print(f"Total explained variance: {sum(explained_variance):.2%}")

plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
plt.bar(range(len(year_columns)), pca.components_[0])
plt.xticks(range(len(year_columns)), year_columns, rotation=90)
plt.title('PCA Component 1 (Year Contributions)')
plt.xlabel('Year')
plt.ylabel('Contribution')
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
plt.bar(range(len(year_columns)), pca.components_[1])
plt.xticks(range(len(year_columns)), year_columns, rotation=90)
plt.title('PCA Component 2 (Year Contributions)')
plt.xlabel('Year')
plt.ylabel('Contribution')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('visualizations/4_pca_components.png')
plt.close()

plt.figure(figsize=(12, 10))
scatter = plt.scatter(
    pca_df['PC1'],
    pca_df['PC2'],
    c=pca_df['Recent_GDP'],
    cmap='viridis',
    alpha=0.7,
    s=100
)

for idx, row in pca_df.iterrows():
    plt.annotate(
        idx,
        (row['PC1'], row['PC2']),
        xytext=(5, 5),
        textcoords='offset points',
        fontsize=9
    )

plt.colorbar(scatter, label='Recent GDP (avg of last 5 years)')
plt.title('PCA of Renewable Energy Adoption Patterns (1985-2021)')
plt.xlabel(f'PC1 ({explained_variance[0]:.2%} variance explained)')
plt.ylabel(f'PC2 ({explained_variance[1]:.2%} variance explained)')
plt.grid(True, alpha=0.3)
plt.savefig('visualizations/5a_pca_scatter_recent_gdp.png')
plt.close()

plt.figure(figsize=(12, 10))
scatter = plt.scatter(
    pca_df['PC1'],
    pca_df['PC2'],
    c=pca_df['GDP_Growth'],
    cmap='viridis',
    alpha=0.7,
    s=100
)

for idx, row in pca_df.iterrows():
    plt.annotate(
        idx,
        (row['PC1'], row['PC2']),
        xytext=(5, 5),
        textcoords='offset points',
        fontsize=9
    )

plt.colorbar(scatter, label='GDP Growth Rate (CAGR 1985-2021)')
plt.title('PCA of Renewable Energy Adoption Patterns (1985-2021)')
plt.xlabel(f'PC1 ({explained_variance[0]:.2%} variance explained)')
plt.ylabel(f'PC2 ({explained_variance[1]:.2%} variance explained)')
plt.grid(True, alpha=0.3)
plt.savefig('visualizations/5b_pca_scatter_gdp_growth.png')
plt.close()

# STEP 5: Determine Optimal Number of Clusters
print("\nStep 5: Finding Optimal Number of Clusters")

wcss = []
silhouette_scores = []
max_clusters = 10

for i in range(2, max_clusters + 1):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=42)
    kmeans.fit(scaled_features)
    wcss.append(kmeans.inertia_)

    labels = kmeans.labels_
    silhouette_avg = silhouette_score(scaled_features, labels)
    silhouette_scores.append(silhouette_avg)

    print(f"  K={i}: WCSS={kmeans.inertia_:.2f}, Silhouette Score={silhouette_avg:.4f}")

plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
plt.plot(range(2, max_clusters + 1), wcss, marker='o', linestyle='-')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
plt.plot(range(2, max_clusters + 1), silhouette_scores, marker='o', linestyle='-')
plt.title('Silhouette Scores for Optimal k')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('visualizations/6_optimal_clusters.png')
plt.close()
best_k = np.argmax(silhouette_scores) + 2
print(f"Best number of clusters based on silhouette score: {best_k}")

# STEP 6: Perform K-means Clustering with the optimal k
print(f"\nStep 6: Performing K-means Clustering with k={best_k}")

# Apply k-means with the best k
kmeans = KMeans(n_clusters=best_k, init='k-means++', max_iter=300, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(scaled_features)

pca_df['Cluster'] = cluster_labels
features_with_clusters = features_for_clustering.copy()
features_with_clusters['Cluster'] = cluster_labels

features_with_clusters = features_with_clusters.join(gdp_metrics)

plt.figure(figsize=(14, 10))

colors = cm.nipy_spectral(np.linspace(0, 1, best_k))

for i in range(best_k):
    cluster_points = pca_df[pca_df['Cluster'] == i]

    plt.scatter(
        cluster_points['PC1'],
        cluster_points['PC2'],
        s=100,
        c=[colors[i]],
        label=f'Cluster {i+1}'
    )


    for idx, row in cluster_points.iterrows():
        plt.annotate(
            idx,
            (row['PC1'], row['PC2']),
            xytext=(5, 5),
            textcoords='offset points',
            fontsize=9
        )

plt.title(f'K-means Clustering (k={best_k}) in PCA Space')
plt.xlabel(f'PC1 ({explained_variance[0]:.2%} variance explained)')
plt.ylabel(f'PC2 ({explained_variance[1]:.2%} variance explained)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig(f'visualizations/7_clusters_k{best_k}_pca.png')
plt.close()

# STEP 7: Analyze Cluster Characteristics
print("\nStep 7: Analyzing Cluster Characteristics")

# Create a list to store cluster statistics
cluster_stats = []

for i in range(best_k):
    # Get countries in this cluster
    cluster_countries = features_with_clusters[features_with_clusters['Cluster'] == i].index.tolist()

    # Get data for this cluster
    cluster_data = features_with_clusters[features_with_clusters['Cluster'] == i]

    # Calculate statistics
    cluster_size = len(cluster_countries)
    early_gdp = cluster_data['Early_GDP'].mean()
    recent_gdp = cluster_data['Recent_GDP'].mean()
    gdp_growth = cluster_data['GDP_Growth'].mean()

    # Calculate adoption metrics across years
    year_columns = [col for col in cluster_data.columns if isinstance(col, (int, float))
                   and col not in ['Cluster', 'Early_GDP', 'Recent_GDP', 'GDP_Growth', 'GDP_Volatility']]

    early_years = sorted(year_columns)[:5]  # First 5 years
    recent_years = sorted(year_columns)[-5:]  # Last 5 years

    early_adoption = cluster_data[early_years].mean().mean()
    recent_adoption = cluster_data[recent_years].mean().mean()
    adoption_growth = recent_adoption - early_adoption

    # Calculate variability
    adoption_std = cluster_data[year_columns].std(axis=1).mean()

    # Store statistics
    cluster_stats.append({
        'Cluster': i+1,
        'Size': cluster_size,
        'Countries': ", ".join(cluster_countries),
        'Early_GDP': early_gdp,
        'Recent_GDP': recent_gdp,
        'GDP_Growth': gdp_growth * 100,  # Convert to percentage
        'Early_Adoption': early_adoption,
        'Recent_Adoption': recent_adoption,
        'Adoption_Growth': adoption_growth,
        'Variability': adoption_std
    })

# Create a DataFrame with cluster statistics
cluster_stats_df = pd.DataFrame(cluster_stats)
print("\nCluster Statistics:")
print(cluster_stats_df[['Cluster', 'Size', 'Early_GDP', 'Recent_GDP', 'GDP_Growth', 'Early_Adoption', 'Recent_Adoption', 'Adoption_Growth']])

# Visualization: Cluster characteristics
plt.figure(figsize=(20, 16))

# Create subplots for different metrics
plt.subplot(3, 3, 1)
plt.bar(cluster_stats_df['Cluster'], cluster_stats_df['Size'])
plt.title('Number of Countries per Cluster')
plt.xlabel('Cluster')
plt.ylabel('Number of Countries')
plt.grid(True, alpha=0.3)

plt.subplot(3, 3, 2)
plt.bar(cluster_stats_df['Cluster'], cluster_stats_df['Early_GDP'])
plt.title('Early GDP per Cluster (1985-1990 avg)')
plt.xlabel('Cluster')
plt.ylabel('Early GDP')
plt.grid(True, alpha=0.3)

plt.subplot(3, 3, 3)
plt.bar(cluster_stats_df['Cluster'], cluster_stats_df['Recent_GDP'])
plt.title('Recent GDP per Cluster (2017-2021 avg)')
plt.xlabel('Cluster')
plt.ylabel('Recent GDP')
plt.grid(True, alpha=0.3)

plt.subplot(3, 3, 4)
plt.bar(cluster_stats_df['Cluster'], cluster_stats_df['GDP_Growth'])
plt.title('GDP Growth Rate per Cluster (% CAGR)')
plt.xlabel('Cluster')
plt.ylabel('GDP Growth (%)')
plt.grid(True, alpha=0.3)

plt.subplot(3, 3, 5)
plt.bar(cluster_stats_df['Cluster'], cluster_stats_df['Early_Adoption'])
plt.title('Early Renewable Adoption per Cluster')
plt.xlabel('Cluster')
plt.ylabel('Early Renewable %')
plt.grid(True, alpha=0.3)

plt.subplot(3, 3, 6)
plt.bar(cluster_stats_df['Cluster'], cluster_stats_df['Recent_Adoption'])
plt.title('Recent Renewable Adoption per Cluster')
plt.xlabel('Cluster')
plt.ylabel('Recent Renewable %')
plt.grid(True, alpha=0.3)

plt.subplot(3, 3, 7)
plt.bar(cluster_stats_df['Cluster'], cluster_stats_df['Adoption_Growth'])
plt.title('Renewable Adoption Growth per Cluster')
plt.xlabel('Cluster')
plt.ylabel('Growth in Renewable %')
plt.grid(True, alpha=0.3)

plt.subplot(3, 3, 8)
plt.bar(cluster_stats_df['Cluster'], cluster_stats_df['Variability'])
plt.title('Variability in Renewable Adoption per Cluster')
plt.xlabel('Cluster')
plt.ylabel('Variability (Std Dev)')
plt.grid(True, alpha=0.3)

plt.subplot(3, 3, 9)
plt.scatter(cluster_stats_df['GDP_Growth'], cluster_stats_df['Adoption_Growth'])
for i, row in cluster_stats_df.iterrows():
    plt.annotate(
        f"Cluster {row['Cluster']}",
        (row['GDP_Growth'], row['Adoption_Growth']),
        xytext=(5, 5),
        textcoords='offset points'
    )
plt.title('GDP Growth vs Renewable Growth by Cluster')
plt.xlabel('GDP Growth Rate (%)')
plt.ylabel('Renewable Adoption Growth')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('visualizations/8_cluster_characteristics.png')
plt.close()

# STEP 8: Visualize Adoption Patterns Over Time for Each Cluster
print("\nStep 8: Visualizing Adoption Patterns Over Time for Each Cluster")

plt.figure(figsize=(16, 10))

# For each cluster, calculate the average renewable percentage for each year
for i in range(best_k):

    cluster_data = features_with_clusters[features_with_clusters['Cluster'] == i]


    year_cols = [col for col in cluster_data.columns if isinstance(col, (int, float))
                and col not in ['Cluster', 'Early_GDP', 'Recent_GDP', 'GDP_Growth', 'GDP_Volatility']]


    yearly_avg = cluster_data[year_cols].mean()

    plt.plot(
        yearly_avg.index.astype(int),
        yearly_avg.values,
        marker='o',
        linewidth=3,
        label=f'Cluster {i+1} (n={len(cluster_data)})'
    )

plt.title('Renewable Energy Adoption Patterns by Cluster (1985-2021)')
plt.xlabel('Year')
plt.ylabel('Renewables (% electricity)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig('visualizations/9_cluster_patterns_over_time.png')
plt.close()

# STEP 9: Visualize GDP Growth vs Renewable Growth for Each Cluster
print("\nStep 9: Visualizing GDP vs Renewable Growth by Cluster")

plt.figure(figsize=(14, 10))

for i in range(best_k):
    cluster_data = features_with_clusters[features_with_clusters['Cluster'] == i]

    renewable_growth = []
    for country in cluster_data.index:
        years = [col for col in cluster_data.columns if isinstance(col, (int, float))
                and col not in ['Cluster', 'Early_GDP', 'Recent_GDP', 'GDP_Growth', 'GDP_Volatility']]
        early = cluster_data.loc[country, years[:5]].mean()
        recent = cluster_data.loc[country, years[-5:]].mean()
        growth = recent - early
        renewable_growth.append(growth)

    plt.scatter(
        cluster_data['GDP_Growth'] * 100,
        renewable_growth,
        s=100,
        alpha=0.7,
        label=f'Cluster {i+1}'
    )

    for j, country in enumerate(cluster_data.index):
        plt.annotate(
            country,
            (cluster_data.loc[country, 'GDP_Growth'] * 100, renewable_growth[j]),
            xytext=(5, 5),
            textcoords='offset points',
            fontsize=9
        )

plt.title('GDP Growth vs Renewable Energy Growth by Cluster (1985-2021)')
plt.xlabel('GDP Growth Rate (% CAGR)')
plt.ylabel('Increase in Renewable Energy (percentage points)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig('visualizations/10_gdp_vs_renewable_growth_by_cluster.png')
plt.close()

# STEP 10: Visualize Individual Countries Grouped by Cluster
print("\nStep 10: Visualizing Individual Countries Grouped by Cluster")

for i in range(best_k):
    plt.figure(figsize=(16, 10))


    cluster_data = features_with_clusters[features_with_clusters['Cluster'] == i]


    year_cols = [col for col in cluster_data.columns if isinstance(col, (int, float))
                and col not in ['Cluster', 'Early_GDP', 'Recent_GDP', 'GDP_Growth', 'GDP_Volatility']]


    for country in cluster_data.index:
        plt.plot(
            [int(year) for year in year_cols],
            cluster_data.loc[country, year_cols],
            marker='.',
            alpha=0.7,
            linewidth=1,
            label=country
        )


    cluster_avg = cluster_data[year_cols].mean()
    plt.plot(
        [int(year) for year in year_cols],
        cluster_avg,
        marker='o',
        color='black',
        linewidth=4,
        label='Cluster Average'
    )

    plt.title(f'Countries in Cluster {i+1} - Renewable Energy Adoption (1985-2021)')
    plt.xlabel('Year')
    plt.ylabel('Renewables (% electricity)')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, alpha=0.3)
    plt.savefig(f'visualizations/11_cluster_{i+1}_countries.png')
    plt.close()

# STEP 11: Create a Comprehensive Dashboard for Each Cluster
print("\nStep 11: Creating Cluster Dashboards")

for i in range(best_k):
    print(f"Creating dashboard for Cluster {i+1}...")
    plt.figure(figsize=(20, 16))

    cluster_data = features_with_clusters[features_with_clusters['Cluster'] == i]
    cluster_countries = cluster_data.index.tolist()

    year_cols = [col for col in cluster_data.columns if isinstance(col, (int, float))
                and col not in ['Cluster', 'Early_GDP', 'Recent_GDP', 'GDP_Growth', 'GDP_Volatility']]

    plt.subplot(2, 2, 1)

    for country in cluster_countries:
        plt.plot(
            [int(year) for year in year_cols],
            cluster_data.loc[country, year_cols],
            marker='.',
            alpha=0.4,
            linewidth=1
        )
    cluster_avg = cluster_data[year_cols].mean()
    plt.plot(
        [int(year) for year in year_cols],
        cluster_avg,
        marker='o',
        color='red',
        linewidth=3,
        label='Cluster Average'
    )

    plt.title(f'Renewable Energy Adoption Trajectories - Cluster {i+1}')
    plt.xlabel('Year')
    plt.ylabel('Renewables (% electricity)')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # 2. GDP Trajectories
    plt.subplot(2, 2, 2)

    cluster_gdp = gdp_pivot.loc[cluster_countries]

    for country in cluster_countries:
        if country in gdp_pivot.index:
            plt.plot(
                [int(year) for year in gdp_years],
                gdp_pivot.loc[country, gdp_years],
                marker='.',
                alpha=0.4,
                linewidth=1
            )

    cluster_gdp_avg = cluster_gdp.mean()
    plt.plot(
        [int(year) for year in gdp_years],
        cluster_gdp_avg,
        marker='o',
        color='red',
        linewidth=3,
        label='Cluster Average'
    )

    plt.title(f'GDP Trajectories - Cluster {i+1}')
    plt.xlabel('Year')
    plt.ylabel('GDP')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.subplot(2, 2, 3)

    renewable_growth = []
    for country in cluster_countries:
        if country in cluster_data.index:
            early = cluster_data.loc[country, year_cols[:5]].mean()
            recent = cluster_data.loc[country, year_cols[-5:]].mean()
            growth = recent - early
            renewable_growth.append(growth)

    plt.scatter(
        cluster_data['GDP_Growth'] * 100,
        renewable_growth,
        s=80,
        alpha=0.7
    )

    for j, country in enumerate(cluster_countries):
        if j < len(renewable_growth):
            plt.annotate(
                country,
                (cluster_data.loc[country, 'GDP_Growth'] * 100, renewable_growth[j]),
                xytext=(5, 5),
                textcoords='offset points',
                fontsize=9
            )

    plt.title(f'GDP Growth vs Renewable Growth - Cluster {i+1}')
    plt.xlabel('GDP Growth Rate (% CAGR)')
    plt.ylabel('Increase in Renewable Energy (percentage points)')
    plt.grid(True, alpha=0.3)

    plt.subplot(2, 2, 4)

    cluster_stats_i = cluster_stats_df[cluster_stats_df['Cluster'] == i+1].iloc[0]

    stats_to_show = {k: v for k, v in cluster_stats_i.items() if k != 'Countries'}

    y_pos = np.arange(len(stats_to_show) - 1)
    values = list(stats_to_show.values())[1:]
    labels = list(stats_to_show.keys())[1:]

    plt.barh(y_pos, values)
    plt.yticks(y_pos, labels)
    plt.title(f'Key Statistics - Cluster {i+1}')
    plt.xlabel('Value')
    plt.tight_layout()

    plt.figtext(0.5, 0.01,
                f"Countries in Cluster {i+1}: {', '.join(cluster_countries)}",
                ha='center',
                fontsize=10,
                bbox={'facecolor':'white', 'alpha':0.8, 'pad':5})

    plt.tight_layout(rect=[0, 0.05, 1, 0.95])
    plt.savefig(f'visualizations/12_cluster_{i+1}_dashboard.png')
    plt.close()

print("\nStep 12: Creating Final Report")

cluster_profiles = pd.DataFrame(cluster_stats)

cluster_descriptions = []

for i in range(best_k):
    cluster_i = cluster_stats_df[cluster_stats_df['Cluster'] == i+1].iloc[0]

    early_adoption = cluster_i['Early_Adoption']
    recent_adoption = cluster_i['Recent_Adoption']
    adoption_growth = cluster_i['Adoption_Growth']
    gdp_growth = cluster_i['GDP_Growth']

    if early_adoption > 20:
        base_pattern = "Early High Adopters"
    elif early_adoption > 10:
        base_pattern = "Early Moderate Adopters"
    elif early_adoption < 5:
        base_pattern = "Late Starters"
    else:
        base_pattern = "Average Starters"

    if adoption_growth > 20:
        growth_pattern = "with Dramatic Growth"
    elif adoption_growth > 10:
        growth_pattern = "with Strong Growth"
    elif adoption_growth < 2:
        growth_pattern = "with Limited Growth"
    elif adoption_growth < 0:
        growth_pattern = "with Declining Renewables"
    else:
        growth_pattern = "with Moderate Growth"

    if gdp_growth > 3:
        gdp_pattern = "and High Economic Growth"
    elif gdp_growth > 2:
        gdp_pattern = "and Moderate Economic Growth"
    else:
        gdp_pattern = "and Slow Economic Growth"

    pattern_description = f"{base_pattern} {growth_pattern} {gdp_pattern}"

    cluster_descriptions.append({
        'Cluster': i+1,
        'Pattern_Type': pattern_description,
        'Key_Characteristics': f"Starting at {early_adoption:.1f}% renewables, ending at {recent_adoption:.1f}%, with {gdp_growth:.1f}% GDP CAGR",
        'Country_Count': len(cluster_i['Countries'].split(', ')),
        'Example_Countries': ', '.join(cluster_i['Countries'].split(', ')[:3]) + "..."
    })

descriptions_df = pd.DataFrame(cluster_descriptions)

# Save the final report
descriptions_df.to_csv('cluster_pattern_descriptions.csv', index=False)
cluster_stats_df.to_csv('cluster_detailed_statistics.csv', index=False)

print("\nClustering analysis complete! Visualizations saved in the 'visualizations' folder.")
print(f"Found {best_k} distinct renewable energy adoption patterns:")
for i, row in descriptions_df.iterrows():
    print(f"Cluster {row['Cluster']}: {row['Pattern_Type']} - {row['Key_Characteristics']}")
    print(f"   Example countries: {row['Example_Countries']}")
    print("")

print("For detailed information, refer to the CSV files and visualizations created.")

Step 1: Loading and preprocessing the data
Found 60 countries that match between datasets

Step 2: Processing GDP Data

Step 3: Data Preparation for Clustering

Step 4: PCA Analysis
Explained variance: PC1=92.31%, PC2=5.03%
Total explained variance: 97.34%

Step 5: Finding Optimal Number of Clusters




  K=2: WCSS=610.42, Silhouette Score=0.6606
  K=3: WCSS=391.42, Silhouette Score=0.5184
  K=4: WCSS=285.12, Silhouette Score=0.4751
  K=5: WCSS=228.78, Silhouette Score=0.4099
  K=6: WCSS=182.10, Silhouette Score=0.4083




  K=7: WCSS=146.89, Silhouette Score=0.4251
  K=8: WCSS=129.05, Silhouette Score=0.4279
  K=9: WCSS=109.17, Silhouette Score=0.4203
  K=10: WCSS=94.28, Silhouette Score=0.4259
Best number of clusters based on silhouette score: 2

Step 6: Performing K-means Clustering with k=2





Step 7: Analyzing Cluster Characteristics

Cluster Statistics:
   Cluster  Size     Early_GDP    Recent_GDP  GDP_Growth  Early_Adoption  \
0        1    46  3.102706e+11  1.589218e+12    5.495840       12.833468   
1        2    14  1.046570e+11  5.218411e+11    5.530529       74.798701   

   Recent_Adoption  Adoption_Growth  
0        20.313274         7.479807  
1        68.711356        -6.087345  

Step 8: Visualizing Adoption Patterns Over Time for Each Cluster

Step 9: Visualizing GDP vs Renewable Growth by Cluster

Step 10: Visualizing Individual Countries Grouped by Cluster

Step 11: Creating Cluster Dashboards
Creating dashboard for Cluster 1...
Creating dashboard for Cluster 2...

Step 12: Creating Final Report

Clustering analysis complete! Visualizations saved in the 'visualizations' folder.
Found 2 distinct renewable energy adoption patterns:
Cluster 1: Early Moderate Adopters with Moderate Growth and High Economic Growth - Starting at 12.8% renewables, ending at 20.3%, 