In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import linregress  # 導入 linregress 函數
from sklearn.metrics import mean_squared_error, r2_score  # 導入 MSE 和 R² 計算函數
import os
from datetime import datetime

# 創建示例數據
def generate_sample_data(seed=42):
    """Generate synthetic education data for visualization.

    Builds one row per (country, year) for 2000-2020 across three regions of
    four countries each. Every region draws a shared base score, every
    country draws its own linear trend and per-year noise, and each row
    additionally draws a GDP-per-capita and an enrollment-rate value.

    Args:
        seed: Value passed to ``np.random.seed`` before any numbers are
            drawn. The default (42) reproduces the original fixed dataset.
            Note that this seeds NumPy's *global* random state.

    Returns:
        pandas.DataFrame with the columns ``year``, ``country``, ``region``,
        ``education_score`` (clamped to 0-100), ``gdp_per_capita`` and
        ``enrollment_rate`` (clamped to 50-100).
    """
    np.random.seed(seed)

    countries = {
        'East Asia': ['China', 'Japan', 'South Korea', 'Taiwan'],
        'Southeast Asia': ['Thailand', 'Vietnam', 'Malaysia', 'Indonesia'],
        'South Asia': ['India', 'Pakistan', 'Bangladesh', 'Sri Lanka'],
    }

    years = range(2000, 2021)
    data_rows = []

    # NOTE: the order of the random draws below determines the generated
    # values for a given seed; keep it stable to stay reproducible.
    for region, country_list in countries.items():
        # One base level shared by every country in the region.
        base_score = np.random.uniform(60, 90)
        for country in country_list:
            # Per-country yearly drift and per-year measurement noise.
            trend = np.random.uniform(-0.5, 1.5)
            noise = np.random.normal(0, 2, len(years))

            for i, year in enumerate(years):
                score = base_score + trend * i + noise[i]
                score = min(100, max(0, score))  # Ensure score is between 0 and 100

                data_rows.append({
                    'year': year,
                    'country': country,
                    'region': region,
                    'education_score': score,
                    # GDP and enrollment both drift with the same country trend.
                    'gdp_per_capita': np.random.normal(30000, 10000) + trend * i * 1000,
                    'enrollment_rate': min(100, max(50, np.random.normal(85, 5) + trend * i))
                })

    return pd.DataFrame(data_rows)

class EducationVisualizer:
    """Render charts and summary tables for regional education data.

    The methods read the DataFrame columns ``year``, ``region``,
    ``education_score``, ``gdp_per_capita`` and ``enrollment_rate``.
    Constructing an instance applies the plotting style and creates the
    output directory; every ``create_*`` method saves one PNG into it.
    """

    def __init__(self, data: pd.DataFrame):
        """Store ``data``, apply the plot style and create the output folder."""
        self.data = data
        self.setup_style()
        self.create_output_directory()

    def setup_style(self):
        """Set up plotting style"""
        plt.style.use('seaborn-v0_8-darkgrid')
        # Stored on the instance; not used by the methods of this class.
        self.colors = sns.color_palette("husl", 8)

    def create_output_directory(self):
        """Create directory for saving visualizations"""
        self.output_dir = 'education_analysis_output'
        os.makedirs(self.output_dir, exist_ok=True)

    def create_trend_analysis(self):
        """Plot yearly mean education score per region with a linear trend.

        Saves ``regional_trends.png`` in the output directory. A region with
        fewer than two distinct years gets its data line but no trend line,
        because a regression cannot be fitted to a single x value.
        """
        plt.figure(figsize=(15, 10))
        plt.grid(False)  # Remove the background grid lines

        for region in self.data['region'].unique():
            region_data = self.data[self.data['region'] == region]
            mean_scores = region_data.groupby('year')['education_score'].mean()
            years = mean_scores.index.to_numpy()

            # Observed yearly means
            (observed,) = plt.plot(years, mean_scores.values, marker='o', label=region)

            if len(years) < 2:
                continue

            # Least-squares fit of mean score against year
            fit = linregress(years, mean_scores.values)
            regression_line = fit.intercept + fit.slope * years

            # Reuse the region's colour so each trend is visually tied to its data
            plt.plot(years, regression_line, linestyle='--',
                     color=observed.get_color(), label=f'{region} Trend')

        plt.title('Regional Education Score Trends with Linear Regression (2000–2020)', pad=20)
        plt.xlabel('Year (2000-2020)')
        plt.ylabel('Average Education Score (0-100)')

        # Integer ticks: 0..100 in steps of 10 on Y, every second year on X
        plt.yticks(range(0, 101, 10))
        plt.xticks(range(2000, 2022, 2))

        plt.legend(title='Region', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()

        plt.savefig(os.path.join(self.output_dir, 'regional_trends.png'), dpi=300, bbox_inches='tight')
        plt.close()

    def create_distribution_analysis(self):
        """Plot per-region score distributions as a box plot and a violin plot.

        Saves ``score_distributions.png`` in the output directory.
        """
        fig, axes = plt.subplots(2, 1, figsize=(15, 12))
        # plt.grid(False) would only touch the current (last) axes, so turn
        # the grid off on every subplot explicitly.
        for ax in axes:
            ax.grid(False)

        # Boxplot by region
        sns.boxplot(data=self.data, x='region', y='education_score', ax=axes[0])
        axes[0].set_title('Education Score Distribution by Region')
        axes[0].set_ylabel('Education Score (0-100)')
        axes[0].tick_params(axis='x', rotation=45)

        # Violin plot with individual points
        sns.violinplot(data=self.data, x='region', y='education_score', ax=axes[1])
        sns.stripplot(data=self.data, x='region', y='education_score', color='red', alpha=0.3, ax=axes[1])
        axes[1].set_title('Detailed Score Distribution with Individual Points')
        axes[1].set_ylabel('Education Score (0-100)')
        axes[1].tick_params(axis='x', rotation=45)

        fig.tight_layout()
        fig.savefig(os.path.join(self.output_dir, 'score_distributions.png'), dpi=300)
        plt.close(fig)

    def create_correlation_analysis(self):
        """Scatter education score against GDP with a per-region linear fit.

        For each region a regression line is drawn and annotated with the
        mean squared error and R-squared of the fit. Saves
        ``score_gdp_correlation.png`` in the output directory. Regions with
        fewer than two distinct GDP values are plotted without a fit.
        """
        plt.figure(figsize=(12, 8))
        plt.grid(False)  # Remove the background grid lines

        # Scatter of all observations, coloured by region
        sns.scatterplot(data=self.data, x='gdp_per_capita', y='education_score', hue='region', alpha=0.6)

        for region in self.data['region'].unique():
            region_data = self.data[self.data['region'] == region]
            gdp = region_data['gdp_per_capita']
            scores = region_data['education_score']

            if gdp.nunique() < 2:
                continue

            # Least-squares fit of score against GDP for this region
            fit = linregress(gdp, scores)
            predicted = fit.intercept + fit.slope * gdp

            # Draw the line between the two x extremes only: plotting every
            # (unsorted) point makes the path double back on itself and the
            # dashes smear into a solid line.
            x_ends = np.array([gdp.min(), gdp.max()])
            plt.plot(x_ends, fit.intercept + fit.slope * x_ends, linestyle='--', label=f'{region} Trend')

            # Goodness of fit over all observations in the region
            mse = mean_squared_error(scores, predicted)
            r2 = r2_score(scores, predicted)

            # Annotate at the region's right-most x and the fitted line's highest y
            plt.text(
                gdp.max(),
                predicted.max(),
                f'{region}\nMSE: {mse:.2f}\nR²: {r2:.2f}',
                fontsize=9,
                ha='right',
                va='bottom'
            )

        plt.title('Education Score vs GDP per Capita with Linear Regression, MSE, and R-squared')
        plt.xlabel('GDP per Capita (USD)')
        plt.ylabel('Education Score (0-100)')
        plt.legend(title='Region', bbox_to_anchor=(1.05, 1), loc='upper left')

        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, 'score_gdp_correlation.png'), dpi=300, bbox_inches='tight')
        plt.close()

    def create_heatmap(self):
        """Plot the correlation matrix of score, GDP and enrollment rate.

        Saves ``correlation_heatmap.png`` in the output directory.
        """
        plt.figure(figsize=(10, 8))
        plt.grid(False)  # Remove the background grid lines

        correlation_matrix = self.data[['education_score', 'gdp_per_capita', 'enrollment_rate']].corr()
        # Fix the colour scale to the full correlation range [-1, 1]
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)

        plt.title('Correlation Heatmap of Education Metrics')
        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, 'correlation_heatmap.png'), dpi=300)
        plt.close()

    def generate_summary_statistics(self) -> pd.DataFrame:
        """Compute per-region summary statistics and write them to CSV.

        Returns:
            DataFrame indexed by region with two-level columns
            (metric, statistic), rounded to two decimals. The same table is
            written to ``summary_statistics.csv`` in the output directory.
        """
        summary_stats = self.data.groupby('region').agg({
            'education_score': ['mean', 'std', 'min', 'max'],
            'enrollment_rate': ['mean', 'std'],
            'gdp_per_capita': ['mean', 'std']
        }).round(2)

        summary_stats.to_csv(os.path.join(self.output_dir, 'summary_statistics.csv'))

        return summary_stats

    def run_complete_analysis(self) -> pd.DataFrame:
        """Run every visualization step in order and return the summary table."""
        print("Starting educational data analysis...")

        print("1. Creating trend analysis...")
        self.create_trend_analysis()

        print("2. Creating distribution analysis...")
        self.create_distribution_analysis()

        print("3. Creating correlation analysis...")
        self.create_correlation_analysis()

        print("4. Creating correlation heatmap...")
        self.create_heatmap()

        print("5. Generating summary statistics...")
        summary_stats = self.generate_summary_statistics()

        print("\nAnalysis complete! All visualizations have been saved to:", self.output_dir)
        return summary_stats

# Script entry point
if __name__ == "__main__":
    # Build the synthetic dataset
    print("Generating sample data...")
    education_data = generate_sample_data()

    # Hand the data to the visualizer and run every chart/table step
    visualizer = EducationVisualizer(education_data)
    summary_statistics = visualizer.run_complete_analysis()

    # Echo the per-region summary table after a blank line and a heading
    print("\nSummary Statistics:", summary_statistics, sep="\n")

Generating sample data...
Starting educational data analysis...
1. Creating trend analysis...
2. Creating distribution analysis...
3. Creating correlation analysis...
4. Creating correlation heatmap...
5. Generating summary statistics...

Analysis complete! All visualizations have been saved to: education_analysis_output

Summary Statistics:
               education_score                      enrollment_rate        \
                          mean   std    min     max            mean   std   
region                                                                      
East Asia                83.53  8.05  70.43  100.00           94.42  5.80   
South Asia               68.97  7.56  61.37   92.79           87.44  6.99   
Southeast Asia           87.91  6.33  78.25  100.00           90.93  6.55   

               gdp_per_capita            
                         mean       std  
region                                   
East Asia            42741.25  12510.77  
South Asia           3364