In [None]:
# # 필수 라이브러리 설치 (최초 1회 실행)
# !pip install wbdata pandas matplotlib
# !pip install wbgapi pandas matplotlib

import wbdata
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import wbgapi as wb
import pandas as pd
import matplotlib.pyplot as plt
import wbgapi as wb
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [None]:


# 1. 분석 지표 설정
INDICATORS = {
    'NY.GDP.MKTP.KD.ZG': 'GDP_growth',  # GDP 성장률(%)
    'EG.ELC.ACCS.ZS': 'Electricity_access',  # 전기 접근률
    'IT.NET.USER.ZS': 'Internet_users'  # 인터넷 사용자 비율
}

COUNTRIES = ['USA', 'CHN', 'IND', 'BRA', 'ZAF']  # ISO3 국가 코드
YEAR_RANGE = range(2000, 2024)  # 2000-2023년 데이터

# 2. 데이터 추출
df = wb.data.DataFrame(
    INDICATORS.keys(),
    economy=COUNTRIES,
    time=YEAR_RANGE,
    columns='series',  # 컬럼 구조 최적화
    skipBlanks=True,
    numericTimeKeys=True
)

# 3. 데이터 정제
df = df.rename(columns=INDICATORS)  # 컬럼명 변경
df = df.stack().reset_index()  # 다단계 인덱스 해제
df.columns = ['Country', 'Year', 'Indicator', 'Value']  # 컬럼명 지정

# 피벗 테이블 생성
pivot_df = df.pivot_table(
    index=['Country', 'Year'],
    columns='Indicator',
    values='Value'
).reset_index()

# 4. 시각화
plt.figure(figsize=(14, 8))

# GDP 성장률 vs 전기 접근률
plt.subplot(1, 2, 1)
for country in COUNTRIES:
    country_data = pivot_df[pivot_df['Country'] == country]
    plt.scatter(
        country_data['Electricity_access'],
        country_data['GDP_growth'],
        label=country,
        alpha=0.7
    )
plt.xlabel('전기 접근률 (%)')
plt.ylabel('GDP 성장률 (%)')
plt.title('전기 인프라 vs 경제 성장')
plt.legend()

# GDP 성장률 vs 인터넷 사용자
plt.subplot(1, 2, 2)
for country in COUNTRIES:
    country_data = pivot_df[pivot_df['Country'] == country]
    plt.scatter(
        country_data['Internet_users'],
        country_data['GDP_growth'],
        label=country,
        alpha=0.7
    )
plt.xlabel('인터넷 사용자 비율 (%)')
plt.ylabel('GDP 성장률 (%)')
plt.title('디지털 인프라 vs 경제 성장')
plt.tight_layout()
plt.show()


In [None]:
# 1. 분석 지표 선정 (인프라 5개 분야 + GDP 성장률)
INFRA_INDICATORS = {
    # 에너지 인프라
    'EG.ELC.ACCS.ZS': '전기접근률', 
    # ICT 인프라
    'IT.NET.USER.ZS': '인터넷사용자',  
    'IT.CEL.SETS.P2': '휴대폰보급률',
    # 교통 인프라
    'IS.RRS.TOTL.KM': '철도총연장',  
    'IS.ROD.DNST.K2': '도로밀도',
    # GDP 성장률
    'NY.GDP.MKTP.KD.ZG': 'GDP성장률'  
}

# 2. 데이터 수집 (최근 5년 평균)
years = list(range(2018, 2023))
df = wb.data.DataFrame(
    list(INFRA_INDICATORS.keys()),
    time=years,
    numericTimeKeys=True,
    skipBlanks=True,
    labels=True
).reset_index()

In [None]:
# 데이터 수집 후 컬럼 확인
print("원본 데이터 컬럼:", df.columns.tolist())

# wbgapi 특성상 국가 컬럼이 'economy'로 표기되는 경우
df = df.rename(columns={'economy': 'country'})

# 시리즈(지표) 컬럼명 확인
df = df.rename(columns={'series': 'indicator'})

In [20]:
# 단일 컬럼 삭제
df = df.drop('country', axis=1)

In [21]:
df

Unnamed: 0,indicator,Country,Series,2018,2019,2020,2021,2022
0,EG.ELC.ACCS.ZS,Zimbabwe,Access to electricity (% of population),45.400000,46.700000,52.700000,49.000000,50.100000
1,EG.ELC.ACCS.ZS,Zambia,Access to electricity (% of population),40.200000,43.000000,44.600000,46.700000,47.800000
2,EG.ELC.ACCS.ZS,"Yemen, Rep.",Access to electricity (% of population),62.000000,72.800000,73.900000,74.900000,76.000000
3,EG.ELC.ACCS.ZS,West Bank and Gaza,Access to electricity (% of population),100.000000,100.000000,100.000000,100.000000,100.000000
4,EG.ELC.ACCS.ZS,Virgin Islands (U.S.),Access to electricity (% of population),100.000000,100.000000,100.000000,100.000000,100.000000
...,...,...,...,...,...,...,...,...
1093,NY.GDP.MKTP.KD.ZG,Central Europe and the Baltics,GDP growth (annual %),5.040764,4.005993,-3.329069,6.532150,4.067238
1094,NY.GDP.MKTP.KD.ZG,Caribbean small states,GDP growth (annual %),2.778989,1.055088,-9.602316,10.442956,23.067604
1095,NY.GDP.MKTP.KD.ZG,Arab World,GDP growth (annual %),2.630040,1.583715,-4.711784,4.031106,5.721433
1096,NY.GDP.MKTP.KD.ZG,Africa Western and Central,GDP growth (annual %),2.844755,3.232903,-1.003994,4.037630,3.789680


In [22]:
# 기본 그룹화 및 평균
grouped_mean = df.groupby('indicator').mean(numeric_only=True)

In [25]:
df = df.transpose()

In [26]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1088,1089,1090,1091,1092,1093,1094,1095,1096,1097
indicator,EG.ELC.ACCS.ZS,EG.ELC.ACCS.ZS,EG.ELC.ACCS.ZS,EG.ELC.ACCS.ZS,EG.ELC.ACCS.ZS,EG.ELC.ACCS.ZS,EG.ELC.ACCS.ZS,EG.ELC.ACCS.ZS,EG.ELC.ACCS.ZS,EG.ELC.ACCS.ZS,...,NY.GDP.MKTP.KD.ZG,NY.GDP.MKTP.KD.ZG,NY.GDP.MKTP.KD.ZG,NY.GDP.MKTP.KD.ZG,NY.GDP.MKTP.KD.ZG,NY.GDP.MKTP.KD.ZG,NY.GDP.MKTP.KD.ZG,NY.GDP.MKTP.KD.ZG,NY.GDP.MKTP.KD.ZG,NY.GDP.MKTP.KD.ZG
Country,Zimbabwe,Zambia,"Yemen, Rep.",West Bank and Gaza,Virgin Islands (U.S.),Viet Nam,"Venezuela, RB",Vanuatu,Uzbekistan,Uruguay,...,Euro area,East Asia & Pacific (IDA & IBRD countries),East Asia & Pacific (excluding high income),East Asia & Pacific,Early-demographic dividend,Central Europe and the Baltics,Caribbean small states,Arab World,Africa Western and Central,Africa Eastern and Southern
Series,Access to electricity (% of population),Access to electricity (% of population),Access to electricity (% of population),Access to electricity (% of population),Access to electricity (% of population),Access to electricity (% of population),Access to electricity (% of population),Access to electricity (% of population),Access to electricity (% of population),Access to electricity (% of population),...,GDP growth (annual %),GDP growth (annual %),GDP growth (annual %),GDP growth (annual %),GDP growth (annual %),GDP growth (annual %),GDP growth (annual %),GDP growth (annual %),GDP growth (annual %),GDP growth (annual %)
2018,45.4,40.2,62.0,100.0,100.0,100.0,99.9,61.6,100.0,99.8,...,1.768486,6.527781,6.527938,4.759994,3.842496,5.040764,2.778989,2.63004,2.844755,2.666632
2019,46.7,43.0,72.8,100.0,100.0,99.4,99.8,64.5,100.0,99.9,...,1.628395,5.784202,5.784281,3.961915,2.357443,4.005993,1.055088,1.583715,3.232903,2.194319
2020,52.7,44.6,73.9,100.0,100.0,99.8,99.9,67.3,100.0,100.0,...,-6.048673,1.243567,1.243678,-0.134039,-4.176095,-3.329069,-9.602316,-4.711784,-1.003994,-2.864293
2021,49.0,46.7,74.9,100.0,100.0,100.0,100.0,70.0,99.9,100.0,...,6.361156,7.602005,7.60232,6.2204,7.408846,6.53215,10.442956,4.031106,4.03763,4.576393
2022,50.1,47.8,76.0,100.0,100.0,100.0,100.0,70.0,100.0,100.0,...,3.546043,3.386153,3.38621,2.868762,5.520736,4.067238,23.067604,5.721433,3.78968,3.553878


In [24]:
unique_values_A = df['indicator'].unique()
print(unique_values_A)  # [1 2 3]

['EG.ELC.ACCS.ZS' 'IT.NET.USER.ZS' 'IT.CEL.SETS.P2' 'IS.RRS.TOTL.KM'
 'NY.GDP.MKTP.KD.ZG']


In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['category_encoded'] = le.fit_transform(df[['indicator','Country', '']])

In [None]:
from sklearn.preprocessing import LabelEncoder

# 국가 코드 컬럼 선택
country_codes = df['country', 'indicator', 'Country']

# 레이블 인코딩 실행
le = LabelEncoder()
df['country_encoded'] = le.fit_transform(country_codes)


In [None]:
# 3. 데이터 클렌징
def clean_data(df):
    # 컬럼명 강제 지정 (wbapi 버전 차이 대응)
    required_columns = ['economy', 'series', 'time', 'value']
    df.columns = required_columns[:len(df.columns)]
    
    # 국가별 연평균 계산
    df_avg = df.groupby(['economy', 'series'])['value'].mean().unstack()
    
    # 결측치 처리
    df_clean = df_avg.dropna(thresh=len(INFRA_INDICATORS)//2)
    
    # 표준화
    scaler = StandardScaler()
    return pd.DataFrame(
        scaler.fit_transform(df_clean),
        index=df_clean.index,
        columns=df_clean.columns
    )

In [None]:
# 4. 인사이트 도출 모듈
class InfrastructureInsights:
    def __init__(self, data):
        self.df = data
        
    def correlation_analysis(self):
        """인프라-성장률 상관관계 분석"""
        corr_matrix = self.df.corr()
        return corr_matrix['GDP성장률'].sort_values(ascending=False)
    
    def cluster_analysis(self, n_clusters=4):
        """국가 클러스터링 분석"""
        kmeans = KMeans(n_clusters=n_clusters)
        clusters = kmeans.fit_predict(self.df)
        pca = PCA(n_components=2)
        principal_components = pca.fit_transform(self.df)
        return pd.DataFrame(principal_components, columns=['PC1', 'PC2'], index=self.df.index).assign(Cluster=clusters)
    
    def plot_global_trend(self):
        """세계적 추세 시각화"""
        return px.parallel_coordinates(
            self.df.reset_index(),
            color='GDP성장률',
            labels=INFRA_INDICATORS,
            color_continuous_scale=px.colors.diverging.Tealrose
        )

In [None]:
# 5. 분석 실행
insight_engine = InfrastructureInsights(df)

# 상관관계 결과
corr_results = insight_engine.correlation_analysis()
print(f"GDP 성장률과의 상관계수:\n{corr_results}")

# 클러스터링 결과 시각화
cluster_df = insight_engine.cluster_analysis()
fig = px.scatter(
    cluster_df, 
    x='PC1', y='PC2', 
    color='Cluster',
    hover_data=[cluster_df.index]
)
fig.update_layout(title='국가별 인프라-성장 프로파일 클러스터링')
fig.show()

# 글로벌 트렌드 시각화
insight_engine.plot_global_trend().show()
