# 2013년부터 2022년까지의 전체 패널 데이터를 대상으로,
- 종속변수 후보: GDP, Unemployment Rate
- 후보 독립변수: Corporate Tax, GERD, Institutions, Internet Usage 조합
- AIC 기준으로 최적의 모델을 선택

In [4]:
import pandas as pd
import numpy as np
import itertools
import math
import statsmodels.api as sm
from linearmodels.panel import PanelOLS

- 로그우도, 표본 수, 추정 파라미터 수를 이용하여 AIC 또는 BIC를 계산

In [5]:
def calculate_criterion(loglik, nobs, k, criterion='AIC'):
    """Return the AIC or BIC value for a fitted model.

    Args:
        loglik: Log-likelihood of the fitted model.
        nobs: Number of observations; only used for the BIC penalty.
        k: Number of estimated parameters.
        criterion: Either 'AIC' (penalty 2 per parameter) or 'BIC'
            (penalty log(nobs) per parameter).

    Returns:
        ``-2 * loglik`` plus the criterion-specific penalty.

    Raises:
        ValueError: If ``criterion`` is neither 'AIC' nor 'BIC'.
    """
    if criterion not in ('AIC', 'BIC'):
        raise ValueError("Use 'AIC' or 'BIC'")

    # The log is evaluated only for BIC, so AIC never touches nobs.
    per_param_penalty = 2 if criterion == 'AIC' else math.log(nobs)
    return -2 * loglik + per_param_penalty * k

- panel_df: MultiIndex (Country, Year) DataFrame, 각 변수는 wide 형식
- dep_candidates: 종속변수 후보 목록 e.g.: ['GDP', 'Unemployment Rate']
- indep_candidates: 독립변수 후보 목록 e.g.: ['Corporate Tax', 'GERD', 'Institutions', 'Internet Usage']
- entity_effects: 개체 고정효과 포함 여부 (True이면 고정효과 모델)
- criterion: 모델 선택 기준 ('AIC' | 'BIC')
- 가능한 모든 조합을 시도해 기준이 최소인 모델 정보를 반환

In [6]:
def best_subset_panel_all(panel_df, dep_candidates, indep_candidates, entity_effects=True, criterion='AIC'):
    """Fit PanelOLS on every regressor subset and return the best-scoring model.

    For each dependent variable in ``dep_candidates`` and every non-empty
    combination of ``indep_candidates``, the rows with missing values in the
    variables of that model are dropped, a constant is added, and a PanelOLS
    model is fitted with entity-clustered standard errors. Each fit is scored
    with ``calculate_criterion`` and the lowest score wins.

    Args:
        panel_df: DataFrame holding one column per variable.
            NOTE(review): PanelOLS presumably requires a two-level
            (entity, time) MultiIndex -- confirm against the caller.
        dep_candidates: Names of candidate dependent variables.
        indep_candidates: Names of candidate regressors.
        entity_effects: Passed through to ``PanelOLS``.
        criterion: 'AIC' or 'BIC', passed to ``calculate_criterion``.

    Returns:
        ``None`` when no model could be estimated. Otherwise a dict with the
        keys ``dep_var``, ``indep_vars``, ``score``, ``model``, ``nobs`` and
        ``criterion`` describing the model with the lowest score over *all*
        dependent variables, plus ``best_by_dep``: a dict mapping each
        dependent variable to the same kind of record for its own best
        regressor subset.

    Notes:
        * Information criteria are built from the log-likelihood of the
          dependent variable, so scores of models with *different* dependent
          variables are not on a comparable scale. The overall winner is kept
          for backward compatibility; use ``best_by_dep`` to pick a model for
          a given dependent variable.
        * Every subset is fitted on its own complete-case sample, so scores
          are strictly comparable only when the reported ``nobs`` agree.
        * NOTE(review): ``k`` is the number of reported coefficients
          (``len(res.params)``); absorbed entity effects are presumably not
          included in that count -- confirm against linearmodels.
    """
    best_by_dep = {}

    # Explore every (dependent variable, regressor subset) pair.
    for dep_var in dep_candidates:
        for r in range(1, len(indep_candidates) + 1):
            for combo in itertools.combinations(indep_candidates, r):
                regressors = list(combo)
                # Keep only rows that are complete for this model's variables.
                temp = panel_df[[dep_var] + regressors].dropna()
                if temp.empty:
                    continue
                y = temp[dep_var]
                X = sm.add_constant(temp[regressors])

                try:
                    res = PanelOLS(y, X, entity_effects=entity_effects).fit(
                        cov_type='clustered', cluster_entity=True
                    )
                    # Parameter count: reported coefficients, constant included.
                    k = len(res.params)
                    score = calculate_criterion(res.loglik, res.nobs, k, criterion=criterion)
                except Exception as e:
                    # Best effort: report the failed specification and move on.
                    print(f"Model estimation failed for {dep_var} with independents {combo}. Error: {e}")
                    continue

                incumbent = best_by_dep.get(dep_var)
                # Strict '<' keeps the earliest subset on ties.
                if incumbent is None or score < incumbent['score']:
                    best_by_dep[dep_var] = {
                        'dep_var': dep_var,
                        'indep_vars': combo,
                        'score': score,
                        'model': res,
                        'nobs': res.nobs,
                        'criterion': criterion
                    }

    if not best_by_dep:
        return None

    # min() returns the first minimal entry in insertion order, which matches
    # the original "first model with the lowest score wins" behaviour.
    overall = min(best_by_dep.values(), key=lambda info: info['score'])
    best_model_info = dict(overall)
    best_model_info['best_by_dep'] = best_by_dep
    return best_model_info

In [7]:
if __name__ == "__main__":
    # 1. Load the master table.
    df_master = pd.read_csv("../data/master_data_by_category.csv")

    # Reshape to long format: the year columns 2013..2022 become rows.
    years = list(map(str, range(2013, 2023)))
    df_long = df_master.melt(
        id_vars=['Country', 'category'],
        value_vars=years,
        var_name='Year',
        value_name='Value',
    )
    df_long['Year'] = df_long['Year'].astype(int)

    # Pivot so the index is (Country, Year), each category is a column and
    # the cells hold Value; then order the rows by Country and Year.
    panel_df = df_long.pivot(
        index=['Country', 'Year'], columns='category', values='Value'
    ).sort_index(level=['Country', 'Year'])

    # 2. Candidate variables.
    dep_candidates = ['GDP', 'Unemployment Rate']  # candidate dependent variables
    indep_candidates = ['Corporate Tax', 'GERD', 'Institutions', 'Internet Usage']  # candidate regressors

    # 3. Run the best-subset search.
    best_model = best_subset_panel_all(
        panel_df,
        dep_candidates,
        indep_candidates,
        entity_effects=True,
        criterion='AIC',
    )

    # 4. Report the selected model, or say that nothing could be fitted.
    if not best_model:
        print("No valid model could be estimated.")
    else:
        print("=== Best PanelOLS Model Selected ===")
        print("Dependent Variable:", best_model['dep_var'])
        print("Independent Variables:", best_model['indep_vars'])
        print("Criterion Used:", best_model['criterion'])
        print("Score:", best_model['score'])
        print("Number of Observations:", best_model['nobs'])
        print(best_model['model'].summary)

=== Best PanelOLS Model Selected ===
Dependent Variable: Unemployment Rate
Independent Variables: ('Corporate Tax', 'GERD')
Criterion Used: AIC
Score: 171.79321908830423
Number of Observations: 70
                          PanelOLS Estimation Summary                           
Dep. Variable:      Unemployment Rate   R-squared:                        0.2197
Estimator:                   PanelOLS   R-squared (Between):              0.3877
No. Observations:                  70   R-squared (Within):               0.2197
Date:                Thu, Mar 13 2025   R-squared (Overall):              0.3559
Time:                        19:21:04   Log-likelihood                   -82.897
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      8.5878
Entities:                           7   P-value                           0.0005
Avg Obs:                      10.0000   Distribution:                    F