# 전체 기간(2013~2022) 동태적 효과 고려 코드

In [17]:
import pandas as pd
import numpy as np

# Load the master dataset. Later cells read the 'Country' and 'category'
# columns plus one column per year ('2013' ... '2022') from this frame.
df = pd.read_csv(filepath_or_buffer="../data/master_data_by_category.csv")

In [18]:
# Step 1: reshape to long format -- one row per (Country, category, Year).
years = list(map(str, range(2013, 2023)))
df_long = pd.melt(
    df,
    id_vars=['Country', 'category'],
    value_vars=years,
    var_name='Year',
    value_name='Value',
)
# The year labels come from column names (strings); cast them to integers.
df_long = df_long.astype({'Year': int})

# Step 2: reshape to wide format -- index = (Country, Year), one column per
# category -- and order the rows by country, then by year.
panel_df = (
    df_long
    .pivot(index=['Country', 'Year'], columns='category', values='Value')
    .sort_index(level=['Country', 'Year'])
)

## 고정 효과 모델

In [19]:
import statsmodels.api as sm
from linearmodels.panel import PanelOLS

# Estimation sample: keep only the four model variables and drop every
# (Country, Year) row in which any of them is missing.
fe_df = panel_df[['GDP', 'Corporate Tax', 'GERD', 'Institutions']].dropna()
exog_vars = ['Corporate Tax', 'GERD', 'Institutions']

# Regressors with an explicit constant column; GDP is the dependent variable.
X = sm.add_constant(fe_df[exog_vars])
y = fe_df['GDP']

# fe_df (and therefore X and y) keeps the (Country, Year) MultiIndex that
# panel_df was pivoted on, i.e. entity level first and time level second.
# That is the layout PanelOLS reads, so no reset_index()/set_index()
# round-trip is needed before fitting.
fe_model = PanelOLS(y, X, entity_effects=True)

# Standard errors are clustered by entity (country).
fe_res = fe_model.fit(cov_type='clustered', cluster_entity=True)
print("\n=== Fixed Effects Model ===")
print(fe_res.summary)


=== Fixed Effects Model ===
                          PanelOLS Estimation Summary                           
Dep. Variable:                    GDP   R-squared:                        0.4871
Estimator:                   PanelOLS   R-squared (Between):             -0.3811
No. Observations:                  70   R-squared (Within):               0.4871
Date:                Tue, Mar 11 2025   R-squared (Overall):             -0.3393
Time:                        22:01:24   Log-likelihood                   -2037.1
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      18.996
Entities:                           7   P-value                           0.0000
Avg Obs:                      10.0000   Distribution:                    F(3,60)
Min Obs:                      10.0000                                           
Max Obs:                      10.0000   F-statistic (robust):             3.8199

## 랜덤효과 모델 + Hausman 검정

In [20]:
from linearmodels.panel import RandomEffects
from linearmodels.panel import compare

# Random-effects estimator on the same dependent variable and regressors
# (including the constant) that the fixed-effects model uses.
re_model = RandomEffects(dependent=y, exog=X)
re_res = re_model.fit()
print("\n=== Random Effects Model ===", re_res.summary, sep="\n")


=== Random Effects Model ===
                        RandomEffects Estimation Summary                        
Dep. Variable:                    GDP   R-squared:                        0.4623
Estimator:              RandomEffects   R-squared (Between):             -0.3710
No. Observations:                  70   R-squared (Within):               0.4870
Date:                Tue, Mar 11 2025   R-squared (Overall):             -0.3297
Time:                        22:01:24   Log-likelihood                   -2039.8
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      18.918
Entities:                           7   P-value                           0.0000
Avg Obs:                      10.0000   Distribution:                    F(3,66)
Min Obs:                      10.0000                                           
Max Obs:                      10.0000   F-statistic (robust):             18.91

In [21]:
# Hausman test: decide whether the random-effects estimator is consistent
# or whether the fixed-effects estimator has to be preferred.
from scipy import stats

# linearmodels.panel.compare only builds a side-by-side table of the fitted
# models; it does not compute any test statistic. It is kept for reference.
comp = compare({'FE': fe_res, 'RE': re_res})
print("\n=== Hausman Test (compare FE vs RE) ===")
print(comp)

# The Hausman statistic is built from classical (unadjusted) covariance
# estimates, so the FE model is refit here without clustering. fe_res itself
# (clustered) is left untouched.
fe_classic = fe_model.fit(cov_type='unadjusted')

# Compare slope coefficients only: the constant is excluded from the contrast.
slope_names = [
    name for name in fe_classic.params.index
    if name != 'const' and name in re_res.params.index
]
coef_gap = (fe_classic.params[slope_names] - re_res.params[slope_names]).to_numpy()
cov_gap = (
    fe_classic.cov.loc[slope_names, slope_names]
    - re_res.cov.loc[slope_names, slope_names]
).to_numpy()

# H = (b_FE - b_RE)' [V_FE - V_RE]^(-1) (b_FE - b_RE) ~ chi2(k) under H0.
# The pseudo-inverse guards against a singular covariance difference. In small
# samples the difference may not be positive definite, in which case the
# statistic can be negative and the test is inconclusive.
hausman_stat = float(coef_gap @ np.linalg.pinv(cov_gap) @ coef_gap)
hausman_dof = len(slope_names)
hausman_pvalue = float(stats.chi2.sf(hausman_stat, hausman_dof))
print(f"Hausman chi2({hausman_dof}) = {hausman_stat:.4f}, p-value = {hausman_pvalue:.4f}")
print("H0: RE is consistent (a small p-value favours FE)")


=== Hausman Test (compare FE vs RE) ===
                    Model Comparison                    
                                    FE                RE
--------------------------------------------------------
Dep. Variable                      GDP               GDP
Estimator                     PanelOLS     RandomEffects
No. Observations                    70                70
Cov. Est.                    Clustered        Unadjusted
R-squared                       0.4871            0.4623
R-Squared (Within)              0.4871            0.4870
R-Squared (Between)            -0.3811           -0.3710
R-Squared (Overall)            -0.3393           -0.3297
F-statistic                     18.996            18.918
P-value (F-stat)                0.0000            0.0000
const                       -8.096e+12        -7.689e+12
                             (-0.8106)         (-1.5221)
Corporate Tax               -1.326e+11        -1.327e+11
                             (-1.8680)         

## 동태적 패널 데이터 분석 (첫차분 변환)

In [25]:
# Build the wide panel for the model variables only: index = (Country, Year),
# one column per category.
categories_to_use = ['GDP', 'Corporate Tax', 'GERD', 'Institutions']

# A single pivot over the filtered long table yields the same frame as
# pivoting each category separately and outer-merging the pieces on
# (Country, Year): any (Country, Year) pair present for at least one category
# gets a row, with NaN where a category has no value.
selected_rows = df_long[df_long['category'].isin(categories_to_use)]
panel_df = selected_rows.pivot(index=['Country', 'Year'], columns='category', values='Value')

# pivot orders the columns alphabetically; restore the order listed above.
panel_df = panel_df[categories_to_use]

# Sanity check
print("패널 데이터 형태:", panel_df.shape)
print(panel_df.head())

패널 데이터 형태: (70, 4)
category               GDP  Corporate Tax    GERD  Institutions
Country Year                                                   
China   2013  9.570470e+12           25.0  1.9979          48.3
        2014  1.047560e+13           25.0  2.0224          48.3
        2015  1.106160e+13           25.0  2.0570          54.0
        2016  1.123330e+13           25.0  2.1003          55.2
        2017  1.231050e+13           25.0  2.1160          54.8


### 2. 동태적 패널 데이터 분석을 위한 첫차분 변환
#### 2-1. GDP의 1시차 항 생성

In [26]:
# Rows must be ordered by year within each country before shifting/differencing.
panel_df = panel_df.sort_index(level=['Country', 'Year'])

# One-period lag of GDP, taken within each country.
panel_df['GDP_lag'] = panel_df['GDP'].groupby(level='Country').shift(1)

# First differences (change from the previous row of the same country)
# for every model variable, including the lagged GDP.
diff_vars = ['GDP', 'GDP_lag', 'Corporate Tax', 'GERD', 'Institutions']
diff_cols = [f'diff_{name}' for name in diff_vars]
for name, diff_col in zip(diff_vars, diff_cols):
    panel_df[diff_col] = panel_df[name].groupby(level='Country').diff()

# Keep only rows where every differenced column is available. Because
# diff_GDP_lag needs two earlier observations, this drops the first two
# years of each country, not just the first one.
df_diff = panel_df.dropna(subset=diff_cols).copy()
print("첫차분 데이터 형태:", df_diff.shape)
print(df_diff.head())

첫차분 데이터 형태: (56, 10)
category               GDP  Corporate Tax    GERD  Institutions       GDP_lag  \
Country Year                                                                    
China   2015  1.106160e+13           25.0  2.0570          54.0  1.047560e+13   
        2016  1.123330e+13           25.0  2.1003          55.2  1.106160e+13   
        2017  1.231050e+13           25.0  2.1160          54.8  1.123330e+13   
        2018  1.389490e+13           25.0  2.1406          59.4  1.231050e+13   
        2019  1.428000e+13           25.0  2.2446          64.1  1.389490e+13   

category          diff_GDP  diff_GDP_lag  diff_Corporate Tax  diff_GERD  \
Country Year                                                              
China   2015  5.860000e+11  9.051300e+11                 0.0     0.0346   
        2016  1.717000e+11  5.860000e+11                 0.0     0.0433   
        2017  1.077200e+12  1.717000e+11                 0.0     0.0157   
        2018  1.584400e+12  1.077200

### 3. OLS를 사용한 첫차분 회귀 분석 (Dynamic Panel via First-Difference OLS)
#### 종속변수: diff_GDP, 독립변수: diff_GDP_lag, diff_Corporate Tax, diff_GERD, diff_Institutions

In [27]:
# Pooled OLS on first-differenced data: diff_GDP regressed on the differenced
# lagged GDP and the differenced policy variables, plus a constant.
# NOTE(review): in a first-differenced dynamic model the differenced lagged
# dependent variable is generally correlated with the differenced error term,
# so plain OLS may be biased; an IV/GMM estimator (e.g. Anderson-Hsiao or
# Arellano-Bond) is the usual remedy -- confirm whether that is needed here.
fd_regressors = ['diff_GDP_lag', 'diff_Corporate Tax', 'diff_GERD', 'diff_Institutions']
y_diff = df_diff['diff_GDP']
X_diff = sm.add_constant(df_diff[fd_regressors])
model_diff = sm.OLS(endog=y_diff, exog=X_diff).fit()
print("\n=== First-Difference OLS Regression Results ===", model_diff.summary(), sep="\n")


=== First-Difference OLS Regression Results ===
                            OLS Regression Results                            
Dep. Variable:               diff_GDP   R-squared:                       0.122
Model:                            OLS   Adj. R-squared:                  0.054
Method:                 Least Squares   F-statistic:                     1.778
Date:                Tue, 11 Mar 2025   Prob (F-statistic):              0.148
Time:                        22:08:20   Log-Likelihood:                -1602.3
No. Observations:                  56   AIC:                             3215.
Df Residuals:                      51   BIC:                             3225.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------