# 다중회귀분석

## 2022년 단면 자료

In [5]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from linearmodels.panel import PanelOLS, RandomEffects

In [6]:
# Load the master dataset from a relative path. Later cells read the
# 'Country' and 'category' columns plus one column per year from it.
master_csv_path = "../data/master_data_by_category.csv"
df = pd.read_csv(master_csv_path)

In [7]:
# Keep only the 2022 observations and give the value column a generic name.
df_2022 = df[['Country', 'category', '2022']].rename(columns={'2022': 'Value'})

# Reshape to wide form: one row per Country, one column per category.
df_wide = df_2022.pivot(index='Country', columns='category', values='Value')
df_wide = df_wide.reset_index()

# Show which category columns are available for the regression.
print(list(df_wide.columns))

['Country', 'Business sophistication', 'Corporate Tax', 'Creative outputs', 'GDP', 'GDP_per_capita_PPP', 'GERD', 'GNI_per_capita', 'General Revenue', 'Global Innovation Index', 'Human capital and research', 'Infrastructure', 'Institutions', 'Internet Usage', 'Knowledge and technology outputs', 'Market sophistication', 'Patent Publications', 'Unemployment Rate', 'WIPO Tax']


#### 1-2. 종속변수와 독립변수 선택
- 종속변수: 'GDP'
- 독립변수: 'Corporate Tax', 'GERD', 'Institutions'

In [None]:
# Regression specification: GDP explained by tax, R&D spending and institutions.
dependent_var = 'GDP'
independent_vars = ['Corporate Tax', 'GERD', 'Institutions']

# Every column the model needs; rows missing any of them are dropped.
model_columns = [dependent_var, *independent_vars]
df_reg = df_wide.dropna(subset=model_columns).copy()

print("2022 단면 자료:")
print(df_reg[['Country', *model_columns]].head())

2022 단면 자료 미리보기:
category  Country           GDP  Corporate Tax    GERD  Institutions
0           China  1.788180e+13           25.0  2.5552          64.8
1          France  2.796300e+12           25.8  2.2331          77.0
2         Germany  4.163600e+12           29.9  3.1324          76.5
3           Japan  4.256410e+12           29.7  3.4054          75.8
4           Korea  1.673920e+12           27.5  5.2108          70.5


#### 1-3. 변수 간 다중공선성 확인 (VIF)

In [9]:
# Design matrix: the regressors plus an explicit intercept column.
X = df_reg[independent_vars]
X_const = sm.add_constant(X)

# One VIF per design-matrix column (the constant column is included too).
design_values = X_const.values
vif_values = [
    variance_inflation_factor(design_values, col_idx)
    for col_idx, _ in enumerate(X_const.columns)
]

vif_data = pd.DataFrame({"feature": X_const.columns, "VIF": vif_values})
print(vif_data)

         feature         VIF
0          const  278.878771
1  Corporate Tax    1.130578
2           GERD    1.114769
3   Institutions    1.025536


#### 1-4. OLS 다중 회귀 분석 수행

In [10]:
# Response vector taken from the same filtered rows as the design matrix.
y = df_reg[dependent_var]

# Specify the OLS model first, then estimate it.
ols_spec = sm.OLS(endog=y, exog=X_const)
model = ols_spec.fit()

print("OLS 회귀 분석 결과:")
print(model.summary())

OLS 회귀 분석 결과:
                            OLS Regression Results                            
Dep. Variable:                    GDP   R-squared:                       0.020
Model:                            OLS   Adj. R-squared:                 -0.960
Method:                 Least Squares   F-statistic:                   0.02053
Date:                Tue, 11 Mar 2025   Prob (F-statistic):              0.995
Time:                        21:40:20   Log-Likelihood:                -218.48
No. Observations:                   7   AIC:                             445.0
Df Residuals:                       3   BIC:                             444.8
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const          6.044e+12   8.38e

  warn("omni_normtest is not valid with less than 8 observations; %i "


# 패널 데이터 분석

#### 2-1. 원본 데이터를 long format으로 변환 (Country, category, Year, Value)

In [11]:
# Year columns 2013..2022 (inclusive) are stored as string column names.
first_year, last_year = 2013, 2022
year_cols = list(map(str, range(first_year, last_year + 1)))

# Melt to long form (Country, category, Year, Value) and make Year an integer.
df_long = df.melt(
    id_vars=['Country', 'category'],
    value_vars=year_cols,
    var_name='Year',
    value_name='Value',
).astype({'Year': int})

#### 2-2. 패널 데이터 형태로 변환
- pivot: index = [Country, Year], columns = category, values = Value

In [12]:
# Panel layout: rows indexed by (Country, Year), one column per category.
panel_keys = ['Country', 'Year']
panel_df = df_long.pivot(index=panel_keys, columns='category', values='Value')

print("패널 데이터")
print(panel_df.head())

패널 데이터
category      Business sophistication  Corporate Tax  Creative outputs  \
Country Year                                                             
China   2013                     42.9           25.0              31.9   
        2014                     41.8           25.0              35.7   
        2015                     44.9           25.0              35.1   
        2016                     53.8           25.0              42.7   
        2017                     54.5           25.0              45.3   

category               GDP  GDP_per_capita_PPP    GERD  GNI_per_capita  \
Country Year                                                             
China   2013  9.570470e+12          11872.4974  1.9979          6740.0   
        2014  1.047560e+13          12480.3385  2.0224          7470.0   
        2015  1.106160e+13          12897.5023  2.0570          7890.0   
        2016  1.123330e+13          13483.3773  2.1003          8210.0   
        2017  1.231050e+13    

In [13]:
# Drop country-year rows that are missing the dependent variable or any regressor.
panel_required = [dependent_var, *independent_vars]
panel_reg = panel_df.dropna(subset=panel_required).copy()

#### 2-4. 고정효과 모델 (Fixed Effects Model)
 - 독립변수에 상수항 추가 (linearmodels의 PanelOLS는 상수항을 자동으로 포함하지 않으므로 `sm.add_constant`로 직접 추가)

In [14]:
# Dependent variable and regressors; the constant column is added explicitly.
dep = panel_reg[dependent_var]
exog = sm.add_constant(panel_reg[independent_vars])

# Entity fixed effects, with standard errors clustered by entity.
fe_model = PanelOLS(dependent=dep, exog=exog, entity_effects=True)
fe_results = fe_model.fit(cov_type='clustered', cluster_entity=True)

print("Fixed Effects Model 결과:")
print(fe_results.summary)

Fixed Effects Model 결과:
                          PanelOLS Estimation Summary                           
Dep. Variable:                    GDP   R-squared:                        0.4871
Estimator:                   PanelOLS   R-squared (Between):             -0.3811
No. Observations:                  70   R-squared (Within):               0.4871
Date:                Tue, Mar 11 2025   R-squared (Overall):             -0.3393
Time:                        21:43:00   Log-likelihood                   -2037.1
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      18.996
Entities:                           7   P-value                           0.0000
Avg Obs:                      10.0000   Distribution:                    F(3,60)
Min Obs:                      10.0000                                           
Max Obs:                      10.0000   F-statistic (robust):             3.8199
    

#### 2-5. 랜덤효과 모델 (Random Effects Model)

In [15]:
# Random-effects estimator on the same dependent variable and regressors
# (constant included) that the fixed-effects cell prepared.
re_model = RandomEffects(dependent=dep, exog=exog)
re_results = re_model.fit()

print("Random Effects Model 결과:")
print(re_results.summary)

Random Effects Model 결과:
                        RandomEffects Estimation Summary                        
Dep. Variable:                    GDP   R-squared:                        0.4623
Estimator:              RandomEffects   R-squared (Between):             -0.3710
No. Observations:                  70   R-squared (Within):               0.4870
Date:                Tue, Mar 11 2025   R-squared (Overall):             -0.3297
Time:                        21:43:10   Log-likelihood                   -2039.8
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      18.918
Entities:                           7   P-value                           0.0000
Avg Obs:                      10.0000   Distribution:                    F(3,66)
Min Obs:                      10.0000                                           
Max Obs:                      10.0000   F-statistic (robust):             18.918
   