# Wrapper Method(R 이용)

In [None]:
import pandas as pd

# Load the `df_robust` dataset from the mounted Google Drive folder.
df_robust = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/KUBIG/23-2 KDIS 장기 프로젝트 분반/df_robust'
)

# Separate the target from the features: 'Y' is the target, and the feature
# matrix is every remaining column except 'Country Code' and 'Year'.
y = df_robust['Y']
X = df_robust.drop(['Country Code', 'Year', 'Y'], axis=1)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression

def evaluate_RFmodel(columns):
    """Fit a random forest on the selected columns and print its scores.

    The features are taken from the module-level ``df_robust`` frame and the
    target is its ``'Y'`` column. The data is split 80/20 with a fixed seed,
    a seeded ``RandomForestRegressor`` is fitted on the larger part, and
    MAE, MSE and R2 are printed for the predictions on the 20% split.

    Args:
        columns: Column label(s) used to index ``df_robust`` for the features.

    Returns:
        None. All results are written to stdout.
    """
    train_X, test_X, train_y, test_y = train_test_split(
        df_robust[columns],
        df_robust['Y'],
        test_size=0.2,
        random_state=42,
    )

    regressor = RandomForestRegressor(random_state=42)
    regressor.fit(train_X, train_y)
    predictions = regressor.predict(test_X)

    # One (output template, metric function) pair per reported score,
    # in the order they are printed.
    report = (
        ('MAE : {:.6f}', mean_absolute_error),
        ('MSE : {:.6f}', mean_squared_error),
        ('R2 : {:.6f}', r2_score),
    )

    print('<Random Forest Regressor 학습 성능>\n')
    for template, metric in report:
        print(template.format(metric(test_y, predictions)))
    # Blank separator lines after the report.
    print('\n')

def evaluate_LRmodel(columns):
    """Fit a linear regression on the selected columns and print its scores.

    The features are taken from the module-level ``df_robust`` frame and the
    target is its ``'Y'`` column. The data is split 80/20 with a fixed seed,
    a ``LinearRegression`` model is fitted on the larger part, and MAE, MSE
    and R2 are printed for the predictions on the 20% split.

    Args:
        columns: Column label(s) used to index ``df_robust`` for the features.

    Returns:
        None. All results are written to stdout.
    """
    train_X, test_X, train_y, test_y = train_test_split(
        df_robust[columns],
        df_robust['Y'],
        test_size=0.2,
        random_state=42,
    )

    regressor = LinearRegression()
    regressor.fit(train_X, train_y)
    predictions = regressor.predict(test_X)

    # One (output template, metric function) pair per reported score,
    # in the order they are printed.
    report = (
        ('MAE : {:.6f}', mean_absolute_error),
        ('MSE : {:.6f}', mean_squared_error),
        ('R2 : {:.6f}', r2_score),
    )

    print('<Linear Regression 학습 성능>\n')
    for template, metric in report:
        print(template.format(metric(test_y, predictions)))

## 1) 전진선택법(forward)

*   중요하다고 생각되는 설명변수 하나부터 차례로 변수를 추가해가면서 성능지표를 비교
*   장점 : 이해하기 쉽고 변수의 개수가 많은 경우에도 사용 가능
*   단점 : 변수값의 작은 변동에도 결과가 크게 달라져 안정성 부족


- 통계적으로 유의하며 추정치의 절댓값이 높은 순으로 정렬한 결과
    - Life expectancy at birth: -7120.788
    - Survival to 65_female: -6562.008
    - GNI per capita: 5767.538
    - Life expectancy at 65: 5349.201
    - Survival to 65_male: 5203.545
    - Tax revenue (% of GDP): -4772.584
    - Average wages: -3261.010
    - Medium and high-tech exports (% manufactured exports): 3071.156
    - R&D spending: 2778.634
    - Population density: 2525.467
    - Military spending: 2420.866
    - Business extent of disclosure index (0-10): 2096.494
    - General government spending: 1932.426
    - Population ages 15-64 (% of total population): -1848.227
    - Merchandise trade (% of GDP): -1389.305
    - International tourism, receipts (% of total exports): 1363.922
    - Urban population: -1340.061
    - Labor force, female (% of total labor force): -1319.946
    - Industry (including construction), value added (% of GDP): 1258.450
    - Population growth (annual %): 1191.094
    - women parliaments: 1113.704
    - Maternal mortality ratio: 1032.937
    - Renewable energy consumption: -788.243
    - Unemployment rate: 769.830
    - ICT service exports: -727.162
    - Current account balance: 570.413
    - Fuel exports (% of merchandise exports): -507.711
    - Ores and metals exports (% of merchandise exports): 404.488
    - Exchange rates: -61.355

In [None]:
# Features from the forward-selection result, listed in the same order as the
# estimate list in the markdown above (largest absolute estimate first).
forward = [
    "Life expectancy at birth",
    "Survival to 65_female",
    "GNI per capita",
    "Life expectancy at 65",
    "Survival to 65_male",
    "Tax revenue (% of GDP)",
    "Average wages",
    "Medium and high-tech exports (% manufactured exports)",
    "R&D spending",
    "Population density",
    "Military spending",
    "Business extent of disclosure index (0-10)",
    "General government spending",
    "Population ages 15-64 (% of total population)",
    "Merchandise trade (% of GDP)",
    "International tourism, receipts (% of total exports)",
    "Urban population",
    "Labor force, female (% of total labor force)",
    "Industry (including construction), value added (% of GDP)",
    "Population growth (annual %)",
    # The entries above are the 20 names held in `forward_top20`.
    "women parliaments",
    "Maternal mortality ratio",
    "Renewable energy consumption",
    "Unemployment rate",
    "ICT service exports",
    "Current account balance",
    "Fuel exports (% of merchandise exports)",
    "Ores and metals exports (% of merchandise exports)",
    "Exchange rates",
]
# Notebook cell output: number of selected features.
len(forward)

29

In [None]:
# Score the forward-selection feature set with both models; each call prints
# MAE, MSE and R2 for predictions on the 20% split held out from fitting.
evaluate_RFmodel(forward)
evaluate_LRmodel(forward)

<Random Forest Regressor 학습 성능>

MAE : 762.142019
MSE : 2112808.805497
R2 : 0.970416


<Linear Regression 학습 성능>

MAE : 2195.553194
MSE : 8820390.462697
R2 : 0.876494


In [None]:
# Keep only the 20 highest-ranked forward-selection features.
# `forward` is already ordered by rank, so its first 20 entries are the top 20.
# Slicing keeps this list in sync with `forward` instead of maintaining a
# second hand-written copy of the same names that could drift out of date.
forward_top20 = forward[:20]

## 2) 후진소거법(backward)

*   가장 적은 영향을 주는 변수부터 하나씩 제거해가면서 성능지표를 비교
*   장점 : 전체 변수들의 정보를 이용할 수 있음
*   단점 : 변수의 개수가 많으면 사용하기 어려움



- Trade: -11790.498
- Exports of goods and services (% of GDP): 9855.689
- Life expectancy at birth: -6193.919
- Survival to 65_female: -6080.499
- Life expectancy at 65: 5071.972
- Tax revenue (% of GDP): -4561.893
- Survival to 65_male: 4411.818
- Medium and high-tech exports (% manufactured exports): 3322.561
- R&D spending: 3313.767
- Employment rate: -3291.365
- Average wages: -3150.394
- GDP per capita: 3072.874
- Labor force: 3022.578
- Old-age dependency ratio: 2948.707
- GNI per capita: 2775.809
- Population ages 0-14 (% of total population): 2624.665
- Military spending: 2547.310
- Population density: 2447.607
- Business extent of disclosure index (0-10): 1731.282
- General government spending: 1639.298
- Urban population: -1617.637
- Labor force, female (% of total labor force): -1534.132
- International tourism, receipts (% of total exports): 1290.671
- Population growth (annual %): 1179.593
- Maternal mortality ratio: 1118.328
- Renewable energy consumption: -837.601
- High-technology exports (% of manufactured exports): -833.706
- Gini index: -792.129
- Education spending: 773.878
- ICT service exports: -729.681
- Fuel exports (% of merchandise exports): -331.467
- Ores and metals exports (% of merchandise exports): 327.365
- Exchange rates: -56.077

In [None]:
# Features from the backward-elimination result, listed in the same order as
# the estimate list in the markdown above.
backward = [
    "Trade",
    "Exports of goods and services (% of GDP)",
    "Life expectancy at birth",
    "Survival to 65_female",
    "Life expectancy at 65",
    "Tax revenue (% of GDP)",
    "Survival to 65_male",
    "Medium and high-tech exports (% manufactured exports)",
    "R&D spending",
    "Employment rate",
    "Average wages",
    "GDP per capita",
    "Labor force",
    "Old-age dependency ratio",
    "GNI per capita",
    "Population ages 0-14 (% of total population)",
    "Military spending",
    "Population density",
    "Business extent of disclosure index (0-10)",
    "General government spending",
    # The entries above are the 20 names held in `backward_top20`.
    "Urban population",
    "Labor force, female (% of total labor force)",
    "International tourism, receipts (% of total exports)",
    "Population growth (annual %)",
    "Maternal mortality ratio",
    "Renewable energy consumption",
    "High-technology exports (% of manufactured exports)",
    "Gini index",
    "Education spending",
    "ICT service exports",
    "Fuel exports (% of merchandise exports)",
    "Ores and metals exports (% of merchandise exports)",
    "Exchange rates",
]
# Notebook cell output: number of selected features.
len(backward)

33

In [None]:
# Score the backward-elimination feature set with both models; each call
# prints MAE, MSE and R2 for predictions on the 20% split held out from fitting.
evaluate_RFmodel(backward)
evaluate_LRmodel(backward)

<Random Forest Regressor 학습 성능>

MAE : 792.282074
MSE : 2478570.439437
R2 : 0.965294


<Linear Regression 학습 성능>

MAE : 2181.988450
MSE : 9070987.804571
R2 : 0.872985


In [None]:
# Keep only the 20 highest-ranked backward-elimination features.
# `backward` is already ordered by rank, so its first 20 entries are the top 20.
# Slicing keeps this list in sync with `backward` instead of maintaining a
# second hand-written copy of the same names that could drift out of date.
backward_top20 = backward[:20]

## 3) 단계선택법(stepwise)

*   전진선택법에 의해 변수를 추가해가지만, 중요하지 않은 변수가 포함될 수 있기에, 각 단계에서 변수 중요도를 확인하면서 성능지표를 비교 -> 기존 변수 중요도가 악화되면 해당 변수 제거



- Life expectancy at birth: -6580.103
- Survival to 65_female: -6572.746
- GNI per capita: 5896.515
- Life expectancy at 65: 5341.505
- Survival to 65_male: 5054.514
- Tax revenue (% of GDP): -4784.247
- Average wages: -3570.896
- Medium and high-tech exports (% manufactured exports): 3422.856
- R&D spending: 3107.909
- Population density: 2609.874
- Military spending: 2436.077
- General government spending: 2022.596
- Business extent of disclosure index (0-10): 1937.677
- Population ages 15-64 (% of total population): -1790.117
- International tourism, receipts (% of total exports): 1483.375
- Labor force, female (% of total labor force): -1462.024
- Industry (including construction), value added (% of GDP): 1320.024
- Merchandise trade (% of GDP): -1270.251
- Urban population: -1243.462
- Population growth (annual %): 1158.809
- women parliaments: 1147.974
- Maternal mortality ratio: 997.609
- Renewable energy consumption: -937.431
- ICT service exports: -738.448
- High-technology exports (% of manufactured exports): -731.565
- Unemployment rate: 674.430
- Current account balance: 632.301
- Fuel exports (% of merchandise exports): -460.165
- Education spending: 452.145
- Ores and metals exports (% of merchandise exports): 419.912
- Exchange rates: -56.952

In [None]:
# Features from the stepwise-selection result, listed in the same order as
# the estimate list in the markdown above.
stepwise = [
    "Life expectancy at birth",
    "Survival to 65_female",
    "GNI per capita",
    "Life expectancy at 65",
    "Survival to 65_male",
    "Tax revenue (% of GDP)",
    "Average wages",
    "Medium and high-tech exports (% manufactured exports)",
    "R&D spending",
    "Population density",
    "Military spending",
    "General government spending",
    "Business extent of disclosure index (0-10)",
    "Population ages 15-64 (% of total population)",
    "International tourism, receipts (% of total exports)",
    "Labor force, female (% of total labor force)",
    "Industry (including construction), value added (% of GDP)",
    "Merchandise trade (% of GDP)",
    "Urban population",
    "Population growth (annual %)",
    # The entries above are the 20 names held in `stepwise_top20`.
    "women parliaments",
    "Maternal mortality ratio",
    "Renewable energy consumption",
    "ICT service exports",
    "High-technology exports (% of manufactured exports)",
    "Unemployment rate",
    "Current account balance",
    "Fuel exports (% of merchandise exports)",
    "Education spending",
    "Ores and metals exports (% of merchandise exports)",
    "Exchange rates",
]
# Notebook cell output: number of selected features.
len(stepwise)

31

In [None]:
# Score the stepwise-selection feature set with both models; each call prints
# MAE, MSE and R2 for predictions on the 20% split held out from fitting.
evaluate_RFmodel(stepwise)
evaluate_LRmodel(stepwise)

<Random Forest Regressor 학습 성능>

MAE : 770.656516
MSE : 2286303.861017
R2 : 0.967986


<Linear Regression 학습 성능>

MAE : 2200.421333
MSE : 8909244.739049
R2 : 0.875250


In [None]:
# Keep only the 20 highest-ranked stepwise-selection features.
# `stepwise` is already ordered by rank, so its first 20 entries are the top 20.
# Slicing keeps this list in sync with `stepwise` instead of maintaining a
# second hand-written copy of the same names that could drift out of date.
stepwise_top20 = stepwise[:20]