In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LassoCV
import statsmodels.api as sm

import numpy as np
import pandas as pd

In [2]:
def read_data():
    x_train, y_train, x_test = [], [], []
    with open('a3-train-features.txt', 'rt') as f:
        for line in f:
            x_train.append([float(s) for s in line.strip().split('\t')])
    with open('a3-train-outcome.txt.', 'rt') as f:
        for line in f:
            y_train.append(float(line.strip()))
    with open('a3-test-features.txt', 'rt') as f:
        for line in f:
            x_test.append([float(s) for s in line.strip().split('\t')])
    return np.array(x_train), np.array(y_train), np.array(x_test)

In [3]:
def stepwise_variable_selection(x_train, y_train, th_in = 0.01, th_out = 0.05, 
                               ):
    x_train = pd.DataFrame(x_train, 
                           columns = [f'variable_{i}' for i in range(x_train.shape[1])])
    variables = x_train.columns
    
    sl_vars = []
    sv_per_step = []
    adjusted_r_squared = []
    steps = []
    step = 0
    
    
    while len(variables) > 0:
        remainder = list(set(variables) - set(sl_vars))
        pval = pd.Series(index = remainder)
    
        for col in remainder:
            X = x_train.loc[:, sl_vars + [col]]
            X = sm.add_constant(X)
            model = sm.OLS(y_train, X).fit(disp = 0)
            pval.loc[col] = model.pvalues[-1]
        
        if pval.min() < th_in:
            sl_vars.append(pval.idxmin())
    
            while len(sl_vars) > 0:
                X = x_train.loc[:, sl_vars]
                X = sm.add_constant(X)
                sl_pval = pd.Series(sm.OLS(y_train, X).fit(disp = 0).pvalues[1:], 
                                    index = sl_vars)
                
                if sl_pval.max() >= th_out:
                    sl_vars.remove(sl_pval.idxmax())
                else:
                    break

            
            step += 1
            steps.append(step)
            adj_r_squared = sm.OLS(y_train, sm.add_constant(x_train.loc[:, sl_vars])).fit(disp = 0).rsquared_adj
            adjusted_r_squared.append(adj_r_squared)
            sv_per_step.append(sl_vars.copy())
        else:
            break 
                
    return sl_vars

In [35]:
def sfs_selection(X_train, Y_train, X_val):
    sfs = SequentialFeatureSelector(Lasso(), 
                                   n_features_to_select = 'auto', 
                                   tol = 0.05, 
                                   direction = 'forward', 
                                   scoring = 'r2', 
                                   cv = 5)
    X_train = sfs.fit_transform(X_train, Y_train)
    X_val = sfs.transform(X_val)
    
    return X_train, X_val

#### 데이터 읽기 

In [6]:
x_train, y_train, x_test = read_data()

#### 데이터 분리

In [29]:
X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train)

#### 실험 
- stepwise variable selection 
- sfs

- stepwise variable selection

In [44]:
X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train)

index = stepwise_variable_selection(X_train, Y_train)
X_train, X_val = pd.DataFrame(X_train, columns = [f'variable_{i}' for i in range(X_train.shape[1])]), pd.DataFrame(X_val, columns = [f'variable_{i}' for i in range(X_val.shape[1])])
X_train, X_val = np.array(X_train.loc[:, index]), np.array(X_val.loc[:, index])


alphas = np.logspace(-4, 4, 100)
lasso_cv1 = LassoCV(alphas = alphas, cv = 5, random_state = 516, max_iter = 3000)
lasso_cv1.fit(X_train, Y_train)
lasso_cv1.score(X_val, Y_val)

  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pval

  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pval

  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]


  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]


  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pval

  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pval

  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pval

  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pval

  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pval

  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pval

  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]
  pval.loc[col] = model.pvalues[-1]


0.8945476542492333

In [45]:
X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train)

X_train, X_val = pd.DataFrame(X_train, columns = [f'variable_{i}' for i in range(X_train.shape[1])]), pd.DataFrame(X_val, columns = [f'variable_{i}' for i in range(X_val.shape[1])])
X_train, X_val = sfs_selection(X_train, Y_train, X_val)

alphas = np.logspace(-4, 4, 100)
lasso_cv2 = LassoCV(alphas = alphas, cv = 5, random_state = 516, max_iter = 3000)
lasso_cv2.fit(X_train, Y_train)
lasso_cv2.score(X_val, Y_val)

0.9097965164576285

In [47]:
if __name__ == '__main__':
    
    x_train, y_train, x_test = read_data()
    x_train, x_test = pd.DataFrame(x_train, columns = [f'variable_{i}' for i in range(x_train.shape[1])]), pd.DataFrame(x_test, columns = [f'variable_{i}' for i in range(x_test.shape[1])]) 

    x_train, x_test = sfs_selection(x_train, y_train, x_test)
    
    
    alphas = np.logspace(-4, 4, 100)
    lasso_cv = LassoCV(alphas = alphas, cv = 5, random_state = 42, max_iter = 3000)
    lasso_cv.fit(x_train, y_train)
    pred = lasso_cv.predict(x_test)
    
    with open('predictions.txt', 'wt') as fout:
        for p in pred:
            print(p, file = fout)     

-----------------------------------------------------------
## SFS의 모델 의존적 다중공선성 처리 방식 설명

**핵심 개념**:  
SFS(Sequential Feature Selector)는 사용된 머신러닝 모델의 **자체 특성**을 통해 다중공선성을 간접적으로 처리합니다. 이 방식은 통계적 방법(VIF, 상관계수 분석 등)과 달리 명시적인 다중공선성 검증 절차 없이 모델 성능 최적화에 초점을 맞춥니다[1][6].

---

### 주요 작동 메커니즘
#### 1. **모델 내재적 처리 능력 활용**  
| 모델 유형       | 다중공선성 처리 방식                | 예시                     |
|-----------------|--------------------------------------|--------------------------|
| 트리 기반 모델  | 특성 중요도 자동 가중치 조정        | RandomForest, XGBoost   |
| 정규화 회귀     | L1/L2 패널티로 계수 크기 제한       | Lasso, Ridge            |
| 선형 회귀       | 명시적 처리 없음 → 취약성 존재      | OLS                     

**예시 코드**:  
```python
# Lasso 모델 사용 시 자동 다중공선성 완화
from sklearn.linear_model import LassoCV
sfs = SequentialFeatureSelector(
    estimator=LassoCV(), 
    n_features_to_select=15
)
```

#### 2. **성능 기반 특성 제거**  
교차 검증 과정에서 **상관관계가 높은 특성 그룹** 중 모델 예측에 덜 기여하는 변수를 단계별로 제거  
→ 모델의 손실 함수(loss function)가 다중공선성 영향 자동 감지[6][8]

---

### 전통적 방법과의 차이점  
| 방식                | 다중공선성 검출 방법      | 처리 메커니즘           |
|----------------------|---------------------------|-------------------------|
| 통계적 접근(VIF 등) | 수학적 공식 기반          | 변수 제거/변환         |
| SFS                  | 모델 예측 성능 변화 관측  | 알고리즘 자체 최적화   |

**실제 사례**:  
Boston 주택가격 데이터에서 `TAX`-`RAD`의 상관계수 0.91 → SFS가 릿지 회귀 사용 시 두 변수 중 하나만 선택[2][7]

---

### 주의 필요 상황  
1. **선형 모델 사용 시**:  
   ```python
   # LinearRegression은 다중공선성에 취약
   sfs_linear = SequentialFeatureSelector(
       estimator=LinearRegression(), 
       scoring='r2'
   )
   ```
   → 결과 해석 시 회귀 계수 신뢰도 저하 가능성[4]

2. **고차원 데이터**:  
   피처 수 > 샘플 수인 경우 모델 선택이 결정적 영향  
   → L1 정규화 모델 권장[1][5]

---

### 최적 활용 전략  
```python
# 트리 모델 + SFS 조합 예시
from sklearn.ensemble import GradientBoostingRegressor
sfs_optimized = SequentialFeatureSelector(
    estimator=GradientBoostingRegressor(),
    scoring='neg_mean_absolute_error',
    cv=5
)
```
이 방식은 모델이 **비선형 관계 처리 능력**과 **내재적 특성 선택 메커니즘**을 동시에 활용합니다[3][8].

Citations:
[1] https://dodonam.tistory.com/464
[2] https://ysyblog.tistory.com/122
[3] https://dacon.io/competitions/open/235698/talkboard/404068
[4] https://sseozytank.tistory.com/54
[5] https://velog.io/@do_genie/Feature-Selection
[6] https://datascienceschool.net/03%20machine%20learning/06.04%20%EB%8B%A4%EC%A4%91%EA%B3%B5%EC%84%A0%EC%84%B1%EA%B3%BC%20%EB%B3%80%EC%88%98%20%EC%84%A0%ED%83%9D.html
[7] https://blog.naver.com/discoveringjmp/221912144879
[8] https://narrowmoon.tistory.com/6

---
Perplexity로부터의 답변: pplx.ai/share