In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# 보스턴 주택 가격 데이터셋 로드
boston = load_boston()

# 데이터셋을 DataFrame으로 변환
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df['target'] = boston.target

# 입력 데이터와 출력 데이터
X = df.drop('target', axis=1)
y = df['target']

# 데이터 분할: 학습 데이터와 테스트 데이터
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 선형 회귀 모델 생성
model = LinearRegression()

# 모델 학습
model.fit(X_train, y_train)

# 학습된 모델을 사용하여 테스트 데이터 예측
y_pred = model.predict(X_test)

# 평균 제곱 오차(Mean Squared Error) 계산
before_mse = mean_squared_error(y_test, y_pred)
print("BEFORE MSE:", before_mse)

BEFORE MSE: 24.2911194749736


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
import itertools
from tqdm import tqdm
import json

task = None

# Dataload
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df['target'] = boston.target

# Check Data Task
label_mean = np.mean(df['target'])
label_std = np.std(df['target'])

# isRegression
if label_std > 0:  
    task = 'reg'

# Preprocess
def apply_ordinal_encoder(df):
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

    for column in df.columns:
        if df[column].dtype == 'object' or df[column].dtype == 'str':
            encoded_data = encoder.fit_transform(df[[column]])
            df[column] = encoded_data
    return df


def scaler(df):
    scaler = StandardScaler()
    scaled_df = scaler.fit_transform(df)
    return pd.DataFrame(scaled_df, columns=df.columns).astype('float32')


X = apply_ordinal_encoder(df.drop(['target'], axis=1))
X = scaler(X)


# SET eval model
def evaluate_model(X_train, X_test, y_train, y_test, task):
    if task == 'reg':
        model = DecisionTreeRegressor()
    else:
        model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    return score


# Make Golden Feature
def create_new_features(df):
    new_features = pd.DataFrame()
    
    # Feature Combinations
    combinations = list(itertools.combinations(df.columns, 2))

    # plus, multiply, devide, minus
    for col1, col2 in combinations:
        new_features.loc[:, f'{col1}_plus_{col2}'] = df[col1] + df[col2]
        new_features.loc[:, f'{col1}_multiply_{col2}'] = df[col1] * df[col2]
        new_features.loc[:, f'{col1}_divide_{col2}'] = df[col1] / df[col2]
        new_features.loc[:, f'{col1}_minus_{col2}'] = df[col1] - df[col2]

    return new_features


X_train_new = X.copy()
new_features = create_new_features(X_train_new)

# Each Golden Feature Scoring
scores = []
for new_feature in tqdm(new_features.columns):
    new_df = pd.concat([X_train_new, new_features[[new_feature]]], axis=1)

    # Test
    if task == 'reg':
        X_train, X_test, y_train, y_test = train_test_split(new_df, y, random_state=42)
    else:
        X_train, X_test, y_train, y_test = train_test_split(new_df, y, stratify=y, random_state=42)
    score = evaluate_model(X_train, X_test, y_train, y_test, task)
    scores.append((new_feature, score))

# Sorting Score
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)

# Make Dict
results = []
for feature, score in sorted_scores:
    result = {'Feature': feature, 'Score': score}
    results.append(result)
top10 = results[:10]
top10

100%|███████████████████████████████████████████████████████████████████████████████| 312/312 [00:01<00:00, 225.51it/s]


[{'Feature': 'NOX_plus_PTRATIO', 'Score': 0.8822110289465535},
 {'Feature': 'RM_divide_B', 'Score': 0.8669953317836949},
 {'Feature': 'TAX_plus_PTRATIO', 'Score': 0.8668896359860812},
 {'Feature': 'RM_divide_RAD', 'Score': 0.8664016362396519},
 {'Feature': 'TAX_divide_B', 'Score': 0.8661958667613189},
 {'Feature': 'B_divide_LSTAT', 'Score': 0.8649016554522404},
 {'Feature': 'RM_multiply_AGE', 'Score': 0.8635321077767777},
 {'Feature': 'CRIM_multiply_PTRATIO', 'Score': 0.8631138222798383},
 {'Feature': 'DIS_multiply_LSTAT', 'Score': 0.8619849012074536},
 {'Feature': 'CRIM_divide_AGE', 'Score': 0.8617352792173447}]

In [3]:
# 보스턴 주택 가격 데이터셋 로드
boston = load_boston()

# 데이터셋을 DataFrame으로 변환
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df['target'] = boston.target

for col in top10:
    tmp = list(col.values())[0].split('_')
    f1, f2 = tmp[0], tmp[-1]
    if tmp[1] == 'multiply':
        df.loc[:, f'{f1}_multiply_{f2}'] = df[f1] * df[f2]
    elif tmp[1] == 'minus':
        df.loc[:, f'{f1}_minus_{f2}'] = df[f1] - df[f2]
    elif tmp[1] == 'plus':
        df.loc[:, f'{f1}_plus_{f2}'] = df[f1] + df[f2]
    elif tmp[1] == 'divide':
        df.loc[:, f'{f1}_divide_{f2}'] = df[f1] / df[f2]
        
# 입력 데이터와 출력 데이터
X = df.drop('target', axis=1)
y = df['target']

# 데이터 분할: 학습 데이터와 테스트 데이터
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 선형 회귀 모델 생성
model = LinearRegression()

# 모델 학습
model.fit(X_train, y_train)

# 학습된 모델을 사용하여 테스트 데이터 예측
y_pred = model.predict(X_test)

# 평균 제곱 오차(Mean Squared Error) 계산
mse = mean_squared_error(y_test, y_pred)
print("BEFORE MSE:", before_mse)
print("AFTER MSE:", mse)

BEFORE MSE: 24.2911194749736
AFTER MSE: 17.55893128587233
