#식당 월 매출 예측
식당 방문객 수, 메뉴 평균 가격, 요리 유형 등 관련 데이터로부터 식당 월 매출을 예측하는, 회귀 문제를 해결해 봅시다.

본 과제는 2점 만점이며, 결정계수(Coefficient of Determination)를 평가 지표로 사용합니다.
Baseline보다 높은 결정계수 달성: 2.0점
Baseline의 결정계수와 동일(소수 5 번째 자리까지)하거나, 낮은 결정계수 달성: 1.0점
미제출: 0점
(주의) 허용할 수 없을 정도로 낮은 결정계수로 기록된 것도 미제출로 간주될 수 있습니다.

현재 Baseline은 Keras 라이브러리의 Dense 클래스로 구성된 인공신경망 모델을 활용해 도출됐습니다.
데이터 세트 분할 시, random_state 값은 42로 지정했습니다.
수업 시간에 다뤘던 인공신경망 모델 구조 설계 및 하이퍼파라미터와 관련된 내용을 복습하고, 과제 3을 해결해 봅시다.

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import r2_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# 1) 데이터 로드
train = pd.read_csv("train.csv")   # 경로 맞게
test = pd.read_csv("test.csv")
submission = pd.read_csv("submission.csv")

X = train.drop(columns=["Monthly_Revenue"])
y = train["Monthly_Revenue"]

# 2) 숫자형 / 범주형 분리
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(exclude=["object"]).columns.tolist()

print("범주형:", cat_cols)
print("숫자형:", num_cols)

# 3) 전처리 파이프라인: 범주형 → OneHot, 숫자형 → 표준화
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", StandardScaler(), num_cols),
    ]
)

# 4) train/valid 분할 (random_state=42 필수)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 전처리 적용
X_train_p = preprocess.fit_transform(X_train)
X_val_p = preprocess.transform(X_val)

input_dim = X_train_p.shape[1]
print("입력 차원:", input_dim)

# 5) 인공신경망 모델 정의 (이전보다 더 큰 모델)
tf.random.set_seed(42)

model = Sequential([
    Dense(128, activation='relu', input_shape=(input_dim,)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)  # 회귀니까 출력 1, activation 없음
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='mse'
)

# 조기 종료 콜백 (과적합 방지, 하지만 충분히 학습되도록 patience 크게)
es = EarlyStopping(
    monitor='val_loss',
    patience=30,
    restore_best_weights=True
)

# 6) 학습
history = model.fit(
    X_train_p, y_train,
    validation_data=(X_val_p, y_val),
    epochs=500,
    batch_size=32,
    callbacks=[es],
    verbose=0  # 진행상태 보고 싶으면 1로
)

# 7) 검증셋 예측 + R^2 계산
y_val_pred = model.predict(X_val_p).flatten()
val_r2 = r2_score(y_val, y_val_pred)
print("Validation R^2:", val_r2)


범주형: ['Cuisine_Type']
숫자형: ['Number_of_Customers', 'Menu_Price', 'Marketing_Spend', 'Average_Customer_Spending', 'Promotions', 'Reviews']
입력 차원: 10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
Validation R^2: 0.6002546037082983


In [2]:
# 8) 전체 데이터에 대해 전처리를 다시 fit
X_all = train.drop(columns=["Monthly_Revenue"])
y_all = train["Monthly_Revenue"]

X_all_p = preprocess.fit_transform(X_all)
X_test_p = preprocess.transform(test)

# 9) 같은 구조의 모델을 새로 만들어서 전체 데이터로 학습
tf.random.set_seed(42)

model_final = Sequential([
    Dense(128, activation='relu', input_shape=(X_all_p.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)
])

model_final.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='mse'
)

es_final = EarlyStopping(
    monitor='loss',
    patience=30,
    restore_best_weights=True
)

model_final.fit(
    X_all_p, y_all,
    epochs=500,
    batch_size=32,
    callbacks=[es_final],
    verbose=0
)

# 10) test 예측
y_pred = model_final.predict(X_test_p).flatten()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step


In [3]:
import pandas as pd
import numpy as np

df = pd.read_csv("submission.csv")
df.dropna(axis=1, inplace=True)

# 여기!! np.arange(...) 대신 우리가 만든 예측값
df["Monthly_Revenue"] = y_pred

df.to_csv("new_submission.csv", index=False)
print("✅ new_submission.csv 저장 완료")


✅ new_submission.csv 저장 완료


#2트

In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import r2_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# 1) 데이터 로드
train = pd.read_csv("train.csv")   # 경로 맞게
test = pd.read_csv("test.csv")
submission = pd.read_csv("submission.csv")

X = train.drop(columns=["Monthly_Revenue"])
y = train["Monthly_Revenue"]

# 2) 숫자형 / 범주형 분리
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(exclude=["object"]).columns.tolist()

print("범주형:", cat_cols)
print("숫자형:", num_cols)

# 3) 전처리 파이프라인: 범주형 → OneHot, 숫자형 → 표준화
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", StandardScaler(), num_cols),
    ]
)

# 4) train/valid 분할 (random_state=42 필수)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 전처리 적용
X_train_p = preprocess.fit_transform(X_train)
X_val_p = preprocess.transform(X_val)

input_dim = X_train_p.shape[1]
print("입력 차원:", input_dim)

# 5) 인공신경망 모델 정의 (이전보다 더 큰 모델)
tf.random.set_seed(42)

model = Sequential([
    Dense(128, activation='relu', input_shape=(input_dim,)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)  # 회귀니까 출력 1, activation 없음
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='mse'
)

# 조기 종료 콜백 (과적합 방지, 하지만 충분히 학습되도록 patience 크게)
es = EarlyStopping(
    monitor='val_loss',
    patience=30,
    restore_best_weights=True
)

# 6) 학습
history = model.fit(
    X_train_p, y_train,
    validation_data=(X_val_p, y_val),
    epochs=500,
    batch_size=32,
    callbacks=[es],
    verbose=0  # 진행상태 보고 싶으면 1로
)

# 7) 검증셋 예측 + R^2 계산
y_val_pred = model.predict(X_val_p).flatten()
val_r2 = r2_score(y_val, y_val_pred)
print("Validation R^2:", val_r2)


범주형: ['Cuisine_Type']
숫자형: ['Number_of_Customers', 'Menu_Price', 'Marketing_Spend', 'Average_Customer_Spending', 'Promotions', 'Reviews']
입력 차원: 10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
Validation R^2: 0.6028363962789323


In [11]:
# 8) 전체 데이터에 대해 전처리를 다시 fit
X_all = train.drop(columns=["Monthly_Revenue"])
y_all = train["Monthly_Revenue"]

X_all_p = preprocess.fit_transform(X_all)
X_test_p = preprocess.transform(test)

# 9) 같은 구조의 모델을 새로 만들어서 전체 데이터로 학습
tf.random.set_seed(42)

model_final = Sequential([
    Dense(128, activation='relu', input_shape=(X_all_p.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)
])

model_final.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='mse'
)

es_final = EarlyStopping(
    monitor='loss',
    patience=30,
    restore_best_weights=True
)

model_final.fit(
    X_all_p, y_all,
    epochs=500,
    batch_size=32,
    callbacks=[es_final],
    verbose=0
)

# 10) test 예측
y_pred = model_final.predict(X_test_p).flatten()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step


In [12]:
import pandas as pd
import numpy as np

df = pd.read_csv("submission.csv")
df.dropna(axis=1, inplace=True)

# 여기!! np.arange(...) 대신 우리가 만든 예측값
df["Monthly_Revenue"] = y_pred

df.to_csv("new_submission.csv", index=False)
print("✅ new_submission.csv 저장 완료")


✅ new_submission.csv 저장 완료


#3트

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

# 1) 데이터 로드
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("submission.csv")

X = train.drop(columns=["Monthly_Revenue"])
y = train["Monthly_Revenue"]

# 2) 범주형/수치형 컬럼
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(exclude=["object"]).columns.tolist()

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", StandardScaler(), num_cols),
    ]
)

# 3) train/valid split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train_p = preprocess.fit_transform(X_train)
X_val_p = preprocess.transform(X_val)

# 4) RandomForest 모델
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train_p, y_train)
y_val_pred = rf.predict(X_val_p)
val_r2 = r2_score(y_val, y_val_pred)
print("RF Validation R^2:", val_r2)


RF Validation R^2: 0.6175187834544955


In [2]:
# 전체 데이터로 다시 학습
X_all_p = preprocess.fit_transform(X)
X_test_p = preprocess.transform(test)

rf_final = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1
)

rf_final.fit(X_all_p, y)
y_pred = rf_final.predict(X_test_p)

# 제출 파일
df = pd.read_csv("submission.csv")
df.dropna(axis=1, inplace=True)
df["Monthly_Revenue"] = y_pred
df.to_csv("new_submission.csv", index=False)
print("new_submission.csv 저장 완료")


new_submission.csv 저장 완료


#4트

In [8]:
!pip install catboost

Collecting catboost
  Using cached catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor

# 1) 데이터 불러오기
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("submission.csv")

X = train.drop(columns=["Monthly_Revenue"])
y = train["Monthly_Revenue"]

# 2) 범주형 컬럼 자동 지정 (object dtype)
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

# 3) train/valid split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4) CatBoostRegressor (튜닝 필요 없음)
model = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.03,
    depth=6,
    loss_function="RMSE",
    random_seed=42,
    verbose=False
)

# 5) 학습
model.fit(
    X_train, y_train,
    cat_features=cat_cols,
    eval_set=(X_val, y_val),
    verbose=False
)

# 6) Validation R^2
val_pred = model.predict(X_val)
val_r2 = r2_score(y_val, val_pred)
print("Validation R^2:", val_r2)


Validation R^2: 0.6375314065701064


In [12]:
# 전체 train 학습
model.fit(
    X, y,
    cat_features=cat_cols,
    verbose=False
)

# test 예측
y_pred = model.predict(test)

# 교수님 포맷
df = pd.read_csv("submission.csv")
df.dropna(axis=1, inplace=True)
df["Monthly_Revenue"] = y_pred
df.to_csv("new_submission.csv", index=False)

print("new_submission.csv 저장 완료!")


new_submission.csv 저장 완료!


#5트

In [2]:
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# 1) 데이터 불러오기
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("submission.csv")

X = train.drop(columns=["Monthly_Revenue"])
y = train["Monthly_Revenue"]

# 2) 범주형 컬럼을 category 타입으로 변경 (여기가 핵심!!)
cat_cols = ["Cuisine_Type"]

for c in cat_cols:
    X[c] = X[c].astype("category")
    test[c] = test[c].astype("category")

# 3) train/valid split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4) LightGBM 모델 정의
model = LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.02,
    max_depth=-1,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    random_state=42
)

# 5) 학습 (categorical_feature 따로 안 줘도 category dtype이면 인식함)
model.fit(X_train, y_train)

# 6) 검증 R^2 확인
val_pred = model.predict(X_val)
val_r2 = r2_score(y_val, val_pred)
print("Validation R^2:", val_r2)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000240 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 838
[LightGBM] [Info] Number of data points in the train set: 640, number of used features: 7
[LightGBM] [Info] Start training from score 271.023584
Validation R^2: 0.5939960406643674


In [3]:
model.fit(X, y, categorical_feature=cat_cols)
y_pred = model.predict(test)

df = pd.read_csv("submission.csv")
df.dropna(axis=1, inplace=True)
df["Monthly_Revenue"] = y_pred
df.to_csv("new_submission.csv", index=False)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000096 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 963
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 7
[LightGBM] [Info] Start training from score 271.063399


#6트

In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score

# 1. 데이터 로드
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
submission_df = pd.read_csv("submission.csv")

# 2. 파생 변수 생성 (가장 중요한 단계)
# 매출 ≈ 방문객 수 * 1인당 지출 이라는 물리적 관계를 모델에 힌트로 줍니다.
train_df['Interaction'] = train_df['Number_of_Customers'] * train_df['Average_Customer_Spending']
test_df['Interaction'] = test_df['Number_of_Customers'] * test_df['Average_Customer_Spending']

# 3. 데이터 분리 (X, y)
X = train_df.drop('Monthly_Revenue', axis=1)
y = train_df['Monthly_Revenue']
X_test = test_df.copy() # 테스트 데이터

# 4. 전처리 (스케일링 및 인코딩)
# 수치형 컬럼과 범주형 컬럼 구분
numeric_features = ['Number_of_Customers', 'Menu_Price', 'Marketing_Spend',
                    'Average_Customer_Spending', 'Promotions', 'Reviews', 'Interaction']
categorical_features = ['Cuisine_Type']

# 전처리 파이프라인 정의
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features)
    ])

# 전처리 수행
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(X_test)

# 학습/검증 데이터 분리 (Baseline 조건: random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# 5. Keras 인공신경망 모델 설계
model = keras.Sequential([
    # 입력층 (전처리된 feature 개수에 맞춤)
    layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    # 출력층 (회귀이므로 활성화 함수 없음)
    layers.Dense(1)
])

# 모델 컴파일 (Optimizer: Adam, Loss: MSE, Metric: MAE)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001),
              loss='mse',
              metrics=['mae'])

# 6. 모델 학습
history = model.fit(
    X_train, y_train,
    epochs=150,           # 충분한 학습 횟수
    batch_size=32,
    validation_data=(X_val, y_val),
    verbose=1
)

# 7. 검증 데이터 평가 (R2 Score 확인)
y_val_pred = model.predict(X_val)
r2 = r2_score(y_val, y_val_pred)
print(f"\n검증 데이터 R2 Score: {r2:.5f}")

# 8. 최종 예측 및 저장
final_predictions = model.predict(X_test_processed)

# submission 파일 양식에 맞게 저장
# flatten()을 사용하여 2차원 배열을 1차원으로 변환
submission_df.dropna(axis=1, inplace=True) # 기존 예시 코드 반영
submission_df["Monthly_Revenue"] = final_predictions.flatten()

submission_df.to_csv("new_submission.csv", index=False)
print("new_submission.csv 저장 완료!")

Epoch 1/150


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - loss: 85269.2656 - mae: 274.1423 - val_loss: 83249.3359 - val_mae: 268.5916
Epoch 2/150
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 81870.2031 - mae: 267.0713 - val_loss: 78587.3047 - val_mae: 260.0186
Epoch 3/150
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 74523.8594 - mae: 255.8828 - val_loss: 64826.3320 - val_mae: 233.4296
Epoch 4/150
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 57368.9531 - mae: 220.0125 - val_loss: 36613.6641 - val_mae: 169.3632
Epoch 5/150
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 25377.5977 - mae: 139.6772 - val_loss: 9028.0605 - val_mae: 77.4699
Epoch 6/150
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 5533.0674 - mae: 58.6347 - val_loss: 6312.1201 - val_mae: 62.7728
Epoch 7/150
[1m20/20[0m [32m━━━━━━━━

#7트

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

# ======== 1. 데이터 불러오기 ========
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

X = train.drop("Monthly_Revenue", axis=1)
y = train["Monthly_Revenue"]

categorical_cols = ["Cuisine_Type", "Promotions"]
numeric_cols = [c for c in X.columns if c not in categorical_cols]

# ======== 2. 전처리 파이프라인 ========
preprocess = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ("num", StandardScaler(), numeric_cols)
])

# ======== 3. 여러 모델 준비(앙상블 후보) ========

model_rf = RandomForestRegressor(
    n_estimators=400,
    max_depth=12,
    random_state=42
)

model_xgb = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42
)

model_mlp = MLPRegressor(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    solver='adam',
    max_iter=700,
    random_state=42
)

# ======== 4. 하나씩 학습해보고 R² 비교 ========
models = {
    "RandomForest": model_rf,
    "XGBoost": model_xgb,
    "MLP": model_mlp
}

results = {}

for name, model in models.items():
    pipe = Pipeline([
        ("pre", preprocess),
        ("model", model)
    ])

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_val)
    r2 = r2_score(y_val, pred)
    results[name] = (pipe, r2)
    print(f"{name} Validation R2:", r2)

# ======== 5. 가장 성능 좋은 모델 선택 ========
best_model_name = max(results, key=lambda x: results[x][1])
best_model, best_r2 = results[best_model_name]

print("\n📌 선택된 최종 모델:", best_model_name)
print("📌 최고 Validation R²:", best_r2)

# ======== 6. test.csv 예측 ========
y_pred = best_model.predict(test)

# ======== 7. 제출 파일 생성 ========
submission = pd.read_csv("submission.csv")
submission["Monthly_Revenue"] = y_pred
submission.to_csv("new_submission.csv", index=False)

print("\n🎉 제출 파일 생성 완료: new_submission.csv")


RandomForest Validation R2: 0.6177710093141922
XGBoost Validation R2: 0.6175104509321083
MLP Validation R2: 0.6134726558452552

📌 선택된 최종 모델: RandomForest
📌 최고 Validation R²: 0.6177710093141922

🎉 제출 파일 생성 완료: new_submission.csv


