# In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

# Train a degree-6 polynomial Ridge regression on train.csv, report the
# hold-out R², then refit on all rows and write predictions for test.csv
# to submission.csv.

# Hyperparameters shared by the validation fit and the final fit, so the
# configuration that is validated is exactly the one used for submission.
TARGET = "gold price"
POLY_DEGREE = 6
RIDGE_ALPHA = 0.00001

# 1. Load the data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

X = train.drop(columns=[TARGET])
y = train[TARGET]

# 2. Hold out a validation split (used only to report R²)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Polynomial feature expansion, fitted on the training split only
poly = PolynomialFeatures(degree=POLY_DEGREE, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_val_poly = poly.transform(X_val)

# 4. Feature scaling; the statistics come from the training split only so
#    the validation rows do not influence the preprocessing
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_poly)
X_val_scaled = scaler.transform(X_val_poly)

# 5. Fit Ridge regression on the training split
model = Ridge(alpha=RIDGE_ALPHA)
model.fit(X_train_scaled, y_train)

# 6. Report the coefficient of determination on the validation split
val_pred = model.predict(X_val_scaled)
r2 = r2_score(y_val, val_pred)
print("검증 R² (결정계수):", round(r2, 6))

# 7. Refit on the full training data. Separate transformer instances are
#    used so that `poly` / `scaler` stay consistent with `model`, and
#    `final_poly` / `final_scaler` stay consistent with `final_model`.
final_poly = PolynomialFeatures(degree=POLY_DEGREE, include_bias=False)
final_scaler = StandardScaler()
X_full_poly = final_poly.fit_transform(X)
X_full_scaled = final_scaler.fit_transform(X_full_poly)
final_model = Ridge(alpha=RIDGE_ALPHA)
final_model.fit(X_full_scaled, y)

# 8. Predict the test set with the full-data transformers and model.
#    The test set is expanded only here, once, because only the final
#    model's predictions are written out.
X_test_poly_final = final_poly.transform(test)
X_test_scaled_final = final_scaler.transform(X_test_poly_final)
final_pred = final_model.predict(X_test_scaled_final)

# 9. Build the submission; IDs are the row index shifted by one, so with
#    the default 0-based index from read_csv they start at 1
submission = pd.DataFrame({
    "ID": test.index + 1,
    "gold price": final_pred
})
submission.to_csv("submission.csv", index=False)
print("✅ 제출 파일 저장 완료: submission.csv (ID는 1부터 시작)")

# Output:
# 검증 R² (결정계수): 0.994505
# ✅ 제출 파일 저장 완료: submission.csv (ID는 1부터 시작)
