In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# End-to-end rain classification script: load the CSVs, encode the target,
# split, standardize, train a random forest, report hold-out accuracy and
# write the submission file.

# 1. Load the training and test data.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# 2. Preprocessing: separate the features from the target and encode the
#    string labels as integers.
label_to_int = {'no rain': 0, 'rain': 1}
int_to_label = {code: label for label, code in label_to_int.items()}

X = train_df.drop(columns='Rain')
y = train_df['Rain'].map(label_to_int)

# Series.map turns every label that is missing from the mapping into NaN.
# Fail here, naming the offending values, instead of deep inside model.fit.
if y.isna().any():
    unknown_labels = sorted(
        train_df.loc[y.isna(), 'Rain'].astype(str).unique())
    raise ValueError(
        f"Unexpected values in the 'Rain' column: {unknown_labels}")

# 3. Train/validation split on the RAW features.
#    Splitting before scaling keeps validation rows out of the scaler's
#    statistics; the same random_state selects the same rows as before.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42)

# 4. Standardization: fit on the training portion only, then apply the same
#    transform everywhere else.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
# Full training matrix under the train-only scaling; kept in case later
# cells still refer to X_scaled.
X_scaled = scaler.transform(X)
# Select the training feature columns explicitly so the test frame's columns
# match what the scaler and the model were fitted on.
test_scaled = scaler.transform(test_df[X.columns])

# 5. Train the model (random forest).
model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=5)
model.fit(X_train, y_train)

# 6. Accuracy on the held-out validation rows.
val_pred = model.predict(X_val)
val_acc = accuracy_score(y_val, val_pred)
print(f"✅ Validation Accuracy: {val_acc:.5f}")

# 7. Predict the test set and convert the integer classes back to labels.
test_pred = model.predict(test_scaled)
test_pred_label = [int_to_label[int(p)] for p in test_pred]

# The ID column runs from 1 to the number of test rows.
submission = pd.DataFrame({
    'ID': range(1, len(test_pred_label) + 1),
    'Rain': test_pred_label,
})

submission_path = 'new2_submission.csv'
submission.to_csv(submission_path, index=False)
# Report the file that was actually written (the old message named
# "submission.csv", which is not the output path).
print(f"📁 {submission_path} 저장 완료 (ID 포함)")

✅ Validation Accuracy: 0.99500
📁 submission.csv 저장 완료 (ID 포함)
