In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score


# Data cleaning
def load_and_clean_data(train_path, test_path, sep='\t'):
    """Load the train and test tables and drop unusable training rows.

    Training rows are kept only when ``price`` is strictly positive and
    ``category_name`` is present. A missing price compares as not greater
    than zero, so such rows are dropped as well. The test table is returned
    exactly as read.

    Args:
        train_path: Path (or file-like object) of the training table.
        test_path: Path (or file-like object) of the test table.
        sep: Field delimiter passed to ``pandas.read_csv``. Defaults to a
            tab, matching the ``.tsv`` inputs this pipeline uses.

    Returns:
        A ``(train_df, test_df)`` tuple. ``train_df`` keeps the original row
        labels of the rows that survived filtering.

    Raises:
        FileNotFoundError: If either path does not exist.
        KeyError: If the training table lacks ``price`` or ``category_name``.
    """
    train_df = pd.read_csv(train_path, sep=sep)
    test_df = pd.read_csv(test_path, sep=sep)

    # Build one mask and take an explicit copy instead of calling
    # dropna(inplace=True) on a filtered frame; the in-place call on a
    # derived frame can trigger pandas' SettingWithCopyWarning.
    usable_rows = (train_df['price'] > 0) & train_df['category_name'].notna()
    train_df = train_df.loc[usable_rows].copy()
    return train_df, test_df

# Feature engineering
def feature_engineer(df):
    """Return a copy of ``df`` with the derived listing features added.

    The input frame is not modified. On the copy:

    * ``category_name`` has missing values replaced by
      ``'Unknown/Unknown/Unknown'``.
    * ``main_category`` holds the text before the first ``'/'`` of
      ``category_name``.
    * ``item_description`` has missing values, and values equal to
      ``'No description yet'``, replaced by the empty string.
    * ``desc_len`` is the character length of the cleaned description.
    * ``keyword_count`` is the number of keyword occurrences in the
      lower-cased description. Matching is substring-based, so ``'new'``
      inside ``'renewed'`` counts.
    * ``is_brand`` is 1 where ``brand_name`` is present, else 0.

    Args:
        df: Frame with ``category_name``, ``item_description`` and
            ``brand_name`` columns.

    Returns:
        A new DataFrame containing the original columns plus the features
        listed above.
    """
    placeholder_category = 'Unknown/Unknown/Unknown'
    quality_terms = ('authentic', 'new', 'nwt', 'mint', 'sealed', 'perfect', 'excellent')

    out = df.copy()

    # Category: fill the gaps first so every row yields a top-level segment.
    categories = out['category_name'].fillna(placeholder_category)
    out['category_name'] = categories
    out['main_category'] = categories.str.split('/', n=2, expand=True)[0]

    # Description: treat missing text and the placeholder text as empty.
    descriptions = out['item_description'].fillna('')
    descriptions = descriptions.replace('No description yet', '')
    out['item_description'] = descriptions
    out['desc_len'] = descriptions.str.len()

    # Keyword hits are counted with a single alternation pattern.
    lowered = descriptions.str.lower()
    out['keyword_count'] = lowered.str.count('|'.join(quality_terms))

    out['is_brand'] = out['brand_name'].notna().astype('int')
    return out


# Final preprocessing that prepares the data for machine learning
def prepare_ml_data(train_df, test_df):
    """Build the model matrices from feature-engineered train and test frames.

    The two frames are stacked so that scaling and one-hot encoding see the
    same value ranges and category levels, then split apart again. Rows are
    assigned to the training part when ``price`` is present and to the
    submission part when it is missing, so ``train_df`` is expected to have
    no missing prices.

    Args:
        train_df: Training frame with ``price``, ``desc_len``,
            ``keyword_count``, ``item_condition_id``, ``main_category``,
            ``shipping`` and ``is_brand`` columns.
        test_df: Test frame with the same feature columns. It is not
            modified; any existing ``price`` column is ignored.

    Returns:
        ``(X_train, y_train, X_submission)`` where ``y_train`` is
        ``log1p(price)`` and both feature matrices share the same columns:
        the three min-max scaled numeric features, ``shipping``,
        ``is_brand`` and one ``category_*`` dummy per main category.
    """
    # assign() works on a copy, so the caller's test frame does not gain a
    # 'price' column as a hidden side effect.
    test_with_target = test_df.assign(price=np.nan)
    combined_df = pd.concat([train_df, test_with_target], axis=0, ignore_index=True)

    # NOTE(review): the scaler is fitted on train and test rows together, so
    # test-set ranges influence the training features — confirm this is
    # intended.
    features_to_normalize = ['desc_len', 'keyword_count', 'item_condition_id']
    scaler = MinMaxScaler()
    for feature in features_to_normalize:
        # fit_transform returns an (n, 1) array; flatten it so a plain 1-D
        # column is assigned.
        scaled = scaler.fit_transform(combined_df[[feature]])
        combined_df[f'{feature}_norm'] = scaled.ravel()

    main_category_dummies = pd.get_dummies(combined_df['main_category'], prefix='category')
    combined_df = pd.concat([combined_df, main_category_dummies], axis=1)

    # Test rows are exactly the ones carrying the NaN price placeholder.
    has_price = combined_df['price'].notna()
    final_train_df = combined_df[has_price]
    final_test_df = combined_df[~has_price]

    final_features_list = [
        'item_condition_id_norm', 'desc_len_norm', 'keyword_count_norm', 'shipping', 'is_brand'
    ] + list(main_category_dummies.columns)

    X_train = final_train_df[final_features_list]
    # Train on log1p(price); callers invert predictions with expm1.
    y_train = np.log1p(final_train_df['price'])
    X_submission = final_test_df[final_features_list]
    return X_train, y_train, X_submission

# Main pipeline entry function
def main():
    """Run the whole pipeline: load, featurise, cross-validate, fit, predict.

    Reads ``train.tsv`` and ``test.tsv``, prints a 5-fold cross-validated R²
    report for a random forest trained on ``log1p(price)``, refits the same
    configuration on all training rows, and writes the back-transformed
    test predictions to ``submission.csv``.
    """
    # Load, clean, engineer features and build the model matrices.
    train_raw, test_raw = load_and_clean_data('train.tsv', 'test.tsv')
    test_ids = test_raw['test_id']
    X_train_full, y_train_full, X_submission = prepare_ml_data(
        feature_engineer(train_raw),
        feature_engineer(test_raw),
    )

    # One hyper-parameter set shared by the CV estimator and the final model.
    forest_params = dict(n_estimators=100, random_state=42, n_jobs=-1, max_depth=10)

    # Estimate pipeline quality with cross-validated R².
    cv_scores = cross_val_score(
        RandomForestRegressor(**forest_params),
        X_train_full,
        y_train_full,
        cv=5,
        scoring='r2',
        n_jobs=-1,
    )

    print("Pipeline Performance Report (Cross-Validation)")
    print(f"각 Fold의 R² 점수: {np.round(cv_scores, 4)}")
    print(f"평균 R² 점수: {cv_scores.mean():.4f} (파이프라인 예상 성능)")
    print(f"R² 점수 표준편차: {cv_scores.std():.4f}")

    # Fit on every training row, then undo the log1p target transform.
    final_model = RandomForestRegressor(**forest_params)
    final_model.fit(X_train_full, y_train_full)
    predictions_final = np.expm1(final_model.predict(X_submission))

    # Write the submission file, flooring any negative price at zero.
    submission = pd.DataFrame({'test_id': test_ids, 'price': predictions_final})
    is_negative = submission['price'] < 0
    submission.loc[is_negative, 'price'] = 0
    submission.to_csv('submission.csv', index=False)
    print(submission.head())

# Script entry point
if __name__ == '__main__':
    # Run the pipeline and report the two expected failure modes as short
    # messages instead of tracebacks.
    try:
        main()
    except FileNotFoundError:
        # An input file could not be found.
        print("파일 오류")
    except KeyError as err:
        # An expected column is absent from one of the frames.
        print(f"\n컬럼 오류: {err}")

Pipeline Performance Report (Cross-Validation)
각 Fold의 R² 점수: [0.178  0.1751 0.1738 0.1776 0.1758]
평균 R² 점수: 0.1760 (파이프라인 예상 성능)
R² 점수 표준편차: 0.0016
   test_id      price
0        0  13.151080
1        1  12.767034
2        2  16.307899
3        3  19.036835
4        4  12.687727
