## 1. 데이터 불러오기

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [22]:
train.shape

(40118, 13)

In [23]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4986 entries, 0 to 4985
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      4986 non-null   object 
 1   x_0     4986 non-null   float64
 2   x_1     4986 non-null   float64
 3   x_2     4986 non-null   float64
 4   x_3     4986 non-null   float64
 5   x_4     4986 non-null   float64
 6   x_5     4986 non-null   float64
 7   x_6     4986 non-null   float64
 8   x_7     4986 non-null   float64
 9   x_8     4986 non-null   float64
 10  x_9     4986 non-null   float64
 11  x_10    4986 non-null   float64
dtypes: float64(11), object(1)
memory usage: 467.6+ KB


## 2. 데이터 전처리

### 이상치 처리

In [24]:
#이상치 처리: IQR 방법 사용
def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df

outlier_columns = ['x_1', 'x_2', 'x_4', 'x_5', 'x_6', 'x_7', 'x_9', 'x_10']
for col in outlier_columns:
    train = remove_outliers_iqr(train, col)
    test = remove_outliers_iqr(test,col) 

### 피쳐 변환

In [25]:
#  피처 변환: 로그 변환 (양수만 변환 가능)
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3, interaction_only=False, include_bias=False)

train_x = train.drop(columns=['ID', 'y'])
test_x = test.drop(columns=['ID'])
train_y = train['y']

# 다항 피처 생성 및 변환
train_x_poly = poly.fit_transform(train_x)
test_x_poly = poly.transform(test_x)

### 데이터 스케일링

In [26]:
# 3. 데이터 스케일링
scaler = StandardScaler()
train_x_poly_scaled = scaler.fit_transform(train_x_poly)
test_x_poly_scaled = scaler.transform(test_x_poly)

## 3. 분석 모델 설계

In [27]:
# 개별 모델 정의
lr = LinearRegression()
rf = RandomForestRegressor(n_estimators=100, random_state=42)
gbr = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Voting Regressor로 앙상블 모델 생성
voting_regressor = VotingRegressor(estimators=[('lr', lr), ('rf', rf), ('gbr', gbr)])

## 4. 모델 학습

분석 모델을 학습시킵니다.

In [28]:
voting_regressor.fit(train_x_poly_scaled, train_y)

## 5. 예측값 생성
학습한 모델을 사용하여 예측값을 생성합니다.

In [None]:
y_pred = voting_regressor.predict(test_x_poly_scaled)

In [None]:
len(y_pred)

3694

## 5. 제출 파일 생성
submission 파일을 만들어서 제출합니다.

In [None]:
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4986 entries, 0 to 4985
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      4986 non-null   object
 1   y       4986 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 78.0+ KB


In [None]:
submission = pd.read_csv('./sample_submission.csv')
submission['y'] = y_pred
submission.to_csv('./baseline_submit.csv', index=False)

ValueError: Length of values (3694) does not match length of index (4986)

In [None]:
import winsound

# 주파수(Hz)와 지속 시간(ms)을 설정하여 소리 내기
frequency = 1000  # 주파수 (헤르츠)
duration = 1000  # 지속 시간 (밀리초)
winsound.Beep(frequency, duration)