# In [None]:
import pandas as pd
import numpy as np

# Load the training and test splits.
train = pd.read_csv('/aiffel/data/train.csv')
test = pd.read_csv('/aiffel/data/test.csv')

# Fill missing reviews_per_month values with 0 in both splits.
train = train.assign(reviews_per_month=train['reviews_per_month'].fillna(0))
test = test.assign(reviews_per_month=test['reviews_per_month'].fillna(0))

# Drop the 'name' and 'host_name' columns, plus 'last_review'
# (removed rather than imputed because it contains missing values).
train, test = (
    df.drop(columns=['name', 'host_name', 'last_review'])
    for df in (train, test)
)

# Pull the price column out of the features and keep its log1p transform
# as the target.
target = np.log1p(train.pop('price'))

# Separate numeric and categorical features.
# Non-object columns are treated as numeric, object columns as categorical;
# copies are taken so later in-place column assignments do not touch
# train/test.
n_train, n_test = (
    df.select_dtypes(exclude='object').copy() for df in (train, test)
)
c_train, c_test = (
    df.select_dtypes(include='object').copy() for df in (train, test)
)

from sklearn.preprocessing import MinMaxScaler

# Numeric columns to rescale with min-max scaling.
cols = [
    'host_id',
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]
scaler = MinMaxScaler()
# Learn each column's min/max from the training data only ...
n_train[cols] = scaler.fit_transform(n_train[cols])
# ... and reuse those statistics for the test data. Re-fitting on the test
# set would map the same raw value to different scaled values in train and
# test, so the fitted models would see inconsistent features. Test values
# outside the training range may therefore fall outside [0, 1].
n_test[cols] = scaler.transform(n_test[cols])

# Categorical features
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
cols = [
    'neighbourhood_group',
    'neighbourhood',
    'room_type',
]

for col in cols:
    # Fit once per column on the union of train and test values so that a
    # given category always maps to the same integer code in both splits.
    # Fitting separately on each split assigns codes by each split's own
    # sorted category list, which silently misaligns the encodings whenever
    # the two splits do not contain exactly the same categories.
    le.fit(pd.concat([c_train[col], c_test[col]], ignore_index=True))
    c_train[col] = le.transform(c_train[col])
    c_test[col] = le.transform(c_test[col])

# Recombine the numeric and encoded categorical parts.
train = pd.concat([n_train, c_train], axis=1)
test = pd.concat([n_test, c_test], axis=1)

# Hold out 20% of the training data for validation.
from sklearn.model_selection import train_test_split

# target holds log1p(price) at this point; expm1 maps it back to the
# original price scale, so the models below are fit on untransformed prices.
target = np.expm1(target)
X_train, X_val, y_train, y_val = train_test_split(
    train, target, test_size=0.2, random_state=2022
)
# A bare expression is only echoed when it is the last line of a notebook
# cell; print explicitly so the shapes are always shown.
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

# Machine learning (regression): ordinary least squares baseline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

LRmodel = LinearRegression()
LRmodel.fit(X_train, y_train)
pred = LRmodel.predict(X_val)
# Report the validation error; as a bare expression the value was discarded.
print('LinearRegression validation MSE =', mean_squared_error(y_val, pred))

# Ridge regression (default hyperparameters)
from sklearn.linear_model import Ridge

Rmodel = Ridge()
Rmodel.fit(X_train, y_train)
pred = Rmodel.predict(X_val)
# Report the validation error; as a bare expression the value was discarded.
print('Ridge validation MSE =', mean_squared_error(y_val, pred))

# Lasso regression (default hyperparameters)
from sklearn.linear_model import Lasso

Lmodel = Lasso()
Lmodel.fit(X_train, y_train)
pred = Lmodel.predict(X_val)
# Report the validation error; as a bare expression the value was discarded.
print('Lasso validation MSE =', mean_squared_error(y_val, pred))

# ElasticNet regression (default hyperparameters)
from sklearn.linear_model import ElasticNet

ENmodel = ElasticNet()
ENmodel.fit(X_train, y_train)
pred = ENmodel.predict(X_val)
# Report the validation error; as a bare expression the value was discarded.
print('ElasticNet validation MSE =', mean_squared_error(y_val, pred))

# Random forest (200 trees, fixed seed for reproducibility)
from sklearn.ensemble import RandomForestRegressor

RFRmodel = RandomForestRegressor(random_state=2022, n_estimators=200)
RFRmodel.fit(X_train, y_train)
pred = RFRmodel.predict(X_val)
# Report the validation error; as a bare expression the value was discarded.
print('RandomForestRegressor validation MSE =', mean_squared_error(y_val, pred))

# XGBoost
from xgboost import XGBRegressor

# The keyword must be spelled `learning_rate`; the previous `learing_rate`
# was not a recognised parameter, so the intended 0.02 step size was never
# applied and the library default was used instead.
XGBRmodel = XGBRegressor(max_depth=10,
                         learning_rate=0.02,
                         n_estimators=500,
                         random_state=2022)

XGBRmodel.fit(X_train, y_train)
pred = XGBRmodel.predict(X_val)
# Report the validation error; as a bare expression the value was discarded.
print('XGBRegressor validation MSE =', mean_squared_error(y_val, pred))

# Predict on the test data and evaluate each fitted model.
# NOTE(review): y_test.csv is assumed to hold only the true target values,
# on the same scale the models were trained on - confirm.
y_test = pd.read_csv('/aiffel/data/y_test.csv')

# Compute every model's test predictions first ...
LRpred = LRmodel.predict(test)
Rpred = Rmodel.predict(test)
Lpred = Lmodel.predict(test)
ENpred = ENmodel.predict(test)
RFRpred = RFRmodel.predict(test)
XGBRpred = XGBRmodel.predict(test)

# ... then report the test MSE of each one, in the same order.
for label, preds in (
    ('LinearRegression MSE =', LRpred),
    ('Ridge MSE =', Rpred),
    ('Lasso MSE =', Lpred),
    ('ElasticNet MSE =', ENpred),
    ('RandomForestRegressor MSE =', RFRpred),
    ('XGBRegressor MSE =', XGBRpred),
):
    print(label, mean_squared_error(y_test, preds))