## 데이터 전처리
1. 모든 열 중복값 제거<br>
2. description length, length_ratio, length_mean 생성
3. 모델링에 사용할 feature 선택

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import r2_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [5]:
# Colab에서 작업
from google.colab import drive
drive.mount('/content/drive')

In [6]:
#데이터 불러오기
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

#데이터 통합(onehotencoding 위해 통합)
tgt = pd.concat([train, test], ignore_index=True)

In [7]:
tgt.head()

Unnamed: 0,country,description,designation,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,points
0,Australia,"Possibly a little sweet, this is a soft, easyg...",,5.0,Australia Other,South Eastern Australia,,Joe Czerwinski,@JoeCz,Banrock Station 2006 Chardonnay (South Eastern...,Chardonnay,Banrock Station,83.0
1,France,"A soft, almost off dry wine that is full in th...",Réserve,12.0,Rhône Valley,Côtes du Rhône,,Roger Voss,@vossroger,Cellier des Dauphins 2015 Réserve Rosé (Côtes ...,Rosé,Cellier des Dauphins,85.0
2,Spain,Generic white-fruit aromas of peach and apple ...,Estate Grown & Bottled,9.0,Northern Spain,Rueda,,Michael Schachner,@wineschach,Esperanza 2013 Estate Grown & Bottled Verdejo-...,Verdejo-Viura,Esperanza,86.0
3,US,This is the winery's best Nebula in years. Whi...,Nebula,29.0,California,Paso Robles,Central Coast,,,Midnight 2010 Nebula Cabernet Sauvignon (Paso ...,Cabernet Sauvignon,Midnight,87.0
4,US,This is a very rich Pinot whose primary virtue...,Wiley Vineyard,40.0,California,Anderson Valley,,,,Harrington 2006 Wiley Vineyard Pinot Noir (And...,Pinot Noir,Harrington,88.0


In [8]:
#중복값 제거
tgt = tgt.drop_duplicates()

In [9]:
#description 통해 새로운 변수들 생성
tgt['length'] =tgt['description'].apply(lambda x : len(str(x).split(" ")))
tgt = pd.merge(tgt, tgt.groupby('taster_name')['length'].mean(), how='left', on='taster_name')

In [10]:
tgt.rename(columns={'length_x':'length', 'length_y':'length_mean'}, inplace=True )
tgt['length_ratio'] = tgt['length']/tgt['length_mean']

In [12]:
tgt.head()

Unnamed: 0,country,description,designation,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,points,length,length_mean,length_ratio
0,Australia,"Possibly a little sweet, this is a soft, easyg...",,5.0,Australia Other,South Eastern Australia,,Joe Czerwinski,@JoeCz,Banrock Station 2006 Chardonnay (South Eastern...,Chardonnay,Banrock Station,83.0,18,41.027902,0.438726
1,France,"A soft, almost off dry wine that is full in th...",Réserve,12.0,Rhône Valley,Côtes du Rhône,,Roger Voss,@vossroger,Cellier des Dauphins 2015 Réserve Rosé (Côtes ...,Rosé,Cellier des Dauphins,85.0,31,37.556332,0.825427
2,Spain,Generic white-fruit aromas of peach and apple ...,Estate Grown & Bottled,9.0,Northern Spain,Rueda,,Michael Schachner,@wineschach,Esperanza 2013 Estate Grown & Bottled Verdejo-...,Verdejo-Viura,Esperanza,86.0,33,42.871257,0.769747
3,US,This is the winery's best Nebula in years. Whi...,Nebula,29.0,California,Paso Robles,Central Coast,,,Midnight 2010 Nebula Cabernet Sauvignon (Paso ...,Cabernet Sauvignon,Midnight,87.0,40,,
4,US,This is a very rich Pinot whose primary virtue...,Wiley Vineyard,40.0,California,Anderson Valley,,,,Harrington 2006 Wiley Vineyard Pinot Noir (And...,Pinot Noir,Harrington,88.0,35,,


In [13]:
#모델링에 사용할 feature 선택
features = ['price', 'length', 'country', 'province', 'taster_name', 'variety', 'length_ratio']

categorical_columns = ['country', 'province', 'taster_name', 'variety']
num_columns = ['price', 'length', 'length_ratio']

In [14]:
#feature 중에 categorical variable 결측치 처리 (None으로 대체)

# categorical_columns = ['country', 'province', 'taster_name', 'variety']

tgt[categorical_columns] = tgt[categorical_columns].fillna('None')


In [15]:
tgt[categorical_columns].head()

Unnamed: 0,country,province,taster_name,variety
0,Australia,Australia Other,Joe Czerwinski,Chardonnay
1,France,Rhône Valley,Roger Voss,Rosé
2,Spain,Northern Spain,Michael Schachner,Verdejo-Viura
3,US,California,,Cabernet Sauvignon
4,US,California,,Pinot Noir


In [16]:
#categorical variables - OneHotEncoding 진행

from sklearn.preprocessing import OneHotEncoder

OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse= False)
OH_cols_tgt = pd.DataFrame(OH_encoder.fit_transform(tgt[categorical_columns])) # 모든 categorical variable 대한 onehotencoding
OH_cols_tgt.index = tgt.index # 인덱스 복원
OH_cols_tgt.columns = OH_encoder.get_feature_names_out(categorical_columns) #column 명 복원

# OH_cols_train

In [17]:
OH_cols_tgt

Unnamed: 0,country_Argentina,country_Armenia,country_Australia,country_Austria,country_Bosnia and Herzegovina,country_Brazil,country_Bulgaria,country_Canada,country_Chile,country_China,...,variety_Yapincak,variety_Zelen,variety_Zibibbo,variety_Zierfandler,variety_Zierfandler-Rotgipfler,variety_Zinfandel,variety_Zlahtina,variety_Zweigelt,variety_Çalkarası,variety_Žilavka
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
123166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
123167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
123168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# 필요하다면 price에 로그 처리

# train.price = np.log(train.price)
# test.price = np.log(test.price)

In [21]:
num_columns

['price', 'length', 'length_ratio']

In [22]:
# feature 중에 numerical variable 처리 

from sklearn.impute import SimpleImputer

num_tgt = tgt[num_columns]

my_imputer = SimpleImputer(strategy='mean') #평균으로 결측치 대체
# my_imputer = SimpleImputer(strategy='median')
imputed_num_tgt = pd.DataFrame(my_imputer.fit_transform(num_tgt), columns =num_tgt.columns)

imputed_num_tgt.index = num_tgt.index #기존의 index 복원

In [23]:
# 각각 결측치 처리한 데이터 하나로 통합
alltogether = pd.concat([imputed_num_tgt, OH_cols_tgt, tgt['points']], axis = 1)

In [24]:
alltogether

Unnamed: 0,price,length,length_ratio,country_Argentina,country_Armenia,country_Australia,country_Austria,country_Bosnia and Herzegovina,country_Brazil,country_Bulgaria,...,variety_Zelen,variety_Zibibbo,variety_Zierfandler,variety_Zierfandler-Rotgipfler,variety_Zinfandel,variety_Zlahtina,variety_Zweigelt,variety_Çalkarası,variety_Žilavka,points
0,5.0,18.0,0.438726,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,83.0
1,12.0,31.0,0.825427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,85.0
2,9.0,33.0,0.769747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,86.0
3,29.0,40.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,87.0
4,40.0,35.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,88.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123165,16.0,28.0,0.699911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
123166,30.0,38.0,0.975301,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
123167,45.0,53.0,1.102347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
123168,150.0,75.0,1.559925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [25]:
# points 점수의 유무를 기준으로 trainset, testset 구분

trainset = alltogether[alltogether.points.notnull()]
testset = alltogether[alltogether.points.isnull()]

In [27]:
trainset

Unnamed: 0,price,length,length_ratio,country_Argentina,country_Armenia,country_Australia,country_Austria,country_Bosnia and Herzegovina,country_Brazil,country_Bulgaria,...,variety_Zelen,variety_Zibibbo,variety_Zierfandler,variety_Zierfandler-Rotgipfler,variety_Zinfandel,variety_Zlahtina,variety_Zweigelt,variety_Çalkarası,variety_Žilavka,points
0,5.0,18.0,0.438726,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,83.0
1,12.0,31.0,0.825427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,85.0
2,9.0,33.0,0.769747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,86.0
3,29.0,40.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,87.0
4,40.0,35.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,88.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97566,24.0,32.0,0.779957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,88.0
97567,17.0,35.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,87.0
97568,11.0,61.0,1.408693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,88.0
97569,36.0,44.0,1.129296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,85.0


In [28]:
# train, valid, test 셋 구분

x = trainset.drop('points', axis = 1).copy()
y = trainset['points'].copy()

x_train, x_valid, y_train, y_valid = train_test_split(x, y,train_size = 0.8, test_size = 0.2, random_state = 1)

In [33]:
x_train

Unnamed: 0,price,length,length_ratio,country_Argentina,country_Armenia,country_Australia,country_Austria,country_Bosnia and Herzegovina,country_Brazil,country_Bulgaria,...,variety_Yapincak,variety_Zelen,variety_Zibibbo,variety_Zierfandler,variety_Zierfandler-Rotgipfler,variety_Zinfandel,variety_Zlahtina,variety_Zweigelt,variety_Çalkarası,variety_Žilavka
15844,11.000000,25.0,0.665667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12521,65.000000,46.0,1.180628,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95921,11.000000,34.0,0.793072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55093,31.000000,42.0,0.979677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95115,22.000000,37.0,0.985187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21440,35.542996,35.0,0.898304,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73349,50.000000,62.0,1.431787,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50057,70.000000,58.0,1.206342,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5192,40.000000,38.0,1.011813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 모델 적합

In [34]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

for n in [5, 10, 15]: #max_depth -> 10이 가장 결과 좋음

    #모델 적합
    model = DecisionTreeRegressor(random_state=10, max_depth=n)
    model.fit(x_train, y_train)

    #모델 예측
    valid_preds = model.predict(x_valid)

    #모델 평가 
    print('features:{}'.format(features))
    print('Max Depth : {}'.format(n))
    print('MSE : {}'.format(mean_squared_error(y_valid, valid_preds)))
    print('RMSE: {}'.format(mean_squared_error(y_valid, valid_preds, squared=False)))
    print('R^2 : {}'.format(r2_score(y_valid, valid_preds)))


features:['price', 'length', 'country', 'province', 'taster_name', 'variety', 'length_ratio']
Max Depth : 5
MSE : 4.922829723963125
RMSE: 2.2187450786341194
R^2 : 0.48733372750003934
features:['price', 'length', 'country', 'province', 'taster_name', 'variety', 'length_ratio']
Max Depth : 10
MSE : 4.492795077928151
RMSE: 2.1196214468456747
R^2 : 0.5321177788263374
features:['price', 'length', 'country', 'province', 'taster_name', 'variety', 'length_ratio']
Max Depth : 15
MSE : 4.717407259967713
RMSE: 2.171959313607811
R^2 : 0.5087265391164247


## 모델 성능 개선 - Grid Search

In [35]:
#Grid_Search

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [36]:
params = {
    'max_depth': [10, 20, 50],
    'min_samples_split': [1, 5, 10, 50],
    'min_samples_leaf': [0.1, 0.03, 0.003],
    'max_leaf_nodes': [100, 200, 300],
}


grid = GridSearchCV(DecisionTreeRegressor(random_state=10), 
                    params, scoring = 'r2', cv = 5,verbose = 3)
grid.fit(x_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV 1/5] END max_depth=10, max_leaf_nodes=100, min_samples_leaf=0.1, min_samples_split=1;, score=nan total time=   0.4s
[CV 2/5] END max_depth=10, max_leaf_nodes=100, min_samples_leaf=0.1, min_samples_split=1;, score=nan total time=   0.3s
[CV 3/5] END max_depth=10, max_leaf_nodes=100, min_samples_leaf=0.1, min_samples_split=1;, score=nan total time=   0.3s
[CV 4/5] END max_depth=10, max_leaf_nodes=100, min_samples_leaf=0.1, min_samples_split=1;, score=nan total time=   0.3s
[CV 5/5] END max_depth=10, max_leaf_nodes=100, min_samples_leaf=0.1, min_samples_split=1;, score=nan total time=   0.3s
[CV 1/5] END max_depth=10, max_leaf_nodes=100, min_samples_leaf=0.1, min_samples_split=5;, score=0.384 total time=   1.0s
[CV 2/5] END max_depth=10, max_leaf_nodes=100, min_samples_leaf=0.1, min_samples_split=5;, score=0.396 total time=   1.1s
[CV 3/5] END max_depth=10, max_leaf_nodes=100, min_samples_leaf=0.1, min_samples_split=5;, sc

[CV 3/5] END max_depth=10, max_leaf_nodes=200, min_samples_leaf=0.1, min_samples_split=5;, score=0.401 total time=   1.0s
[CV 4/5] END max_depth=10, max_leaf_nodes=200, min_samples_leaf=0.1, min_samples_split=5;, score=0.397 total time=   0.9s
[CV 5/5] END max_depth=10, max_leaf_nodes=200, min_samples_leaf=0.1, min_samples_split=5;, score=0.394 total time=   1.0s
[CV 1/5] END max_depth=10, max_leaf_nodes=200, min_samples_leaf=0.1, min_samples_split=10;, score=0.384 total time=   0.9s
[CV 2/5] END max_depth=10, max_leaf_nodes=200, min_samples_leaf=0.1, min_samples_split=10;, score=0.396 total time=   1.0s
[CV 3/5] END max_depth=10, max_leaf_nodes=200, min_samples_leaf=0.1, min_samples_split=10;, score=0.401 total time=   1.0s
[CV 4/5] END max_depth=10, max_leaf_nodes=200, min_samples_leaf=0.1, min_samples_split=10;, score=0.397 total time=   0.9s
[CV 5/5] END max_depth=10, max_leaf_nodes=200, min_samples_leaf=0.1, min_samples_split=10;, score=0.394 total time=   1.0s
[CV 1/5] END max_de

[CV 5/5] END max_depth=10, max_leaf_nodes=300, min_samples_leaf=0.1, min_samples_split=10;, score=0.394 total time=   1.0s
[CV 1/5] END max_depth=10, max_leaf_nodes=300, min_samples_leaf=0.1, min_samples_split=50;, score=0.384 total time=   0.9s
[CV 2/5] END max_depth=10, max_leaf_nodes=300, min_samples_leaf=0.1, min_samples_split=50;, score=0.396 total time=   1.0s
[CV 3/5] END max_depth=10, max_leaf_nodes=300, min_samples_leaf=0.1, min_samples_split=50;, score=0.401 total time=   1.0s
[CV 4/5] END max_depth=10, max_leaf_nodes=300, min_samples_leaf=0.1, min_samples_split=50;, score=0.397 total time=   0.9s
[CV 5/5] END max_depth=10, max_leaf_nodes=300, min_samples_leaf=0.1, min_samples_split=50;, score=0.394 total time=   1.0s
[CV 1/5] END max_depth=10, max_leaf_nodes=300, min_samples_leaf=0.03, min_samples_split=1;, score=nan total time=   0.2s
[CV 2/5] END max_depth=10, max_leaf_nodes=300, min_samples_leaf=0.03, min_samples_split=1;, score=nan total time=   0.3s
[CV 3/5] END max_dep

[CV 2/5] END max_depth=20, max_leaf_nodes=100, min_samples_leaf=0.03, min_samples_split=1;, score=nan total time=   0.3s
[CV 3/5] END max_depth=20, max_leaf_nodes=100, min_samples_leaf=0.03, min_samples_split=1;, score=nan total time=   0.4s
[CV 4/5] END max_depth=20, max_leaf_nodes=100, min_samples_leaf=0.03, min_samples_split=1;, score=nan total time=   0.3s
[CV 5/5] END max_depth=20, max_leaf_nodes=100, min_samples_leaf=0.03, min_samples_split=1;, score=nan total time=   0.3s
[CV 1/5] END max_depth=20, max_leaf_nodes=100, min_samples_leaf=0.03, min_samples_split=5;, score=0.451 total time=   2.0s
[CV 2/5] END max_depth=20, max_leaf_nodes=100, min_samples_leaf=0.03, min_samples_split=5;, score=0.451 total time=   1.9s
[CV 3/5] END max_depth=20, max_leaf_nodes=100, min_samples_leaf=0.03, min_samples_split=5;, score=0.462 total time=   1.9s
[CV 4/5] END max_depth=20, max_leaf_nodes=100, min_samples_leaf=0.03, min_samples_split=5;, score=0.454 total time=   2.1s
[CV 5/5] END max_depth=2

[CV 4/5] END max_depth=20, max_leaf_nodes=200, min_samples_leaf=0.03, min_samples_split=5;, score=0.454 total time=   2.0s
[CV 5/5] END max_depth=20, max_leaf_nodes=200, min_samples_leaf=0.03, min_samples_split=5;, score=0.452 total time=   1.9s
[CV 1/5] END max_depth=20, max_leaf_nodes=200, min_samples_leaf=0.03, min_samples_split=10;, score=0.451 total time=   2.0s
[CV 2/5] END max_depth=20, max_leaf_nodes=200, min_samples_leaf=0.03, min_samples_split=10;, score=0.451 total time=   1.9s
[CV 3/5] END max_depth=20, max_leaf_nodes=200, min_samples_leaf=0.03, min_samples_split=10;, score=0.462 total time=   1.9s
[CV 4/5] END max_depth=20, max_leaf_nodes=200, min_samples_leaf=0.03, min_samples_split=10;, score=0.454 total time=   2.0s
[CV 5/5] END max_depth=20, max_leaf_nodes=200, min_samples_leaf=0.03, min_samples_split=10;, score=0.452 total time=   1.9s
[CV 1/5] END max_depth=20, max_leaf_nodes=200, min_samples_leaf=0.03, min_samples_split=50;, score=0.451 total time=   1.9s
[CV 2/5] E

[CV 1/5] END max_depth=20, max_leaf_nodes=300, min_samples_leaf=0.03, min_samples_split=50;, score=0.451 total time=   1.7s
[CV 2/5] END max_depth=20, max_leaf_nodes=300, min_samples_leaf=0.03, min_samples_split=50;, score=0.451 total time=   1.7s
[CV 3/5] END max_depth=20, max_leaf_nodes=300, min_samples_leaf=0.03, min_samples_split=50;, score=0.462 total time=   1.7s
[CV 4/5] END max_depth=20, max_leaf_nodes=300, min_samples_leaf=0.03, min_samples_split=50;, score=0.454 total time=   1.7s
[CV 5/5] END max_depth=20, max_leaf_nodes=300, min_samples_leaf=0.03, min_samples_split=50;, score=0.452 total time=   1.8s
[CV 1/5] END max_depth=20, max_leaf_nodes=300, min_samples_leaf=0.003, min_samples_split=1;, score=nan total time=   0.3s
[CV 2/5] END max_depth=20, max_leaf_nodes=300, min_samples_leaf=0.003, min_samples_split=1;, score=nan total time=   0.2s
[CV 3/5] END max_depth=20, max_leaf_nodes=300, min_samples_leaf=0.003, min_samples_split=1;, score=nan total time=   0.2s
[CV 4/5] END m

[CV 3/5] END max_depth=50, max_leaf_nodes=100, min_samples_leaf=0.003, min_samples_split=1;, score=nan total time=   0.3s
[CV 4/5] END max_depth=50, max_leaf_nodes=100, min_samples_leaf=0.003, min_samples_split=1;, score=nan total time=   0.3s
[CV 5/5] END max_depth=50, max_leaf_nodes=100, min_samples_leaf=0.003, min_samples_split=1;, score=nan total time=   0.3s
[CV 1/5] END max_depth=50, max_leaf_nodes=100, min_samples_leaf=0.003, min_samples_split=5;, score=0.505 total time=   3.8s
[CV 2/5] END max_depth=50, max_leaf_nodes=100, min_samples_leaf=0.003, min_samples_split=5;, score=0.508 total time=   3.7s
[CV 3/5] END max_depth=50, max_leaf_nodes=100, min_samples_leaf=0.003, min_samples_split=5;, score=0.515 total time=   3.7s
[CV 4/5] END max_depth=50, max_leaf_nodes=100, min_samples_leaf=0.003, min_samples_split=5;, score=0.505 total time=   3.7s
[CV 5/5] END max_depth=50, max_leaf_nodes=100, min_samples_leaf=0.003, min_samples_split=5;, score=0.510 total time=   3.7s
[CV 1/5] END m

[CV 5/5] END max_depth=50, max_leaf_nodes=200, min_samples_leaf=0.003, min_samples_split=5;, score=0.520 total time=   4.3s
[CV 1/5] END max_depth=50, max_leaf_nodes=200, min_samples_leaf=0.003, min_samples_split=10;, score=0.511 total time=   4.2s
[CV 2/5] END max_depth=50, max_leaf_nodes=200, min_samples_leaf=0.003, min_samples_split=10;, score=0.516 total time=   4.3s
[CV 3/5] END max_depth=50, max_leaf_nodes=200, min_samples_leaf=0.003, min_samples_split=10;, score=0.521 total time=   4.4s
[CV 4/5] END max_depth=50, max_leaf_nodes=200, min_samples_leaf=0.003, min_samples_split=10;, score=0.514 total time=   4.4s
[CV 5/5] END max_depth=50, max_leaf_nodes=200, min_samples_leaf=0.003, min_samples_split=10;, score=0.520 total time=   4.5s
[CV 1/5] END max_depth=50, max_leaf_nodes=200, min_samples_leaf=0.003, min_samples_split=50;, score=0.511 total time=   4.4s
[CV 2/5] END max_depth=50, max_leaf_nodes=200, min_samples_leaf=0.003, min_samples_split=50;, score=0.516 total time=   4.3s
[

[CV 2/5] END max_depth=50, max_leaf_nodes=300, min_samples_leaf=0.003, min_samples_split=50;, score=0.517 total time=   4.4s
[CV 3/5] END max_depth=50, max_leaf_nodes=300, min_samples_leaf=0.003, min_samples_split=50;, score=0.522 total time=   4.3s
[CV 4/5] END max_depth=50, max_leaf_nodes=300, min_samples_leaf=0.003, min_samples_split=50;, score=0.514 total time=   4.3s
[CV 5/5] END max_depth=50, max_leaf_nodes=300, min_samples_leaf=0.003, min_samples_split=50;, score=0.522 total time=   4.4s


GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=10),
             param_grid={'max_depth': [10, 20, 50],
                         'max_leaf_nodes': [100, 200, 300],
                         'min_samples_leaf': [0.1, 0.03, 0.003],
                         'min_samples_split': [1, 5, 10, 50]},
             scoring='r2', verbose=3)

In [38]:
grid.best_params_

{'max_depth': 20,
 'max_leaf_nodes': 300,
 'min_samples_leaf': 0.003,
 'min_samples_split': 5}

In [39]:
grid.best_score_

0.517257681472379

In [40]:
# 최적 모델 추천

y_pred = grid.predict(x_valid)

print("MSE: ", mean_squared_error(y_valid,y_pred))
print("RMSE: ", (mean_squared_error(y_valid,y_pred))**0.5)
print("MAE: ", mean_absolute_error(y_valid,y_pred))
print('R^2(Score) : {}'.format(r2_score(y_valid, valid_preds)))

MSE:  4.523538915640542
RMSE:  2.126861282651161
MAE:  1.682450094593127
R^2(Score) : 0.5087265391164247


## 모델 성능개선 - Random Search

In [41]:
#Random Search 

from sklearn.model_selection import RandomizedSearchCV
params = {'max_depth': [10, 15, 20, 40],
          'min_samples_split': [1, 5, 10],
          'min_samples_leaf': [50, 25, 10, 1, 0.1, 0.003, 0.003],
          'max_leaf_nodes': [300, 400, 500],}


random_grid = RandomizedSearchCV(DecisionTreeRegressor(random_state=10),
                                 params, n_jobs=-1, 
                                 scoring='r2', 
                                 n_iter=100)
random_grid.fit(x_train, y_train)


RandomizedSearchCV(estimator=DecisionTreeRegressor(random_state=10), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'max_depth': [10, 15, 20, 40],
                                        'max_leaf_nodes': [300, 400, 500],
                                        'min_samples_leaf': [50, 25, 10, 1, 0.1,
                                                             0.003, 0.003],
                                        'min_samples_split': [1, 5, 10]},
                   scoring='r2')

In [42]:
random_grid.best_params_

{'min_samples_split': 10,
 'min_samples_leaf': 10,
 'max_leaf_nodes': 500,
 'max_depth': 40}

In [43]:
random_grid.best_score_

0.5343913679896175

In [44]:
#최적 모델 추천

y_pred = random_grid.predict(x_valid)

print("MSE: ", mean_squared_error(y_valid,y_pred))
print("RMSE: ", (mean_squared_error(y_valid,y_pred))**0.5)
print("MAE: ", mean_absolute_error(y_valid,y_pred))

MSE:  4.393722372024975
RMSE:  2.0961207913727145
MAE:  1.654189742179463


- Grid Search 결과 추천 parameter 값 : {'max_depth': 20, 'max_leaf_nodes': 300, 'min_samples_leaf': 0.003, 'min_samples_split': 5}<br>
MSE:  4.523538915640542<br>
RMSE:  2.126861282651161<br>
MAE:  1.682450094593127<br>
R^2(Score) : 0.5087265391164247<br><br>
- Random Search 결과 추천 parameter 값 : 'min_samples_split': 10, 'min_samples_leaf': 10,  'max_leaf_nodes': 500, 'max_depth': 40}<br>
MSE:  4.393722372024975<br>
RMSE:  2.0961207913727145<br>
MAE:  1.654189742179463<br>
R^2(Score) : 0.5343913679896175<br><br>

-> 최종적으로 Random Search의 결과를 따르기로 결정

In [46]:
#최적 모델 적합
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


#모델 적합
model = DecisionTreeRegressor(random_state=10, min_samples_split=10, min_samples_leaf=10, max_leaf_nodes=500, max_depth=40)
model.fit(x_train, y_train)

#모델 예측
valid_preds = model.predict(x_valid)

#모델 평가 
print('features:{}'.format(features))
print('MSE : {}'.format(mean_squared_error(y_valid, valid_preds)))
print('RMSE: {}'.format(mean_squared_error(y_valid, valid_preds, squared=False)))
print('R^2 : {}'.format(r2_score(y_valid, valid_preds)))


features:['price', 'length', 'country', 'province', 'taster_name', 'variety', 'length_ratio']
MSE : 4.393722372024975
RMSE: 2.0961207913727145
R^2 : 0.5424352664685824


## Test set에 대한 points 예측
- 위에서 전처리 완료
- 여기서는 points column만 삭제

In [47]:
testset.head()

Unnamed: 0,price,length,length_ratio,country_Argentina,country_Armenia,country_Australia,country_Austria,country_Bosnia and Herzegovina,country_Brazil,country_Bulgaria,...,variety_Zelen,variety_Zibibbo,variety_Zierfandler,variety_Zierfandler-Rotgipfler,variety_Zinfandel,variety_Zlahtina,variety_Zweigelt,variety_Çalkarası,variety_Žilavka,points
97571,15.0,48.0,1.169936,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
97572,50.0,51.0,1.189608,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
97573,38.0,42.0,0.873558,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
97574,25.0,37.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
97575,20.0,50.0,1.360155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [48]:
testset = testset.drop('points', axis = 1)

### 모델을 통한 예측

In [49]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


model = DecisionTreeRegressor(random_state=10, min_samples_split=10, min_samples_leaf=10, max_leaf_nodes=500, max_depth=40)
model.fit(x_train, y_train)


#모델 예측
test_pred = model.predict(testset)

In [50]:
# 결과를 파일로 저장
wine_prediction = pd.DataFrame({'id': testset.index, 'points': test_pred})
wine_prediction

wine_prediction.to_csv('wine_point_prediction.csv', index=False)