## 4.웹사이트 방문자 예측

In [62]:
!git clone https://github.com/Soyoung-Yoon/bigdata

Cloning into 'bigdata'...
remote: Enumerating objects: 159, done.[K
remote: Counting objects: 100% (109/109), done.[K
remote: Compressing objects: 100% (83/83), done.[K
remote: Total 159 (delta 33), reused 85 (delta 26), pack-reused 50 (from 1)[K
Receiving objects: 100% (159/159), 16.52 MiB | 7.68 MiB/s, done.
Resolving deltas: 100% (48/48), done.


### 4-1. 데이터 이해, 생성
- 1개 데이터를 나누어 시험용으로 변경해 보는 작업
- 실제 시험에서는 이런 과정은 필요 없음


In [63]:
# https://www.kaggle.com/bobnau/daily-website-visitors
# Daily number of pages loaded
# Daily number of visitors from whose IP addresses there haven't been hits on any page in over 6 hours
# Number of unique visitors who do not have a cookie identifying them as a previous customer
# Number of unique visitors minus first time visitors

In [64]:
# [0] 사용 라이브러리 import
import pandas as pd

# 데이타가 많은경우 모두 출력 안되고 ... 으로 생략해서 출력됨.
# 시험환경에서는 아래와 같이 해야해서 수정했습니다 ^^*
pd.options.display.max_rows = 500    #출력할 max row를 지정
pd.options.display.max_columns = 20  #출력할 max columns를 지정
#출력 format 지정 - 소수점아래 4자리까지
pd.set_option('display.float_format','{:.4f}'.format)

In [65]:
# [1] 'bigdata/daily-website-visitors.csv' 파일 불러오기, shape 확인하기
df = pd.read_csv('bigdata/daily-website-visitors.csv')
print(df.shape)
display(df.head(10))

(2167, 8)


Unnamed: 0,Row,Day,Day.Of.Week,Date,Page.Loads,Unique.Visits,First.Time.Visits,Returning.Visits
0,1,Sunday,1,9/14/2014,2146,1582,1430,152
1,2,Monday,2,9/15/2014,3621,2528,2297,231
2,3,Tuesday,3,9/16/2014,3698,2630,2352,278
3,4,Wednesday,4,9/17/2014,3667,2614,2327,287
4,5,Thursday,5,9/18/2014,3316,2366,2130,236
5,6,Friday,6,9/19/2014,2815,1863,1622,241
6,7,Saturday,7,9/20/2014,1658,1118,985,133
7,8,Sunday,1,9/21/2014,2288,1656,1481,175
8,9,Monday,2,9/22/2014,3638,2586,2312,274
9,10,Tuesday,3,9/23/2014,4462,3257,2989,268


In [66]:
# [2] head를 사용해 데이터의 모습 확인
# 컬럼명이 대소문자 섞여 있어 사용하기 좋지 않음
print(df.head(2))

   Row     Day  Day.Of.Week       Date Page.Loads Unique.Visits  \
0    1  Sunday            1  9/14/2014      2,146         1,582   
1    2  Monday            2  9/15/2014      3,621         2,528   

  First.Time.Visits Returning.Visits  
0             1,430              152  
1             2,297              231  


- [240508] 버전에 따른 오류 확인 : https://colab.research.google.com/drive/1oPQmNefoujgfQDdzqPdCM1LgZqc1mfFr#scrollTo=FqihypH0f4Mb

In [67]:
# [3] 컬럼명 정리 - 소문자로 변경, '.'을 '_'로 변경
# [240508] 버전이 달라지면서 regex=False를 사용해야 합니다.
df.columns = df.columns.str.replace('.','_',regex=False).str.lower()
df.columns

Index(['row', 'day', 'day_of_week', 'date', 'page_loads', 'unique_visits',
       'first_time_visits', 'returning_visits'],
      dtype='object')

컬럼 정보
- row : 1번 부터 시작하는 일련번호
- day : 요일 정보
- day_of_week : 요일 정보
- date : 날짜 정보
- page_loads : 로드된 일별 페이지 수
- unique_visits : 6시간 이상 페이지에서 조회되지 않은 IP 주소의 일일 방문자 수 (종속변수)
- first_time_visits : 이전 고객으로 식별되는 쿠키를 가지고 있지 않은 고유 방문자 수
- returning_visits : unique_visits 수에서 first_time_visits 제외

In [68]:
df.dtypes

Unnamed: 0,0
row,int64
day,object
day_of_week,int64
date,object
page_loads,object
unique_visits,object
first_time_visits,object
returning_visits,object


In [69]:
# [4] unique_visits를 예측하는 값으로 사용할 것이며, 수치데이터 이어야 함
# 콤마 제거 후, int로 형변환
df['unique_visits'] = df['unique_visits'].str.replace(',','',regex=False).astype(int)


In [70]:
display(df['unique_visits'].head(2))

Unnamed: 0,unique_visits
0,1582
1,2528


In [71]:
# [5] 시험 형식으로 train, test 데이터로 나누어 저장합니다.  (6:4 비율)
train_size = len(df) - int(len(df)*0.4)

# 여러 가지 데이터가 고루 섞이도록 해야 함
df = df.sample(frac=1,random_state=1234)
df['row'] = range(1,len(df)+1)
train = df.iloc[:train_size,:]
test = df.iloc[train_size:,:]
print(train.shape,test.shape)
y = 'unique_visits'
X_train = train.drop(columns=y)
Y_train = train[['row',y]]
X_test = test.drop(columns=y)
Y_test = test[['row',y]]
X_train.to_csv('x_train.csv', index=False)
Y_train.to_csv('y_train.csv', index=False)
X_test.to_csv('x_test.csv', index=False)
Y_test.to_csv('y_test.csv', index=False)

(1301, 8) (866, 8)


### 4-2. 데이터 불러오기, 전처리

In [72]:
# [1] 데이터 파일 불러오기
X_use = pd.read_csv('x_train.csv')
X_submission = pd.read_csv('x_test.csv')
Y = pd.read_csv('y_train.csv')
Y_hidden = pd.read_csv('y_test.csv')

In [73]:
display(X_use.head(2),X_submission.head(2))

Unnamed: 0,row,day,day_of_week,date,page_loads,first_time_visits,returning_visits
0,1,Friday,6,6/26/2020,3555,2248,427
1,2,Thursday,5,8/29/2019,4221,2477,505


Unnamed: 0,row,day,day_of_week,date,page_loads,first_time_visits,returning_visits
0,1302,Thursday,5,1/5/2017,3841,2058,567
1,1303,Tuesday,3,4/18/2017,5009,2906,796


In [74]:
# [2] dfX 만들기, dfX의 info() 확인
# use, submission 데이터를 묶어 전처리 하기 위함
dfX = pd.concat([X_use,X_submission],axis=0,ignore_index=True)
dfX.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2167 entries, 0 to 2166
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   row                2167 non-null   int64 
 1   day                2167 non-null   object
 2   day_of_week        2167 non-null   int64 
 3   date               2167 non-null   object
 4   page_loads         2167 non-null   object
 5   first_time_visits  2167 non-null   object
 6   returning_visits   2167 non-null   object
dtypes: int64(2), object(5)
memory usage: 118.6+ KB


In [75]:
# [3] dfX의 object 값이 어떤 것인지 확인하기 위해 5줄 출력을 합니다.
dfX.head()

Unnamed: 0,row,day,day_of_week,date,page_loads,first_time_visits,returning_visits
0,1,Friday,6,6/26/2020,3555,2248,427
1,2,Thursday,5,8/29/2019,4221,2477,505
2,3,Friday,6,11/18/2016,4767,2734,690
3,4,Saturday,7,11/30/2019,3301,2053,390
4,5,Sunday,1,7/5/2020,2948,1836,362


In [76]:
# [4] 'page_loads', 'first_time_visits', 'returning_visits' 에 대해서
#  콤마를 없애고, int로 형변환 합니다
names = ['page_loads','first_time_visits','returning_visits']
dfX[names] = dfX[names].replace(',','',regex=True).astype(int)
dfX.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2167 entries, 0 to 2166
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   row                2167 non-null   int64 
 1   day                2167 non-null   object
 2   day_of_week        2167 non-null   int64 
 3   date               2167 non-null   object
 4   page_loads         2167 non-null   int64 
 5   first_time_visits  2167 non-null   int64 
 6   returning_visits   2167 non-null   int64 
dtypes: int64(5), object(2)
memory usage: 118.6+ KB


In [77]:
# [5] 'date'에 대해서 datetime64로 형변환 합니다.
#dfX['date'] = dfX['date'].astype('datetime64')
dfX['date'] = pd.to_datetime(dfX['date'], format='%m/%d/%Y')

In [78]:
# [6] 형변환이 올바르게 되었는지 확인합니다.
dfX.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2167 entries, 0 to 2166
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   row                2167 non-null   int64         
 1   day                2167 non-null   object        
 2   day_of_week        2167 non-null   int64         
 3   date               2167 non-null   datetime64[ns]
 4   page_loads         2167 non-null   int64         
 5   first_time_visits  2167 non-null   int64         
 6   returning_visits   2167 non-null   int64         
dtypes: datetime64[ns](1), int64(5), object(1)
memory usage: 118.6+ KB


In [79]:
# [7] day, day_of_week를 확인해 보면 동일한 데이터 인 것을 알 수 있음
# 'day' 컬럼을 제거합니다.
display(dfX['day'].head(),dfX['day_of_week'].head())
dfX2 = dfX.drop(columns='day')

Unnamed: 0,day
0,Friday
1,Thursday
2,Friday
3,Saturday
4,Sunday


Unnamed: 0,day_of_week
0,6
1,5
2,6
3,7
4,1


In [80]:
# [8] 'date'에서 'year', 'month', 'day'에 대한 정보를 dfX2에 포함시키고,
# 'date' 를 제거합니다.
# 상황에 따라 'quarter' 사용도 고려할 수 있음, 주말/주중으로 나눌 수도 있음 ...
date = pd.DataFrame()
temp = dfX2['date'].dt
date['year'] = temp.year
date['month'] = temp.month
date['day'] = temp.day
dfX3 = dfX2.drop(columns='date')
dfX3 = pd.concat([dfX3,date],axis=1)
dfX3.info()
dfX3.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2167 entries, 0 to 2166
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   row                2167 non-null   int64
 1   day_of_week        2167 non-null   int64
 2   page_loads         2167 non-null   int64
 3   first_time_visits  2167 non-null   int64
 4   returning_visits   2167 non-null   int64
 5   year               2167 non-null   int32
 6   month              2167 non-null   int32
 7   day                2167 non-null   int32
dtypes: int32(3), int64(5)
memory usage: 110.2 KB


Unnamed: 0,row,day_of_week,page_loads,first_time_visits,returning_visits,year,month,day
0,1,6,3555,2248,427,2020,6,26
1,2,5,4221,2477,505,2019,8,29
2,3,6,4767,2734,690,2016,11,18
3,4,7,3301,2053,390,2019,11,30
4,5,1,2948,1836,362,2020,7,5


In [81]:
# [9] dfX3과 Y를 merge 합니다.
# Y와 X의 feature의 관계를 분석해 보기 위해!
dfXY = pd.merge(dfX3, Y)
print(dfXY.shape, dfX3.shape, Y.shape)

(1301, 9) (2167, 8) (1301, 2)


In [82]:
# [10] dfXY의 상관계수를 확인한다
# 'unique_visits'와 상관이 매우 높은 feature들이 있음 (page_loads, first_time_visits, returning_visits)
dfXY.corr()['unique_visits']

Unnamed: 0,unique_visits
row,0.0162
day_of_week,-0.2592
page_loads,0.9885
first_time_visits,0.9962
returning_visits,0.9059
year,0.0707
month,-0.0478
day,-0.0353
unique_visits,1.0


In [83]:
# [11] 'day_of_week' 별 'unique_visits'의 평균을 구해본다.
# 2, 3, 4, 5 가 높고 1, 6, 7이 낮은 것을 볼 수 있다.
temp = dfXY.groupby('day_of_week')['unique_visits'].mean()
print(temp)

temp_2 = dfXY.groupby('day_of_week')['unique_visits'].mean().sort_values(ascending=False)
temp_2

day_of_week
1   2351.3175
2   3487.1016
3   3590.0156
4   3454.6684
5   3362.2404
6   2680.1477
7   1784.1872
Name: unique_visits, dtype: float64


Unnamed: 0_level_0,unique_visits
day_of_week,Unnamed: 1_level_1
3,3590.0156
2,3487.1016
4,3454.6684
5,3362.2404
6,2680.1477
1,2351.3175
7,1784.1872


In [84]:
# [12] 'day_of_week' 별 평균이 높은 것에 대한 데이터를 high_day_of_week로 저장
temp_mean = round(temp.mean())
print(temp_mean)
high_day_of_week = temp[temp>temp_mean].index.values
high_day_of_week

2959


array([2, 3, 4, 5])

In [85]:
# [13] 'month' 별 'unique_visits'의 평균을 구해본다.
# 2, 3, 4, 5, 10, 11이 높고 1, 6, 7이 낮은 것을 볼 수 있다.
temp = dfXY.groupby('month')['unique_visits'].mean()
temp

Unnamed: 0_level_0,unique_visits
month,Unnamed: 1_level_1
1,2504.1947
2,3164.8842
3,3306.5798
4,3835.3818
5,3324.8396
6,2619.0339
7,2315.3505
8,2266.598
9,2668.1776
10,3188.5345


In [86]:
# [14] 'month' 별 평균이 높은 것에 대한 데이터를 high_month로 저장
temp_mean = temp.mean()
high_month = temp[temp>temp_mean].index.values
high_month

array([ 2,  3,  4,  5, 10, 11], dtype=int32)

In [87]:
# [15] high_day_of_week, high_month의 index 정보를 사용해 파생 변수 생성
# dfX3에 높은 평균을 갖는 요일과 달을 1, 아닌 것을 0으로 하는 dow_h, month_h 변수 추가

#dfX3['dow_h'] = np.where(dfX3['day_of_week'].isin(high_day_of_week), 1, 0)
dfX3['month_h'] = dfX3['month'].isin(high_month).astype(int)
dfX3['dow_h'] = dfX3['day_of_week'].isin(high_day_of_week).astype(int)
dfX3.head(3)

Unnamed: 0,row,day_of_week,page_loads,first_time_visits,returning_visits,year,month,day,month_h,dow_h
0,1,6,3555,2248,427,2020,6,26,0,0
1,2,5,4221,2477,505,2019,8,29,0,1
2,3,6,4767,2734,690,2016,11,18,1,0


In [88]:
temp = dfXY.groupby('year')['unique_visits'].mean()
temp

Unnamed: 0_level_0,unique_visits
year,Unnamed: 1_level_1
2014,2588.9322
2015,2946.1948
2016,3240.0045
2017,2502.0631
2018,3027.3835
2019,3069.0755
2020,3155.5333


### 4-3.성능평가, 모델링 함수


In [89]:
#[16] 사용할 라이브러리 import
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge,Lasso

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_log_error as msle
import numpy as np



In [90]:
# [17] r2_train, r2_test, mae, mse, rmse, msle, rmsle 를 구해
#      Series로 반환하는 함수를 작성합니다.

def get_scores2(model,xtrain,xtest,ytrain,ytest):
  pred1 = model.predict(xtrain)
  pred2 = model.predict(xtest)

  # 음수가 있어도 상관없음
  A1 = r2_score(ytrain,pred1)
  A2 = r2_score(ytest,pred2)
  B = mae(ytest,pred2)
  C = mse(ytest,pred2)
  E = np.sqrt(C) #rmse

  # msle,rmsle 는 음수가 있어서는 안됨
  pred2 =  np.where(pred2<0,0,pred2)
  D = msle(ytest,pred2)
  F = np.sqrt(D) #rmsle

  data = [round(x,4) for x in [A1,A2,B,C,D,E,F]]
  names = 'r2_train r2_test mae mse rmse msle rmsle'.split()
  scores = pd.Series(data,index=names)
  return scores

In [91]:
# [18] 다양한 모델을 만들고 성능을 출력하는 함수 작성
# 결과를 DataFrame으로 만듭니다

def make_models(xtrain,xtest,ytrain,ytest,n=300,RL=False):
  temp = pd.DataFrame()

  model1 = LinearRegression().fit(xtrain,ytrain)
  temp['model1'] = get_scores2(model1,xtrain,xtest,ytrain,ytest)

  if not RL:
    model2 = DecisionTreeRegressor(random_state=0).fit(xtrain,ytrain)
    temp['model2'] = get_scores2(model2,xtrain,xtest,ytrain,ytest)

    for d in range(3,9):
      model2 = DecisionTreeRegressor(max_depth=d,random_state=0).fit(xtrain,ytrain)
      temp[f'model2_{d}'] = get_scores2(model2,xtrain,xtest,ytrain,ytest)

    model3 = RandomForestRegressor(n,random_state=0).fit(xtrain,ytrain)
    temp['model3'] = get_scores2(model3,xtrain,xtest,ytrain,ytest)

    for k in range(3,9):
      model3 = RandomForestRegressor(n,max_depth=k,random_state=0).fit(xtrain,ytrain)
      temp[f'model3_{k}'] = get_scores2(model3,xtrain,xtest,ytrain,ytest)

    model4 = XGBRegressor(objective='reg:squarederror').fit(xtrain,ytrain)
    temp['model4'] = get_scores2(model4,xtrain,xtest,ytrain,ytest)

  if RL:
    for a in [0.01,0.1,1,2]:
      model5 = Ridge(alpha=a).fit(xtrain,ytrain)
      temp[f'model5_{a}'] = get_scores2(model5,xtrain,xtest,ytrain,ytest)

    for a in [0.01,0.1,1,2]:
      model6 = Lasso(alpha=a).fit(xtrain,ytrain)
      temp[f'model6_{a}'] = get_scores2(model6,xtrain,xtest,ytrain,ytest)

  temp = temp.T
  temp.insert(2,'diff',(temp['r2_train']-temp['r2_test']).abs())

  return temp

### 4-4.데이터 분리, 모델적용

In [92]:
dfX3.columns

Index(['row', 'day_of_week', 'page_loads', 'first_time_visits',
       'returning_visits', 'year', 'month', 'day', 'month_h', 'dow_h'],
      dtype='object')

In [93]:
# [19] 학습에 사용할 feature 선택,
# dfX4로 저장
# 'page_loads','first_time_visits' => 이것을 사용하면 성능이 좋음.
# (높은 상관계수)
from sklearn.preprocessing import MinMaxScaler
Xfeatures = ['day_of_week','returning_visits', 'year', 'month', 'day', 'month_h','dow_h']
dfX4 = dfX3[Xfeatures]
#dfX4 = MinMaxScaler().fit_transform(dfX4)  # dfX4 ndarray 객체
print(dfX4.shape,type(dfX4))

(2167, 7) <class 'pandas.core.frame.DataFrame'>


In [94]:
# [20] dfX4 데이터를 X_use, X_submission 으로 다시 분리
# dfX4 분리에서는 X_use의 행의 개수 사용
# YF, Y_submissionF 생성  Y에서 'unique_visits'만 선택
train_size=len(X_use)
XF = dfX4[:train_size]
X_submissionF = dfX4[train_size:]
YF = Y['unique_visits']
Y_submissionF = Y_hidden['unique_visits']
print([x.shape for x in [XF,X_submissionF,YF,Y_submissionF]])

[(1301, 7), (866, 7), (1301,), (866,)]


In [95]:
# [21] XF, YF 데이터를 사용하여 데이터를 분리하고 make_models를 호출합니다.
from sklearn.model_selection import train_test_split

xtrain,xtest,ytrain,ytest = train_test_split(XF,YF,test_size=0.3,
                                             random_state=1234)

print([x.shape for x in [xtrain,xtest,ytrain,ytest]])
models = make_models(xtrain,xtest,ytrain,ytest)
print(models)

[(910, 7), (391, 7), (910,), (391,)]
          r2_train  r2_test   diff      mae         mse   rmse     msle  rmsle
model1      0.8530   0.8697 0.0167 299.7255 136163.3198 0.0226 369.0031 0.1504
model2      1.0000   0.8882 0.1118 255.2890 116809.7340 0.0173 341.7744 0.1314
model2_3    0.8096   0.8079 0.0017 366.3969 200785.1874 0.0292 448.0906 0.1708
model2_4    0.8512   0.8497 0.0015 320.9280 157094.0770 0.0237 396.3510 0.1538
model2_5    0.8886   0.8697 0.0189 294.6392 136193.8284 0.0201 369.0445 0.1417
model2_6    0.9264   0.8907 0.0357 263.0719 114247.2981 0.0175 338.0049 0.1321
model2_7    0.9507   0.9057 0.0450 248.4075  98534.3498 0.0151 313.9018 0.1231
model2_8    0.9715   0.8926 0.0789 253.9273 112220.2071 0.0171 334.9928 0.1309
model3      0.9913   0.9443 0.0470 188.8357  58181.7718 0.0094 241.2090 0.0968
model3_3    0.8401   0.8379 0.0022 338.3365 169358.3283 0.0252 411.5317 0.1587
model3_4    0.8868   0.8781 0.0087 290.4683 127346.6211 0.0197 356.8566 0.1404
model3_5    0.9

In [96]:
# [22] 여러 가지 조건으로 정렬하고 평가할 수 있음
print(models.sort_values('rmsle'))

          r2_train  r2_test   diff      mae         mse   rmse     msle  rmsle
model4      0.9997   0.9591 0.0406 158.6566  42758.5216 0.0064 206.7813 0.0803
model3      0.9913   0.9443 0.0470 188.8357  58181.7718 0.0094 241.2090 0.0968
model3_8    0.9770   0.9381 0.0389 199.5254  64679.5589 0.0104 254.3218 0.1018
model3_7    0.9658   0.9324 0.0334 209.6980  70616.0249 0.0113 265.7368 0.1065
model3_6    0.9488   0.9218 0.0270 226.3324  81704.1767 0.0130 285.8394 0.1141
model2_7    0.9507   0.9057 0.0450 248.4075  98534.3498 0.0151 313.9018 0.1231
model3_5    0.9251   0.9070 0.0181 249.3637  97209.8099 0.0154 311.7849 0.1240
model2_8    0.9715   0.8926 0.0789 253.9273 112220.2071 0.0171 334.9928 0.1309
model2      1.0000   0.8882 0.1118 255.2890 116809.7340 0.0173 341.7744 0.1314
model2_6    0.9264   0.8907 0.0357 263.0719 114247.2981 0.0175 338.0049 0.1321
model3_4    0.8868   0.8781 0.0087 290.4683 127346.6211 0.0197 356.8566 0.1404
model2_5    0.8886   0.8697 0.0189 294.6392 136193.8

In [97]:
# [23] Polynomial Regression  + Ridge, Lasso 실행
from sklearn.preprocessing import MinMaxScaler

dfX4 = dfX3[Xfeatures]
Xscaled = MinMaxScaler().fit_transform(dfX4)
XPoly = PolynomialFeatures(degree=2).fit_transform(Xscaled)
XF2 = XPoly[:train_size]

xtrain2,xtest2,ytrain2,ytest2 = train_test_split(XF2,YF,
                                                test_size = 0.3,
                                                random_state = 1234)
print([x.shape for x in [xtrain2,xtest2,ytrain2,ytest2]])

[(910, 36), (391, 36), (910,), (391,)]


In [98]:
models= make_models(xtrain2,xtest2,ytrain2,ytest2,RL=True)
print(models)

             r2_train  r2_test   diff      mae        mse   rmse     msle  \
model1         0.9352   0.9456 0.0104 186.4174 56878.6641 0.0089 238.4925   
model5_0.01    0.9353   0.9457 0.0104 186.3197 56771.4721 0.0089 238.2676   
model5_0.1     0.9345   0.9450 0.0105 187.6336 57467.9433 0.0089 239.7247   
model5_1       0.9203   0.9304 0.0101 213.7556 72686.1275 0.0121 269.6036   
model5_2       0.9092   0.9189 0.0097 230.9592 84699.0334 0.0147 291.0310   
model6_0.01    0.9353   0.9457 0.0104 186.1680 56692.2296 0.0089 238.1013   
model6_0.1     0.9351   0.9459 0.0108 185.5255 56529.9866 0.0088 237.7604   
model6_1       0.9293   0.9401 0.0108 197.3332 62635.0517 0.0098 250.2700   
model6_2       0.9181   0.9286 0.0105 218.5983 74629.1915 0.0120 273.1834   

             rmsle  
model1      0.0943  
model5_0.01 0.0942  
model5_0.1  0.0946  
model5_1    0.1100  
model5_2    0.1211  
model6_0.01 0.0942  
model6_0.1  0.0940  
model6_1    0.0990  
model6_2    0.1096  


In [99]:
# [24] 여러 가지 조건으로 정렬하고 평가할 수 있음
print(models.sort_values('rmsle'))

             r2_train  r2_test   diff      mae        mse   rmse     msle  \
model6_0.1     0.9351   0.9459 0.0108 185.5255 56529.9866 0.0088 237.7604   
model5_0.01    0.9353   0.9457 0.0104 186.3197 56771.4721 0.0089 238.2676   
model6_0.01    0.9353   0.9457 0.0104 186.1680 56692.2296 0.0089 238.1013   
model1         0.9352   0.9456 0.0104 186.4174 56878.6641 0.0089 238.4925   
model5_0.1     0.9345   0.9450 0.0105 187.6336 57467.9433 0.0089 239.7247   
model6_1       0.9293   0.9401 0.0108 197.3332 62635.0517 0.0098 250.2700   
model6_2       0.9181   0.9286 0.0105 218.5983 74629.1915 0.0120 273.1834   
model5_1       0.9203   0.9304 0.0101 213.7556 72686.1275 0.0121 269.6036   
model5_2       0.9092   0.9189 0.0097 230.9592 84699.0334 0.0147 291.0310   

             rmsle  
model6_0.1  0.0940  
model5_0.01 0.0942  
model6_0.01 0.0942  
model1      0.0943  
model5_0.1  0.0946  
model6_1    0.0990  
model6_2    0.1096  
model5_1    0.1100  
model5_2    0.1211  


In [100]:
# XF, YF
# model4      0.9694   0.9613 0.0081 152.6876  40391.0413 0.0058 200.9752 0.0764
# 번외  # model3      0.9998   0.9987 0.0011 23.3542    1308.1246 0.0002 36.1680  0.0134

# XF2, YF
# model6_0.1  0.9506   0.9563 0.0057 170.1574 45614.3617 0.0073 213.5752 0.0854

In [101]:
# [TIP] 학습 중 warning 메시지 없애기
import warnings
warnings.filterwarnings('ignore')

### 4-5. 모델 선택, 결과 제출

In [102]:
# [25] xtrain, ytrain으로 모델 학습시켜 model 로 저장
model = XGBRegressor(objective = 'reg:squarederror').fit(xtrain,ytrain)
print(get_scores2(model,xtrain,xtest,ytrain,ytest))

r2_train       0.9997
r2_test        0.9591
mae          158.6566
mse        42758.5216
rmse           0.0064
msle         206.7813
rmsle          0.0803
dtype: float64


In [103]:
# [26] X_submissionF 데이터로 pred 구하고 submission DataFrame 생성
pred = model.predict(X_submissionF)
submission = pd.DataFrame({'row' : X_submission['row'],
                           'unique_visits': pred})
submission.head()

Unnamed: 0,row,unique_visits
0,1302,2496.6226
1,1303,4134.2627
2,1304,3336.197
3,1305,2672.9907
4,1306,1299.6127


In [104]:
# [29] 파일로 저장
submission.to_csv('00001000.csv', index=False)