In [1]:
import sys
import numpy as np
import pandas as pd

In [2]:
# Add the directory two levels up (relative to the working directory) to the
# module search path so the project-local `preprocess` package can be imported.
sys.path.append("../..")
from preprocess.load_data.data_loader import load_hotel_reserve
# Load the tables used throughout this notebook; the loader's result is
# unpacked, in order, into the customer, hotel and reservation tables.
customer_tb, hotel_tb, reserve_tb = load_hotel_reserve()

## 4-1 マスタテーブルの結合

### 4-1 Not Awesome

In [4]:
# "Not Awesome" variant: inner-join every reservation to its hotel on hotel_id
# first, and only afterwards keep the rows where people_num is 1 and
# is_business is True. The filter runs on the full joined table.
reserve_4_1NA = (
    reserve_tb
    .merge(hotel_tb, on='hotel_id', how='inner')
    .query('people_num == 1 & is_business')
)

In [7]:
# Preview the first rows of the join-then-filter result.
reserve_4_1NA.head()

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price,base_price,big_area_name,small_area_name,hotel_latitude,hotel_longitude,is_business
14,r310,h_219,c_73,2017-03-30 01:52:36,2017-04-18,11:00:00,2017-04-21,1,30900,10300,B,B-3,35.644729,139.693389,True
18,r1433,h_219,c_348,2017-04-08 03:19:47,2017-04-25,10:00:00,2017-04-28,1,30900,10300,B,B-3,35.644729,139.693389,True
21,r2992,h_219,c_756,2016-02-17 14:02:23,2016-03-01,12:00:00,2016-03-03,1,20600,10300,B,B-3,35.644729,139.693389,True
24,r3522,h_219,c_875,2016-09-09 07:10:21,2016-09-20,10:00:00,2016-09-21,1,10300,10300,B,B-3,35.644729,139.693389,True
101,r7,h_256,c_1,2017-12-29 10:38:36,2018-01-25,10:30:00,2018-01-28,1,103500,34500,C,C-1,38.237294,140.696131,True


### 4-1 Awesome

In [8]:
# "Awesome" variant: filter each table before joining, so the inner join on
# hotel_id only has to match single-guest reservations against business hotels.
reserve_4_1Awe = (
    reserve_tb.query('people_num == 1')
    .merge(hotel_tb.query('is_business'), on='hotel_id', how='inner')
)

In [10]:
# Preview the first rows of the filter-then-join result.
reserve_4_1Awe.head()

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price,base_price,big_area_name,small_area_name,hotel_latitude,hotel_longitude,is_business
0,r7,h_256,c_1,2017-12-29 10:38:36,2018-01-25,10:30:00,2018-01-28,1,103500,34500,C,C-1,38.237294,140.696131,True
1,r997,h_256,c_244,2016-10-15 22:47:40,2016-10-31,10:30:00,2016-11-02,1,69000,34500,C,C-1,38.237294,140.696131,True
2,r2602,h_256,c_650,2016-05-10 00:42:56,2016-05-12,11:00:00,2016-05-14,1,69000,34500,C,C-1,38.237294,140.696131,True
3,r3738,h_256,c_930,2017-04-12 09:53:00,2017-05-08,11:30:00,2017-05-09,1,34500,34500,C,C-1,38.237294,140.696131,True
4,r11,h_183,c_2,2016-11-19 12:49:10,2016-12-08,11:00:00,2016-12-11,1,29700,9900,G,G-4,33.595248,130.633567,True


## 4-2 条件に応じた結合テーブルの切り替え

### 4-2 Awesome

In [11]:
# ガベージコレクション(必要ないメモリの解放)のためのライブラリ
import gc

In [12]:
# Count the number of hotels in each (big_area_name, small_area_name) pair.
# size() on the grouped frame yields a Series indexed by the group keys;
# reset_index(name=...) turns the keys back into columns and names the count
# column in one step. This avoids relying on the return type of
# groupby(as_index=False).size(), which differs between pandas versions and
# made the old positional `.columns = [...]` assignment fail on newer ones.
small_area_mst = (
    hotel_tb
    .groupby(['big_area_name', 'small_area_name'])
    .size()
    .reset_index(name='hotel_cnt')
)

In [13]:
# Preview the per-area hotel counts.
small_area_mst.head()

Unnamed: 0,big_area_name,small_area_name,hotel_cnt
0,A,A-1,35
1,A,A-3,30
2,B,B-1,15
3,B,B-2,18
4,B,B-3,19


In [14]:
# Choose the area granularity used for joining:
#   - at least 20 other hotels in the small area -> use small_area_name
#   - fewer than 20                              -> fall back to big_area_name
# One is subtracted from hotel_cnt to exclude the hotel itself from the count.
small_area_mst['join_area_id'] = np.where(
    small_area_mst['hotel_cnt'].sub(1).ge(20),
    small_area_mst['small_area_name'],
    small_area_mst['big_area_name'],
)

In [15]:
# Preview the table with the newly added join_area_id column.
small_area_mst.head()

Unnamed: 0,big_area_name,small_area_name,hotel_cnt,join_area_id
0,A,A-1,35,A-1
1,A,A-3,30,A-3
2,B,B-1,15,B
3,B,B-2,18,B
4,B,B-3,19,B


In [16]:
# Remove the columns that are no longer needed now that join_area_id exists.
small_area_mst.drop(columns=['hotel_cnt', 'big_area_name'], inplace=True)

In [17]:
# Preview the trimmed mapping from small_area_name to join_area_id.
small_area_mst.head()

Unnamed: 0,small_area_name,join_area_id
0,A-1,A-1
1,A-3,A-3
2,B-1,B
3,B-2,B
4,B-3,B


In [18]:
# Attach join_area_id to every hotel that recommendations are generated for,
# by joining small_area_mst on small_area_name, then keep only the two
# columns needed downstream.
base_hotel_mst = hotel_tb.merge(small_area_mst, on='small_area_name')[
    ['hotel_id', 'join_area_id']
]

In [19]:
# Preview the base-hotel table (hotel_id with its join_area_id).
base_hotel_mst.head()

Unnamed: 0,hotel_id,join_area_id
0,h_1,D
1,h_79,D
2,h_125,D
3,h_127,D
4,h_129,D


In [20]:
# Optional memory release: not required, but useful when memory is tight.
# `del` removes the name binding; gc.collect() then forces a collection pass
# (its return value, shown as the cell output, is the number of unreachable
# objects found).
del small_area_mst
gc.collect()

250

In [21]:
# recommend_hotel_mst is the table of recommendation candidates. Every hotel
# appears twice, once keyed by its small area and once by its big area, so it
# can be matched against whichever join_area_id a base hotel was assigned.
recommend_hotel_mst = pd.concat(
    [
        # Candidate master keyed by small_area_name (used as join_area_id)
        hotel_tb[['small_area_name', 'hotel_id']]
            .rename(columns={'small_area_name': 'join_area_id'}),

        # Candidate master keyed by big_area_name (used as join_area_id)
        hotel_tb[['big_area_name', 'hotel_id']]
            .rename(columns={'big_area_name': 'join_area_id'}),
    ],
    # Both halves carry hotel_tb's index; renumber so row labels stay unique.
    ignore_index=True,
)

In [22]:
# Preview the recommendation-candidate table.
recommend_hotel_mst.head()

Unnamed: 0,join_area_id,hotel_id
0,D-2,h_1
1,A-1,h_2
2,E-4,h_3
3,C-3,h_4
4,G-3,h_5


In [23]:
# Rename hotel_id on the candidate side so that it does not collide with
# base_hotel_mst's hotel_id column when the two tables are joined.
recommend_hotel_mst.rename(
    {'hotel_id': 'rec_hotel_id'}, axis='columns', inplace=True
)

In [24]:
# Join base hotels to recommendation candidates that share the same
# join_area_id, keep only the id pair, and drop the rows where a hotel would
# be recommended to itself.
(
    base_hotel_mst
    .merge(recommend_hotel_mst, on='join_area_id')
    [['hotel_id', 'rec_hotel_id']]
    .query('hotel_id != rec_hotel_id')
)

Unnamed: 0,hotel_id,rec_hotel_id
1,h_1,h_14
2,h_1,h_22
3,h_1,h_27
4,h_1,h_40
5,h_1,h_45
6,h_1,h_77
7,h_1,h_79
8,h_1,h_85
9,h_1,h_91
10,h_1,h_103


## 4-3 過去データの結合

### 4-3a n件前のデータ取得
### Awesome

In [25]:
# Order each customer's reservations by reserve_datetime.
# Applying sort_values inside groupby(...).apply sorts every group on its own;
# the default axis (0) sorts rows, and a new frame is returned rather than
# sorting in place.
# NOTE(review): newer pandas releases reportedly warn when apply() operates on
# the grouping column - confirm against the pandas version in use.
result4_3a = (
    reserve_tb
    .groupby('customer_id')
    .apply(lambda grp: grp.sort_values('reserve_datetime'))
)

In [26]:
# Preview the per-customer, time-ordered reservations.
result4_3a.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
c_1,0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200
c_1,1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600
c_1,2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600
c_1,3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400
c_1,4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100


In [28]:
# result4_3a is already arranged per customer_id (it is an index level).
# For each customer, store the total_price from two reservations earlier as
# before_price. shift(periods=2) moves the values down by two rows within each
# group, leaving NaN in the first two rows of every customer.
result4_3a['before_price'] = (
    result4_3a['total_price']
    .groupby(level='customer_id')
    .shift(periods=2)
)

In [29]:
# Done: preview the result including the new before_price column.
result4_3a.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price,before_price
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
c_1,0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200,
c_1,1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600,
c_1,2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600,97200.0
c_1,3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400,20600.0
c_1,4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100,33600.0


In [30]:
# Show the shifted series on its own: total_price moved down by two rows
# within each customer_id group.
result4_3a['total_price'].groupby('customer_id').shift(periods=2)

customer_id      
c_1          0            NaN
             1            NaN
             2        97200.0
             3        20600.0
             4        33600.0
             5       194400.0
             6        68100.0
             7        36000.0
c_10         58           NaN
             59           NaN
             60      110400.0
             61       36000.0
             62      187200.0
             63       89600.0
c_100        434          NaN
             435          NaN
             436      13800.0
             437      72300.0
             438      33600.0
c_1000       4028         NaN
             4029         NaN
c_101        439          NaN
             440          NaN
             441      64000.0
             442      15300.0
             443     111600.0
c_102        444          NaN
c_103        445          NaN
             446          NaN
             447      11200.0
                       ...   
c_993        3998         NaN
             3999     

### 4-3b 過去n件の合計値
### Awesome