In [1]:
import os
import sys

import pandas as pd
import psycopg2

In [2]:
# Connect to PostgreSQL.
# The credentials are read from a `connection_info` module located one
# directory above the notebook, so the parent directory is added to the
# import path before importing it.
sys.path.append('..')
import connection_info

users = connection_info.users
dbnames = connection_info.dbnames
passwords = connection_info.passwords

# Pass the credentials as keyword arguments rather than concatenating a DSN
# string: with concatenation, a value containing spaces, quotes or backslashes
# would be mis-parsed as additional connection parameters.
conn = psycopg2.connect(user=users, dbname=dbnames, password=passwords)

## 3-1 データ数，種類数の算出

In [4]:
# Per-hotel counts: number of reservations (rows with a non-NULL reserve_id)
# and number of distinct customers, grouped by hotel_id.
reserve3_1 = pd.read_sql(
    con=conn,
    sql="""
SELECT
  -- 集約単位のホテルIDの抽出
  hotel_id,

  -- COUNT関数にreserve_idを指定しているので、reserve_idがNULLでない行数をカウント
  COUNT(reserve_id) AS rsv_cnt,

  -- customer_idにdistinctを付け、重複を排除
  -- 重複を排除したcustomer_idの数をカウント
  COUNT(distinct customer_id) AS cus_cnt

FROM work.reserve_tb

-- GROUP BY句で集約する単位をhotel_idに指定
GROUP BY hotel_id
""",
)

In [5]:
# Display the per-hotel reservation and customer counts (the whole frame, not just a preview).
reserve3_1

Unnamed: 0,hotel_id,rsv_cnt,cus_cnt
0,h_1,10,10
1,h_10,3,3
2,h_100,20,19
3,h_101,17,17
4,h_102,13,13
5,h_103,10,10
6,h_104,11,11
7,h_105,15,15
8,h_106,9,9
9,h_107,11,11


## 3-2　合計値の算出

### 3-2 Awesome

In [6]:
# Total sales (sum of total_price) for every combination of hotel_id and
# people_num.
reserve3_2 = pd.read_sql(
    con=conn,
    sql="""
SELECT
  hotel_id,
  people_num,

  -- SUM関数にtotal_priceを指定し、売上合計金額を算出
  SUM(total_price) AS price_sum

FROM work.reserve_tb

-- 集約単位をhotel_idとpeople_numの組み合わせに指定
GROUP BY hotel_id, people_num
""",
)

In [7]:
# Preview the first rows of the sales totals per hotel and party size.
reserve3_2.head()

Unnamed: 0,hotel_id,people_num,price_sum
0,h_267,1,49000
1,h_290,4,699600
2,h_262,4,684000
3,h_223,1,753500
4,h_139,2,316000


## 3-3　極値，代表値の算出

### 3-3 Awesome

In [9]:
# Per-hotel price statistics on total_price: maximum, minimum, average,
# median and 20th percentile. The query has no MEDIAN call; the median is
# taken as the 50th percentile via PERCENTILE_CONT(0.5).
reserve3_3 = pd.read_sql(
    con=conn,
    sql="""
SELECT
  hotel_id,

  -- total_priceの最大値を算出
  MAX(total_price) AS price_max,

  -- total_priceの最小値を算出
  MIN(total_price) AS price_min,

  -- total_priceの平均値を算出
  AVG(total_price) AS price_avg,

  -- total_priceの中央値を算出
  -- MEDIAN(total_price) AS price_med,

  -- posgresqlにはMEDIAN関数がないから， 50 パーセンタイルを求める
  PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY total_price) AS price_med,

  -- PERCENTILE_CONT関数に0.2を指定し、20パーセントタイル値を算出
  -- ORDER BY句にtotal_priceを指定し、パーセンタイル値の対象列とデータの並べ方を指定
  PERCENTILE_CONT(0.2) WITHIN GROUP(ORDER BY total_price) AS price_20per

FROM work.reserve_tb
GROUP BY hotel_id
""",
)

In [10]:
# Preview the first rows of the per-hotel price statistics.
reserve3_3.head()

Unnamed: 0,hotel_id,price_max,price_min,price_avg,price_med,price_20per
0,h_1,208800,26100,112230.0,104400.0,73080.0
1,h_10,67200,11200,42933.333333,50400.0,26880.0
2,h_100,57600,4800,27600.0,28800.0,9600.0
3,h_101,168000,14000,75764.705882,56000.0,30800.0
4,h_102,72000,12000,32769.230769,24000.0,18000.0


## 3-4　ばらつき具合の算出

### 3-4 Awesome

In [11]:
# Per-hotel spread of total_price: variance and standard deviation.
# COALESCE replaces a NULL result with 0 for both columns.
reserve3_4 = pd.read_sql(
    con=conn,
    sql="""
SELECT
  hotel_id,

  -- VARIANCE関数にtotal_priceを指定し、分散値を算出
  -- COALESCE関数によって、分散値がNULLのときは0に変換
  COALESCE(VARIANCE(total_price), 0) AS price_var,

  -- データ数が2件以上の場合は、STDDEV関数にtotal_priceを指定し、標準偏差値を算出
  COALESCE(STDDEV(total_price), 0) AS price_std

FROM work.reserve_tb
GROUP BY hotel_id
""",
)

In [12]:
# Preview the first rows of the per-hotel variance / standard deviation.
reserve3_4.head()

Unnamed: 0,hotel_id,price_var,price_std
0,h_148,1224510000.0,34992.9993
1,h_253,305694300.0,17484.1161
2,h_113,4894901000.0,69963.566526
3,h_137,429387500.0,20721.666741
4,h_218,4108434000.0,64097.065639


## 3-5　最頻値の算出

### 3-5 Not Awesome

In [13]:
# Mode of total_price rounded to the nearest 1000 ("not awesome" variant):
# the CTE counts reservations per rounded price, then the outer query keeps
# every rounded price whose count equals the maximum count.
reserve3_5NA = pd.read_sql(
    con=conn,
    sql="""
WITH rsv_cnt_table AS(
  SELECT
    -- Round関数によって四捨五入し、total_priceを1000単位の値に変換
    ROUND(total_price, -3) AS total_price_round,

    -- COUNT関数で金額別の予約数を算出
    COUNT(*) AS rsv_cnt

  FROM work.reserve_tb

  -- ASで新たに命名した列名total_price_roundを指定して、予約金額の1000単位で集約
  GROUP BY total_price_round
)
SELECT
  total_price_round
FROM rsv_cnt_table

-- ()内のクエリによって最頻値の値を取得し、WHERE句で最頻値と一致するものを抽出
WHERE rsv_cnt = (SELECT max(rsv_cnt) FROM rsv_cnt_table)
""",
)

In [14]:
# Display the most frequent rounded price found by the CTE-based query.
reserve3_5NA

Unnamed: 0,total_price_round
0,20000.0


### 3-5 Awesome

In [16]:
# Mode of total_price rounded to the nearest 1000 ("awesome" variant):
# group by the rounded price, sort the groups by their reservation count
# (largest first) and keep only the top row.
# The secondary sort key makes the result reproducible when several price
# buckets share the highest count; without it the database may return any
# one of them.
reserve3_5Awe = pd.read_sql(
    sql="""
SELECT
  ROUND(total_price, -3) AS total_price_round
FROM work.reserve_tb
GROUP BY total_price_round

-- Sort the price buckets by reservation count, largest first
-- (DESC means descending order). COUNT(*) counts the rows in each group.
-- total_price_round is a tie-breaker so the result is deterministic.
ORDER BY COUNT(*) DESC, total_price_round

-- LIMIT句で最初の1件のみ結果を取得
LIMIT 1
""",
    con=conn,
)

In [17]:
# Display the most frequent rounded price found by the ORDER BY / LIMIT query.
reserve3_5Awe

Unnamed: 0,total_price_round
0,20000.0


## 3-6　順位の算出

### 3-6 Awesome

In [19]:
# All reservation columns plus log_no: a per-customer sequence number
# assigned by ROW_NUMBER(), ordered by reserve_datetime ascending within
# each customer_id.
reserve3_6 = pd.read_sql(
    con=conn,
    sql="""
SELECT
  *,

  -- ROW_NUMBERで順位を取得
  -- PARTITION by customer_idで顧客ごとに順位を取得するよう設定
  -- ORDER BY reserve_datetimeで順位を予約日時の古い順に設定
  ROW_NUMBER()
    OVER (PARTITION BY customer_id ORDER BY reserve_datetime) AS log_no

FROM work.reserve_tb
""",
)

In [21]:
# Preview the first rows of the reservations with their per-customer sequence number.
reserve3_6.head()

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price,log_no
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200,1
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600,2
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600,3
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400,4
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100,5


### 3-6b Awesome

In [22]:
# Rank hotels by their number of reservations, largest count first.
# GROUP BY hotel_id produces the per-hotel counts; RANK() is then applied
# over those aggregated counts.
reserve3_6b = pd.read_sql(
    con=conn,
    sql="""
SELECT
  hotel_id,

  -- RANK関数で予約数の順位を指定
  -- COUNT(*)をRANKの基準として指定(集約したあとの予約数に対して順位を付ける算出処理)
  -- DESCを付けることによって、降順を指定
  RANK() OVER (ORDER BY COUNT(*) DESC) AS rsv_cnt_rank

FROM work.reserve_tb

-- hotel_idを集約単位に指定、予約数を計算するための集約指定でRANK関数には関係なし
GROUP BY hotel_id
""",
)

In [23]:
# Preview the first rows of the hotel ranking by reservation count.
reserve3_6b.head()

Unnamed: 0,hotel_id,rsv_cnt_rank
0,h_241,1
1,h_144,2
2,h_37,3
3,h_142,3
4,h_178,5
