In [2]:
import os
from glob import glob

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from contextlib import contextmanager
from time import time
import matplotlib.pyplot as plt
import seaborn as sns
import random
import shutil

%matplotlib inline


# ref: Kaggleコード遺産 https://qiita.com/kaggle_grandmaster-arai-san/items/d59b2fb7142ec7e270a5 
class Timer:
    """Context manager that measures and reports the wall-clock time of a block.

    On exit the elapsed seconds are formatted with ``format_str`` and either
    sent to ``logger.info`` (when a logger was given) or printed.

    Args:
        logger: optional object with an ``info(str)`` method; ``print`` is used
            when it is falsy.
        format_str: ``str.format`` template that receives the duration in seconds.
        prefix: optional value prepended to the template, separated by ``sep``.
        suffix: optional value appended to the template, separated by ``sep``.
        sep: separator placed between prefix/suffix and the template.
    """

    def __init__(self, logger=None, format_str="{:.3f}[s]", prefix=None, suffix=None, sep=" "):
        if prefix:
            format_str = str(prefix) + sep + format_str
        if suffix:
            format_str = format_str + sep + str(suffix)
        self.format_str = format_str
        self.logger = logger
        self.start = None
        self.end = None

    @property
    def duration(self):
        """Elapsed seconds between enter and exit; 0 while the block has not exited."""
        if self.end is None:
            return 0
        return self.end - self.start

    def __enter__(self):
        self.start = time()
        # Return the timer so that `with Timer() as t:` binds the instance
        # (previously `t` was None and `t.duration` could not be read).
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time()
        out_str = self.format_str.format(self.duration)
        if self.logger:
            self.logger.info(out_str)
        else:
            print(out_str)


def seed_everything(seed: int):
    """Seed Python's `random`, NumPy's global RNG, and set PYTHONHASHSEED.

    Args:
        seed: integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
    
# Fix the seeds for reproducibility.
seed_everything(427)

from pathlib import Path
# Directory that holds the input data files
DATA_DIR = Path("../data/")

# CSV inputs, in order: training log, training labels (ground truth),
# yado (lodging) master data, test-period log, sample submission, test sessions.
(
    train_log,
    train_label,
    yado,
    test_log,
    sample_submission,
    test_session,
) = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in (
        "train_log.csv",
        "train_label.csv",
        "yado.csv",
        "test_log.csv",
        "sample_submission.csv",
        "test_session.csv",
    )
)

# Image data
image = pd.read_parquet(DATA_DIR / "image_embeddings.parquet")

# The combined train + test log is referenced later, so build it up front.
whole_log = pd.concat([train_log, test_log], ignore_index=True)

In [3]:
import pandas as pd
from collections import defaultdict
from heapq import heappush, heappop

In [4]:
# Data preparation

# Lookup from each train_label session to the yad that was actually reserved.
# Each train_label row is unpacked as a (session_id, reserved yad_no) pair;
# a defaultdict(int) is kept so unknown sessions resolve to 0.
map_reserved = defaultdict(int)
map_reserved.update(tuple(label_row) for _, label_row in train_label.iterrows())

def Make_session_list(session_log):
  """Convert a long-format session log into {session_id: [viewed yad_no, ...]}.

  Columns are addressed by position: column 0 is read as the session id and
  column 2 as the yad_no. They are selected with `.iloc`, because integer
  keys on a string-labelled row Series (`row[0]`) are deprecated in pandas.

  Args:
    session_log: DataFrame with at least three columns, one row per view.

  Returns:
    defaultdict(list) mapping each session id to its yad_no values in row order.
  """
  map_session_yads = defaultdict(list)
  session_ids = session_log.iloc[:, 0]
  yad_nos = session_log.iloc[:, 2]
  for session_id, yad_no in zip(session_ids, yad_nos):
    map_session_yads[session_id].append(yad_no)
  return map_session_yads

map_session_yads_train = Make_session_list(train_log)
map_session_yads_test = Make_session_list(test_log)

# D[v][r] := number of train sessions whose last viewed yad was v and that reserved yad r
D = defaultdict(lambda: defaultdict(int))
for session_id, viewed_yad_no in map_session_yads_train.items():
  last_viewed = viewed_yad_no[-1]
  # map_reserved is a defaultdict(int): a session without a label counts towards r = 0
  reserved = map_reserved[session_id]
  D[last_viewed][reserved] += 1
  # The two trailing `row[...]` assignments were removed: `row` is not defined
  # at module level here, so they raised NameError on a fresh kernel.


In [5]:
# For each session_id, precompute the sml_cd that occurs most often in its test log
_test_log = test_log.merge(yado, how="left", on="yad_no")
sml_cd_mode_df = (
    _test_log
    .groupby('session_id')['sml_cd']
    .agg(lambda codes: codes.mode().iloc[0])
    .reset_index()
)
sml_cd_dict = dict(
    sml_cd_mode_df[['session_id', 'sml_cd']].itertuples(index=False, name=None)
)

# Count how many times each yad appears in the test log
yad_count = test_log.groupby("yad_no").size().reset_index(name="yad_count")

# Attach the yado master data and order by appearance count, most frequent first
_df = (
    yad_count
    .merge(yado, on="yad_no", how="left")
    .sort_values("yad_count", ascending=False)
)

# From that ordering, keep the first 10 rows of every sml_cd
sml_top_10 = (
    _df
    .groupby("sml_cd")
    .head(10)
    .sort_values("yad_count", ascending=False)
)


In [6]:
# Data preparation
# NOTE(review): this cell repeats an earlier data-preparation cell of the notebook;
# consider keeping only one copy.

# Lookup from each train_label session to the yad that was actually reserved.
# Each train_label row is unpacked as a (session_id, reserved yad_no) pair;
# a defaultdict(int) is kept so unknown sessions resolve to 0.
map_reserved = defaultdict(int)
map_reserved.update(tuple(label_row) for _, label_row in train_label.iterrows())

# NOTE(review): Make_session_list is also defined in an earlier cell; this
# definition shadows it. Consider keeping a single definition.
def Make_session_list(session_log):
  """Convert a long-format session log into {session_id: [viewed yad_no, ...]}.

  Columns are addressed by position: column 0 is read as the session id and
  column 2 as the yad_no. They are selected with `.iloc`, because integer
  keys on a string-labelled row Series (`row[0]`) are deprecated in pandas.

  Args:
    session_log: DataFrame with at least three columns, one row per view.

  Returns:
    defaultdict(list) mapping each session id to its yad_no values in row order.
  """
  map_session_yads = defaultdict(list)
  session_ids = session_log.iloc[:, 0]
  yad_nos = session_log.iloc[:, 2]
  for session_id, yad_no in zip(session_ids, yad_nos):
    map_session_yads[session_id].append(yad_no)
  return map_session_yads

map_session_yads_train = Make_session_list(train_log)
map_session_yads_test = Make_session_list(test_log)

# D[v][r] := number of train sessions whose last viewed yad was v and that reserved yad r
D = defaultdict(lambda: defaultdict(int))
for session_id, viewed_yad_no in map_session_yads_train.items():
  last_viewed = viewed_yad_no[-1]
  # map_reserved is a defaultdict(int): a session without a label counts towards r = 0
  reserved = map_reserved[session_id]
  D[last_viewed][reserved] += 1
  # The two trailing `row[...]` assignments were removed: `row` is not defined
  # at module level here, so they raised NameError on a fresh kernel.


In [7]:
# Produce the predictions from test_log.
#
# For every test session, candidates are ranked in this order:
# 1. If the session viewed two or more yads, the second-to-last viewed yad comes first.
# 2. Then the yads r with the largest D[v][r], where v is the last viewed yad
#    (count descending, ties broken by the smaller yad_no).
# 3. If slots remain, the most-viewed yads of the session's most frequent sml_cd.
# A yad_no is emitted at most once per session; slots that cannot be filled stay 0.
N_PREDICTIONS = 10

# Hoisted out of the session loop: sml_cd -> yad_nos in sml_top_10 row order.
area_top_yads = defaultdict(list)
for area_cd, area_yad_no in zip(sml_top_10["sml_cd"], sml_top_10["yad_no"]):
  area_top_yads[area_cd].append(area_yad_no)

test_session_number = len(test_session)
Predicted_List = [[0] * N_PREDICTIONS for _ in range(test_session_number)]
for idx, session_id in enumerate(test_session["session_id"]):
  viewed_yads = map_session_yads_test[session_id]
  last_viewed = viewed_yads[-1]

  candidates = []
  if len(viewed_yads) > 1:
    candidates.append(viewed_yads[-2])

  # .get() avoids inserting empty entries into D for yads never seen in training
  co_reserved = D.get(last_viewed, {})
  candidates.extend(
    yad_no
    for yad_no, _cnt in sorted(co_reserved.items(), key=lambda kv: (-kv[1], kv[0]))
  )

  # Area fallback; .get() yields no candidates when the session or area is unknown
  candidates.extend(area_top_yads.get(sml_cd_dict.get(session_id), []))

  # Fill the slots in candidate order, skipping yads already placed for this session
  rank = 0
  already_placed = set()
  for yad_no in candidates:
    if yad_no in already_placed:
      continue
    already_placed.add(yad_no)
    Predicted_List[idx][rank] = yad_no
    rank += 1
    if rank == N_PREDICTIONS:
      break

df_submit = pd.DataFrame(Predicted_List, columns=[f"predict_{i}" for i in range(N_PREDICTIONS)])
df_submit.to_csv("../#9_submission.csv", index=False)