In [1]:
import collections
import itertools
import json
import numpy as np
import os
import pandas as pd
import plotly.graph_objects as go
import re

from nltk.stem import PorterStemmer

# Introduction

<b> Goal: for each user, recommend restaurants using a contextual bandit. </b>

1. Create a supervised ML model that predicts Click / No Click.
2. Given the predicted click probability for each of several candidate recommendations, dynamically pick one using Thompson Sampling.

# Data Extraction & Cleansing

Yelp data from: <a href="https://www.kaggle.com/yelp-dataset/yelp-dataset">Kaggle yelp dataset</a>

In [2]:
def json2df(fjson):
    """
    Build a DataFrame from JSON Lines input (one JSON document per line).

    Parameters
    ----------
    fjson : iterable of str
        An open text file, or any other iterable that yields one line at a
        time. Every non-blank line must be a complete JSON document.

    Returns
    -------
    pandas.DataFrame
        The result of passing the list of parsed lines to ``pd.DataFrame``.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Blank / whitespace-only lines (e.g. a trailing newline at the end of the
    # file) are skipped; json.loads would otherwise raise on them.
    records = [json.loads(line) for line in fjson if line.strip()]
    return pd.DataFrame(records)

In [20]:
def _load_yelp(name):
    """
    Read data/yelp/yelp_academic_dataset_<name>.json into a DataFrame.

    The file is opened in a ``with`` block so the handle is closed as soon as
    it has been parsed.
    """
    # JSON text is UTF-8 by specification; pin it rather than rely on the
    # platform's default encoding.
    with open(f"data/yelp/yelp_academic_dataset_{name}.json", encoding="utf-8") as fjson:
        return json2df(fjson)


business_df = _load_yelp("business")
checkin_df = _load_yelp("checkin")
tip_df = _load_yelp("tip")
user_df = _load_yelp("user")

In [23]:
# Show the dtype of every column in the user table.
user_df.dtypes

user_id                object
name                   object
review_count            int64
yelping_since          object
useful                  int64
funny                   int64
cool                    int64
elite                  object
friends                object
fans                    int64
average_stars         float64
compliment_hot          int64
compliment_more         int64
compliment_profile      int64
compliment_cute         int64
compliment_list         int64
compliment_note         int64
compliment_plain        int64
compliment_cool         int64
compliment_funny        int64
compliment_writer       int64
compliment_photos       int64
dtype: object

In [22]:
# List the column names of the user table.
user_df.columns

Index(['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny',
       'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot',
       'compliment_more', 'compliment_profile', 'compliment_cute',
       'compliment_list', 'compliment_note', 'compliment_plain',
       'compliment_cool', 'compliment_funny', 'compliment_writer',
       'compliment_photos'],
      dtype='object')

In [None]:
# Preview the user table without the "name" column; the result is only
# displayed, not assigned, so user_df itself is unchanged.
user_df.drop(columns=["name"])

In [21]:
# Preview the first rows of the user table.
user_df.head()

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,q_QQ5kBBwlCcbL1s4NVK3g,Jane,1220,2005-03-14 20:26:35,15038,10030,11291,200620072008200920102011201220132014,"xBDpTUbai0DXrvxCe3X16Q, 7GPNBO496aecrjJfW6UWtg...",1357,...,163,190,361,147,1212,5691,2541,2541,815,323
1,dIIKEfOgo0KqUfGQvGikPg,Gabi,2136,2007-08-10 19:01:51,21272,10289,18046,"2007,2008,2009,2010,2011,2012,2013,2014,2015,2...","XPzYf9_mwG2eXYP2BAGSTA, 2LooM5dcIk2o01nftYdPIg...",1025,...,87,94,232,96,1187,3293,2205,2205,472,294
2,D6ErcUnFALnCQN4b1W_TlA,Jason,119,2007-02-07 15:47:53,188,128,130,20102011,"GfB6sC4NJQvSI2ewbQrDNA, jhZtzZNNZJOU2YSZ6jPlXQ...",16,...,1,3,0,0,5,20,31,31,3,1
3,JnPIjvC0cmooNDfsa9BmXg,Kat,987,2009-02-09 16:14:29,7234,4722,4035,200920102011201220132014,"HQZPQhKMwRAyS6BCselVWQ, kP2U1s_sjQfHO9grxiyDTA...",420,...,129,93,219,90,1120,4510,1566,1566,391,326
4,37Hc8hr3cw0iHLoPzLK6Ow,Christine,495,2008-03-03 04:57:05,1577,727,1124,200920102011,"-Q88pZUcrfN0BLBDp-bkAQ, etPn4Pv1Gc4cRZjRgB_BOw...",47,...,19,32,16,15,77,131,310,310,98,44


----

In [5]:
def preprocess(df):
    """
    Cast the text columns to categoricals, rename the check-in date column
    and drop rows that have no categories.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "name", "address", "city", "state" and
        "categories"; a "date" column, if present, is renamed.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` is not modified.
    """
    category_columns = ("name", "address", "city", "state")
    # Cast first, then filter, so the categorical levels are taken from all rows.
    typed = df.astype({column: "category" for column in category_columns})
    renamed = typed.rename(columns={"date": "chkin_date"})
    return renamed.dropna(subset=["categories"])
# Left-join check-ins onto businesses (every business row is kept), then
# run the merged frame through preprocess().
bs_df = (
    business_df
    .merge(checkin_df, how="left", on="business_id")
    .pipe(preprocess)
)

In [6]:
def nlp_word(word):
    """
    Normalize a single word: trim surrounding whitespace, lowercase it,
    then stem it.

    Parameters
    ----------
    word : str

    Returns
    -------
    Whatever ``stem`` returns for the trimmed, lowercased word.
    """
    cleaned = word.strip().lower()
    return stem(cleaned)


def stem(word):
    """
    Return the Porter stem of the given word
    ex: running -> run
    """
    # A new PorterStemmer is created on every call.
    return PorterStemmer().stem(word)

# Count how often each category occurs across businesses. Each `categories`
# value is a comma-separated string; tokens are stripped so that the same
# category with and without surrounding whitespace (as produced by splitting
# "A, B" on ",") is counted once, and empty tokens are ignored.
cat_counter = collections.Counter(
    category.strip()
    for categories in bs_df["categories"]
    for category in categories.split(",")
    if category.strip()
)

# One row per category, most frequent first. The sort is chained rather than
# done in place, so re-running the cell rebuilds the frame from scratch.
word_cnt_df = (
    pd.DataFrame.from_dict(cat_counter, orient="index")
      .rename(columns={0: "num_occ"})
      .sort_values("num_occ", ascending=False)
)

In [None]:
# Bar chart of category frequencies: one bar per row of word_cnt_df,
# with the bar height taken from the num_occ column.
category_bars = go.Bar(x=word_cnt_df.index, y=word_cnt_df["num_occ"])

fig = go.Figure(data=[category_bars])
fig.update_layout(
    title="<b>Category word count</b>",
    xaxis_title="category",
    yaxis_title="# of occurence",
)
fig.show()

# EDA