In [1]:
import numpy as np
import pandas as pd
import pickle
import MeCab
import sys
from collections import Counter, defaultdict
import re
import urllib.request, urllib.error

In [2]:
tagger = MeCab.Tagger("-Owakati")
tagger.parse("")

'\n'

In [3]:
lunch_file = ["../pretest_sushitsuu_lunch.csv",
              "../pretest_sushiichi_lunch.csv",
              "../pretest_sawada_lunch.csv",
              "../pretest_mitani_lunch.csv",
              "../pretest_matsukan_lunch.csv",
              "../pretest_kiyoda_lunch.csv",
              "../pretest_imamura_lunch.csv",
              "../pretest_hatsune_lunch.csv"
              ]

In [4]:
dinner_file = ["../pretest_sushitsuu_dinner.csv",
              "../pretest_sushiichi_dinner.csv",
              "../pretest_sawada_dinner.csv",
              "../pretest_mitani_dinner.csv",
              "../pretest_matsukan_dinner.csv",
              "../pretest_kiyoda_dinner.csv",
              "../pretest_imamura_dinner.csv",
              "../pretest_hatsune_dinner.csv",
              "../pretest_aozora_dinner.csv"
              ]

In [5]:
store_name = ["sushitsuu",
             "sushiichi",
             "sawada",
             "mitani",
             "matsukan",
             "kiyoda",
             "imamura",
             "hatsune",
             "aozora",
             ]

In [6]:
url = 'http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt'
slothlib_file = urllib.request.urlopen(url=url)
stop_words = [line.decode("utf-8").strip() for line in slothlib_file]
stop_words = [sw for sw in stop_words if not sw==u'']
my_stop_words = ["℃/", "℃", "kg", "訪問", "寿司", "すし", "なく", "美味しい", "美味",
                 "～」", "○－", "○○/", ")○/", "握り", "シャリ", "ネタ", "予約",
                "ない", "良い", "料理", "三谷", "最高", "好き", "好み", "最後", "美味しかっ", "さわ",
                "店内", "最初", "一番", "良く", "追加", "良かっ", "全て", "東京", "通り", "なし",
                "美味しく", "今日", "澤田", "非常", "スタート", "本日", "かん", "無い", "よい", "週間",
                "お願い", "～。", "素晴らしく", "美味し", "おいしかっ", "日間", "場所", "美味い", "説明", "次回",
                "評価", "内容", "レベル", "○/", "://", "素晴らしい", "印象", "貫目", "初音", "注文",
                "大好き", "空間", "状態", "よく", "瞬間", "かなり", "com", '）"', ",\u3000", "\u3000-",
                "◎/", "◎◎/", "）⚪", ")、", "^)", "(^", "cp", "交互", "ボタン", "名刺",
                "嬉しい", "店員", "オーダー", "インター", "ピンポン", "丁目", "全力", "入り口", "入口", "フォン",
                "馬糞", "すき", "やばし", "マイクロ", "冷蔵庫", "毎回", "まつ", "コス", "基本", "よかっ",
                "ケース", "予想", "気分", "悪い", "きよ", "代目", "友人", "部位", "コレ", "旨かっ",
                "その他", "コト", "良し", "まつ", "むら", "いま", "昼間", "近所", "キロ", "代わり",
                "相手", "長い", "途中", "モノ", "狭く", "難点", "方々", "近辺", "番手", "ーー",
                '""', "！⚫", "青空", "ない", "", "", "", "", "", "",
                "", "", "", "", "", "", "", "", "", "",
                "", "", "", "", "", "", "", "", "", "",
                "", "", "", "", "", "", "", "", "", "",]
stop_words += my_stop_words 

In [7]:
def normalize_number(text):
    # 連続した数字を0で置換
    replaced_text = re.sub(r'\d+', '0', text)
    return replaced_text

In [16]:
def tokenize_ja(text, lower):
    text = replaced_text = re.sub(r"\d+", "0", text)
    node = tagger.parseToNode(str(text))
    while node:
        surface = node.surface.lower()
        if lower and node.feature.split(',')[0] in ["名詞"] and surface not in stop_words:
            #分かち書きで取得する品詞を指定
            yield surface
            #lowerで小文字に変換 正規化
        node = node.next

In [17]:
def tokenize(content, token_min_len, token_max_len, lower):
    return [
        str(token) for token in tokenize_ja(content, lower)
        if token_min_len <= len(token) <= token_max_len and not token.startswith('_')
    ]

In [18]:
def frequent_lunch(path):
    f_words_list = []
    #f_words = []
    df = pd.read_csv(path)
    for i in df["lunch_review"]:
        txt = tokenize(i, 2, 10000, True)
        for t in txt:
            f_words_list.append(t)
    #counter = Counter(f_words_list)
    #for word, count in counter.most_common(30):
        #f_words.append(word + " " + str(count))
    return f_words_list
#店のレビューの単語を全てf_word_listにいれる

In [19]:
def frequent_dinner(path):
    f_words_list = []
    #f_words = []
    df = pd.read_csv(path)
    for i in df["dinner_review"]:
        txt = tokenize(i, 2, 10000, True)
        for t in txt:
            f_words_list.append(t)
    #counter = Counter(f_words_list)
    #for word, count in counter.most_common(30):
        #f_words.append(word + " " + str(count))
    return f_words_list
#店のレビューの単語を全てf_word_listにいれる

In [20]:
lunch_words = []
for path in lunch_file:
    lunch_words.extend(frequent_lunch(path))

In [21]:
counter_lunch = Counter(lunch_words)
lunch_character_dict = defaultdict(float)
len_word = len(lunch_words)
for word, count in counter_lunch.most_common(1000):
    lunch_character_dict[word] = float(count/len_word)

In [22]:
def lunch_character(path):
    f_words_list = [] #その店の全単語をいれる
    character_words = [] #その店の特徴的な単語をいれる
    df = pd.read_csv(path)
    for i in df["lunch_review"]:
        txt = tokenize(i, 2, 10000, True)
        for t in txt:
            f_words_list.append(t)
    len_word = len(f_words_list)
    counter = Counter(f_words_list)
    for word, count in counter.most_common(100):
        if lunch_character_dict[word] != None:
            if float(count/len_word) >= 3 * lunch_character_dict[word]:
                character_words.append(word)
    return character_words

In [23]:
for path in lunch_file:
    print(lunch_character(path))

['味噌汁', '六本木', 'ばら', 'バラ', '定食', '個室', '西麻布', '焼き魚', 'ズワイガニ', '地下', '研究', '繊維', '小鉢', '事前', 'ほた']
['銀座', 'コース', 'デザート', '職人', 'インターホン', 'アイス', '女性', 'ミシュラン', '板前', '芙蓉', '先付け', '刺身', 'シャーベット', '築地', 'スタッフ', 'コロッケ', '和食', '真ん中', 'ソムリエ', '路地', '""', 'ワインセラー', '石橋', 'セリ', '料金']
['軍艦', '印籠', 'トマト', '唐津', '鬼灯', '蛇腹', '詰め', 'ーー', '炭火', '禁止', '特大', 'ほおずき']
['ワイン', '白子', 'ご飯', 'キャビア', 'スープ', 'シャンパン', 'マリアージュ', 'ソース', 'ステーキ', 'カラス', '銚子', '来年', 'からすみ', '揚げ', '吟醸', '四ツ谷', '待ち', 'ハマグリ', 'シャンパーニュ', '組み合わせ', 'シャコ', '半年', '紀尾井町']
['麻布', '雰囲気', '巻物', 'イクラ', '外国', 'ホテル', '元気', '吉祥寺', 'お世話', '満席', '大衆', 'リーズナブル', '小柱', '江戸', '味噌汁', '特徴', '紹介', '今度', '麻布十番', '久しぶり', 'ウマ', 'グッド', '良心', 'テーブル', 'にぎやか', '食事', '系列', '板前', '共通', '普通', '充実', 'コストパフォーマンス', '小ぶり', '気持ち', '鉄火', 'ぶどう', 'ごちそうさま', '午前', 'ドイツ', '帰国', '連れ', '営業', 'タイミング', '存在', '気持', '体験', '本気', '価値', '会計', '当たり', '...', 'たっぷり', 'ズワイガニ', 'ソテー', 'カキ', 'さより', '絶賛', '七味', 'タレ', '全員', 'グランドハイアット', '恵比寿', 'ウェスティン', '近隣', '国際', '支持', '納得', '威勢']
['マグロ', '赤

In [28]:
dinner_words = []
for path in dinner_file:
    dinner_words.extend(frequent_dinner(path))

In [25]:
counter_dinner = Counter(dinner_words)
dinner_character_dict = defaultdict(float)
len_word = len(dinner_words)
for word, count in counter_dinner.most_common(1000):
    dinner_character_dict[word] = float(count/len_word)

In [26]:
def dinner_character(path):
    f_words_list = []
    character_words = []
    df = pd.read_csv(path)
    for i in df["dinner_review"]:
        txt = tokenize(i, 2, 10000, True)
        for t in txt:
            f_words_list.append(t)
    len_word = len(f_words_list)
    counter = Counter(f_words_list)
    for word, count in counter.most_common(100):
        if dinner_character_dict[word] != None:
            if float(count/len_word) >= 4 * dinner_character_dict[word]:
                character_words.append(word)
    return character_words

In [27]:
for path in dinner_file:
    print(dinner_character(path))

['包丁', '西麻布', 'ランチ', '六本木', '個室', '隠し', '研究', '味噌汁', '地下', '塩焼き', '変態', 'テーブル', '繊維', '藤永', 'バラ']
['銀座', 'ワイン', 'ランチ', 'インターホン', '海胆', 'ソムリエ', '築地', '女性', 'シャネル', '隠れ家', '階段', '先付け', '燻製', '焼き物', '石橋', 'セリ', 'ウズラ', 'あなご', 'ワインセラー', '牡丹', 'オススメ', '板前']
['海胆', '印籠', 'トマト', '唐津', '蝦蛄', '鬼灯', '詰め', '半生', '禁止']
['ワイン', 'マリアージュ', 'スープ', '四谷', 'シャンパン', 'キャビア', '待ち', '紀尾井町', 'ソース', 'リング', '四ツ谷', 'ペア', '銚子', '組み合わせ', 'イワシ', 'シャンパーニュ', 'ステーキ', '創作', '半年', '独創']
['値段', '麻布', '麻布十番', 'ビール', '活気', '満席', 'テーブル', 'メニュー', '冷酒', 'コストパフォーマンス', 'アジ', '客層', '外国', '家族', '揚げ', '熱燗', 'クオリティ', 'ドリンク', '板前', '居酒屋', 'お世話', '金額', 'アナゴ', '充実', 'カニ', 'たま', '居心地', '気軽', '磯辺', 'リーズナブル', 'メイン', '連れ', '瓶ビール', '接待', 'ｶｳﾝﾀｰ', '時代', 'つめ', '通常', '地元', '人柄', '立地', '出口', '勘八', '昭和', '創業']
['まぐろ', '極上', '新津', '真子', 'ツマミ', '新子', '爽やか', '紹介', '伝説', '木村', 'ワタ', '白洲', '潮汁', '投稿', '敷居', 'あわび', '時代', 'とり', 'かっぱ', '閖上', 'カレイ']
['白金', '茶碗蒸し', '高輪', '太刀魚', 'アイスクリーム', 'アイス', 'かつお', '歯応え', '北里大学', '病院', '美人', '小瓶', '北里']
[]
['青空', '移転'

In [29]:
for i, path in enumerate(lunch_file):
    df = pd.DataFrame(lunch_character(path))
    df.to_csv("pre_analysis_"+ store_name[i] + "_lunch.csv")

In [30]:
for i, path in enumerate(dinner_file):
    df = pd.DataFrame(dinner_character(path))
    df.to_csv("pre_analysis_"+ store_name[i] + "_dinner.csv")

In [32]:
df = pd.read_csv("pre_analysis_sushiichi_dinner.csv")
df

Unnamed: 0.1,Unnamed: 0,0
0,0,銀座
1,1,ワイン
2,2,ランチ
3,3,インターホン
4,4,海胆
5,5,ソムリエ
6,6,築地
7,7,女性
8,8,シャネル
9,9,隠れ家
