In [1]:
import numpy as np
import pandas as pd
import MeCab
import sys
from collections import Counter
import re
import urllib.request, urllib.error

In [2]:
# Module-level MeCab tagger; tokenize_ja() reads it through parseToNode().
tagger = MeCab.Tagger("-Owakati")
# Parse an empty string once right after construction.
# NOTE(review): presumably the usual mecab-python3 warm-up that keeps
# parseToNode() from returning corrupted surfaces — confirm before removing.
tagger.parse("")

'\n'

In [3]:
# Lunch review CSVs, one per store, addressed relative to the parent directory.
# Every path follows the pattern "../pretest_<store>_lunch.csv".
lunch_file = [
    "../pretest_{}_lunch.csv".format(shop)
    for shop in (
        "sushitsuu", "sushiichi", "sawada", "mitani",
        "matsukan", "kiyoda", "imamura", "hatsune",
    )
]

In [4]:
# Dinner review CSVs, one per store, addressed relative to the parent directory.
# Every path follows the pattern "../pretest_<store>_dinner.csv".
dinner_file = [
    "../pretest_{}_dinner.csv".format(shop)
    for shop in (
        "sushitsuu", "sushiichi", "sawada", "mitani", "matsukan",
        "kiyoda", "imamura", "hatsune", "aozora",
    )
]

In [5]:
# Store identifiers. The CSV-export cells use store_name[i] as the filename
# fragment for the i-th frequency result, so the order here must match the
# order of the stores in lunch_file / dinner_file.
store_name = [
    "sushitsuu", "sushiichi", "sawada",
    "mitani", "matsukan", "kiyoda",
    "imamura", "hatsune", "aozora",
]

In [6]:
# Base Japanese stop-word list published by SlothLib; the file is read as one
# UTF-8 encoded word per line.
url = 'http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt'
# The context manager closes the HTTP response even if decoding fails, and the
# timeout keeps the cell from hanging forever when the server does not answer.
with urllib.request.urlopen(url=url, timeout=30) as slothlib_file:
    stop_words = [line.decode("utf-8").strip() for line in slothlib_file]
# Drop the blank lines contained in the downloaded file.
stop_words = [sw for sw in stop_words if sw]
# Project-specific stop words added on top of the SlothLib list.
my_stop_words = ["℃/", "℃", "kg", "訪問", "寿司", "すし", "なく", "美味しい", "美味",
                 "～」", "○－", "○○/", ")○/", "握り", "シャリ", "ネタ", "予約",
                "ない", "良い", "料理", "三谷", "最高", "好き", "好み", "最後", "美味しかっ", "さわ",
                "店内", "最初", "一番", "良く", "追加", "良かっ", "全て", "東京", "通り", "なし",
                "美味しく", "今日", "澤田", "非常", "スタート", "本日", "かん", "無い", "よい", "週間",
                "お願い", "～。", "素晴らしく", "美味し", "おいしかっ", "日間", "場所", "美味い", "説明", "次回",
                "評価", "内容", "レベル", "○/", "://", "素晴らしい", "印象", "貫目", "初音", "注文",
                "大好き", "空間", "状態", "よく", "瞬間", "かなり", "com", '）"', ",\u3000", "\u3000-",
                "◎/", "◎◎/", "）⚪", ")、", "^)", "(^", "cp", "交互", "ボタン", "名刺",
                "嬉しい", "店員", "オーダー", "インター", "ピンポン", "丁目", "全力", "入り口", "入口", "フォン",
                "馬糞", "すき", "やばし", "マイクロ", "冷蔵庫", "毎回", "まつ", "コス", "基本", "よかっ",
                "ケース", "予想", "気分", "悪い", "きよ", "代目", "友人", "部位", "コレ", "旨かっ",
                "その他", "コト", "良し", "まつ", "むら", "いま", "昼間", "近所", "キロ", "代わり",
                "相手", "長い", "途中", "モノ", "狭く", "難点", "方々", "近辺", "番手", "ーー",
                '""', "！⚫", "青空", "ない", "いい"]
# Merge both sources into one list. dict.fromkeys() removes repeated entries
# (a few words occur more than once above) while keeping first-seen order;
# stop_words stays a list so existing list-based uses keep working.
stop_words = list(dict.fromkeys(stop_words + my_stop_words))

In [7]:
def normalize_number(text):
    """Return ``text`` with every run of consecutive digits replaced by a single "0"."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('0', text)

In [8]:
def tokenize_ja(text, lower):
    """Yield the noun and adjective surfaces of ``text`` that are not stop words.

    Parameters
    ----------
    text : str
        Raw Japanese text. Digit runs are collapsed to "0" before parsing.
    lower : bool
        When true, each surface is lower-cased before the stop-word check and
        before being yielded. When false, surfaces keep their original case.

    Yields
    ------
    str
        Surface form of each MeCab node whose first feature field is
        "名詞" (noun) or "形容詞" (adjective) and which is not in ``stop_words``.
    """
    # Reuse the notebook's digit normalisation instead of repeating the regex.
    text = normalize_number(text)
    # Snapshot the stop words as a set: membership is checked once per node.
    blocked = frozenset(stop_words)
    # Parts of speech to keep, compared with the first MeCab feature field.
    wanted_pos = ("名詞", "形容詞")
    node = tagger.parseToNode(text)
    while node:
        # `lower` only controls case folding; it must not decide whether
        # tokens are emitted at all.
        surface = node.surface.lower() if lower else node.surface
        if node.feature.split(',')[0] in wanted_pos and surface not in blocked:
            yield surface
        node = node.next

In [9]:
def tokenize(content, token_min_len, token_max_len, lower):
    """Tokenize ``content`` with tokenize_ja() and filter the resulting tokens.

    A token is kept when its length lies in the inclusive range
    [token_min_len, token_max_len] and it does not start with an underscore.
    Returns the kept tokens as a list of str, in the order they were produced.
    """
    kept = []
    for token in tokenize_ja(content, lower):
        if token.startswith('_'):
            continue
        if token_min_len <= len(token) <= token_max_len:
            kept.append(str(token))
    return kept

In [10]:
def frequent_lunch(path, top_n=30, column="lunch_review"):
    """Return the most frequent tokens found in the lunch reviews of one CSV.

    Parameters
    ----------
    path : str
        CSV file readable by ``pd.read_csv``; it must contain ``column``.
    top_n : int, optional
        Maximum number of distinct tokens to return (default 30).
    column : str, optional
        Name of the column holding the review texts (default "lunch_review").

    Returns
    -------
    tuple of (list of str, list of int)
        Two parallel lists, the tokens and their occurrence counts, ordered
        from most to least frequent.
    """
    df = pd.read_csv(path)
    # Empty CSV cells are read as NaN (a float), which the regex-based
    # tokenizer cannot process, so they are skipped.
    reviews = df[column].dropna().astype(str)
    counter = Counter()
    for review in reviews:
        # Keep tokens of at least 2 characters; 10000 acts as "no upper bound".
        counter.update(tokenize(review, 2, 10000, True))
    top = counter.most_common(top_n)
    f_words = [word for word, _ in top]
    f_words_count = [count for _, count in top]
    return f_words, f_words_count

In [11]:
def frequent_dinner(path, top_n=30, column="dinner_review"):
    """Return the most frequent tokens found in the dinner reviews of one CSV.

    Parameters
    ----------
    path : str
        CSV file readable by ``pd.read_csv``; it must contain ``column``.
    top_n : int, optional
        Maximum number of distinct tokens to return (default 30).
    column : str, optional
        Name of the column holding the review texts (default "dinner_review").

    Returns
    -------
    tuple of (list of str, list of int)
        Two parallel lists, the tokens and their occurrence counts, ordered
        from most to least frequent.
    """
    df = pd.read_csv(path)
    # Empty CSV cells are read as NaN (a float), which the regex-based
    # tokenizer cannot process, so they are skipped.
    reviews = df[column].dropna().astype(str)
    counter = Counter()
    for review in reviews:
        # Keep tokens of at least 2 characters; 10000 acts as "no upper bound".
        counter.update(tokenize(review, 2, 10000, True))
    top = counter.most_common(top_n)
    f_words = [word for word, _ in top]
    f_words_count = [count for _, count in top]
    return f_words, f_words_count

In [12]:
# Run the frequency extraction once per lunch CSV. Position k of both result
# lists corresponds to lunch_file[k].
lunch_results = [frequent_lunch(csv_path) for csv_path in lunch_file]
many_words_lunch = [words for words, _ in lunch_results]
many_count_lunch = [counts for _, counts in lunch_results]

In [14]:
# Show the per-file count lists (one inner list per entry of lunch_file).
many_count_lunch

[[175,
  139,
  111,
  81,
  76,
  75,
  68,
  55,
  54,
  50,
  49,
  44,
  42,
  42,
  41,
  40,
  39,
  37,
  37,
  36,
  36,
  34,
  34,
  34,
  33,
  32,
  32,
  31,
  31,
  29],
 [52,
  47,
  46,
  30,
  26,
  23,
  21,
  21,
  20,
  19,
  17,
  17,
  17,
  15,
  14,
  13,
  13,
  13,
  13,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  11,
  11,
  11],
 [168,
  70,
  65,
  63,
  57,
  53,
  50,
  49,
  47,
  45,
  40,
  40,
  40,
  37,
  37,
  37,
  35,
  33,
  33,
  32,
  32,
  32,
  30,
  29,
  28,
  27,
  27,
  27,
  27,
  27],
 [69,
  56,
  53,
  50,
  47,
  47,
  43,
  40,
  36,
  35,
  35,
  35,
  33,
  32,
  31,
  30,
  29,
  26,
  26,
  26,
  25,
  24,
  24,
  23,
  22,
  22,
  22,
  22,
  22,
  21],
 [7,
  7,
  6,
  5,
  4,
  4,
  4,
  4,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2],
 [39,
  32,
  21,
  17,
  14,
  11,
  11,
  9,
  9,
  9,
  9,
  8,
  8,
  8,
  7,
  7,
  7,
  7,
  6,
  6,
  6,
  6,
  6,

In [64]:
# Run the frequency extraction once per dinner CSV. Position k of both result
# lists corresponds to dinner_file[k].
dinner_results = [frequent_dinner(csv_path) for csv_path in dinner_file]
many_words_dinner = [words for words, _ in dinner_results]
many_count_dinner = [counts for _, counts in dinner_results]

In [16]:
# Show the per-file word lists (one entry per item of dinner_file).
many_words_dinner

[(['大将',
   '熟成',
   'トロ',
   '包丁',
   'カウンター',
   '海老',
   '西麻布',
   '昆布',
   'ランチ',
   '旨味',
   '雲丹',
   '仕事',
   '六本木',
   '満足',
   'マグロ',
   'ウニ',
   '金目',
   '個室',
   '提供',
   '漬け',
   '感動',
   '穴子',
   '値段',
   '種類',
   '隠し',
   '江戸前',
   '雰囲気',
   '一品',
   '写真',
   'コース'],
  [176,
   160,
   84,
   83,
   68,
   60,
   43,
   43,
   42,
   41,
   40,
   39,
   36,
   33,
   32,
   31,
   31,
   30,
   29,
   28,
   28,
   27,
   27,
   26,
   26,
   26,
   26,
   26,
   25,
   25]),
 (['銀座',
   'カウンター',
   'ワイン',
   'コース',
   '雰囲気',
   'ランチ',
   '玉子',
   'インターホン',
   'ミシュラン',
   '日本',
   '雲丹',
   'トロ',
   '大将',
   '職人',
   '海胆',
   'マグロ',
   'うに',
   'デザート',
   'ソムリエ',
   '種類',
   '普通',
   '築地',
   '穴子',
   '女性',
   'シャネル',
   '車海老',
   '隠れ家',
   '階段',
   '先付け',
   '素敵'],
  [48,
   27,
   24,
   21,
   16,
   16,
   15,
   14,
   14,
   14,
   14,
   13,
   13,
   12,
   12,
   11,
   11,
   11,
   11,
   10,
   10,
   10,
   10,
   10,
   9,
   9,
   9,
   9,
   9,
   8]),
 (['

In [37]:
# Pair every frequent word of the first lunch file with its count, producing
# one [word, count] row per word.
a = [
    [word, count]
    for word, count in zip(many_words_lunch[0], many_count_lunch[0])
]

In [38]:
# Tabulate the [word, count] rows; the column labels read
# "frequent words TOP30" and "occurrence count".
df = pd.DataFrame(a, columns = ["頻出単語TOP30", "出現回数"])

In [39]:
# Display the word/count table.
df

Unnamed: 0,頻出単語TOP30,出現回数
0,熟成,175
1,ランチ,139
2,大将,111
3,海老,81
4,トロ,76
5,カウンター,75
6,包丁,68
7,味噌汁,55
8,昆布,54
9,旨味,50


In [68]:
# Write one frequency CSV per lunch result. store_name[idx] supplies the
# filename fragment for the result at the same position.
for idx, (words, counts) in enumerate(zip(many_words_lunch, many_count_lunch)):
    rows = [[word, count] for word, count in zip(words, counts)]
    df = pd.DataFrame(rows, columns = ["頻出単語", "出現回数"])
    df.to_csv("pre_round2_frequency_"+store_name[idx] + "_lunch.csv", index=False)

In [71]:
# Write one frequency CSV per dinner result. store_name[idx] supplies the
# filename fragment for the result at the same position.
for idx, (words, counts) in enumerate(zip(many_words_dinner, many_count_dinner)):
    rows = [[word, count] for word, count in zip(words, counts)]
    df = pd.DataFrame(rows, columns = ["頻出単語", "出現回数"])
    df.to_csv("pre_round2_frequency_"+store_name[idx] + "_dinner.csv", index=False)

In [69]:
# Load one of the exported frequency CSVs (sushitsuu, lunch) back into a DataFrame.
df = pd.read_csv("pre_round2_frequency_sushitsuu_lunch.csv")

In [70]:
# Display the frame that was just read back.
df

Unnamed: 0,頻出単語TOP30,出現回数
0,熟成,175
1,ランチ,139
2,大将,111
3,海老,81
4,トロ,76
5,カウンター,75
6,包丁,68
7,味噌汁,55
8,昆布,54
9,旨味,50


In [72]:
# Load the review CSV listed first in lunch_file.
df = pd.read_csv(lunch_file[0])

In [77]:
# Show the raw text of the review stored under row label 7.
df.loc[7, "lunch_review"]

'六本木通りから1本路地を入る。ビルの1階だが、通りから階段で少し低いところに降りていく、そのアプローチとエントランスが品格があり、良い店に来たという感覚になる。店内は明るく清潔で気持ちが良い。職人さんとは別に、きちんとした格好をした接客担当の方がいて、すごくしっかりとした接客をする。ここも感じが良い。づけさっぱりとしたづけ車海老良い塩梅で湯引きしてあり、ぷりぷりとした食感が楽しめる。あら秋刀魚すずき黒むつゴマサバフォトジェニックな握り。笑太刀魚炙りで。香ばしい。塩気が少し強いが脂がのっていて美味しい太刀魚だった。あなご上品なツメの穴子とろたく海苔が巻かないで出てくる。自分で海苔を巻く。海苔のパリパリ感が際立つ。せっかくだから海苔を巻かないで食べてみても良かった。うに臭みのない上質なウニ。海苔は自分で巻くのだが、せっかく海苔を巻かずに握りの形で出てきているのだから海苔を巻かないで食べてみた。海苔は海苔で食べたのだが、これ自体も上質であることは間違いない。最後はカステラのように焼いた玉子焼き。しっとりなめらか。お店のレベルの高さを感じる。テーブル席で、サイトウさんという方が担当だったが、上手だった。ランチであれば、大将の前のカウンターも座れる。ネタはもちろん良い。シャリも良い。シャリはかなり小さめ。酢は弱く、上品な甘みがある。上質な鮨が11貫茶わん蒸しも、みそ汁もついて、ランチとはいえ、これで5700円は本当に出色。しまだ鮨の鮨ランチも素晴らしいが、ここはそれを超える。'