In [118]:
import numpy as np
import pandas as pd
import MeCab
import sys
from collections import Counter
import re
import urllib.request, urllib.error

In [119]:
tagger = MeCab.Tagger("-Owakati")
tagger.parse("")

'\n'

In [120]:
lunch_file = ["../pretest_sushitsuu_lunch.csv",
              "../pretest_sushiichi_lunch.csv",
              "../pretest_sawada_lunch.csv",
              "../pretest_mitani_lunch.csv",
              "../pretest_matsukan_lunch.csv",
              "../pretest_kiyoda_lunch.csv",
              "../pretest_imamura_lunch.csv",
              "../pretest_hatsune_lunch.csv"
              ]

In [121]:
dinner_file = ["../pretest_sushitsuu_dinner.csv",
              "../pretest_sushiichi_dinner.csv",
              "../pretest_sawada_dinner.csv",
              "../pretest_mitani_dinner.csv",
              "../pretest_matsukan_dinner.csv",
              "../pretest_kiyoda_dinner.csv",
              "../pretest_imamura_dinner.csv",
              "../pretest_hatsune_dinner.csv",
              "../pretest_aozora_dinner.csv"
              ]

In [122]:
store_name = ["sushitsuu",
             "sushiichi",
             "sawada",
             "mitani",
             "matsukan",
             "kiyoda",
             "imamura",
             "hatsune",
             "aozora",
             ]

In [123]:
stop_words = [ "○－", "○○/", ")○/",
                 "～」", "○－", "○○/", ")○/", "～。", "○/", "://", "com", '）"', ",\u3000", "\u3000-",
                "◎/", "◎◎/", "）⚪", ")、", "^)", "(^", "cp", "ーー",
                '""', "！⚫",]

In [196]:
def normalize_number(text):
    # 連続した数字を0で置換
    replaced_text = re.sub(r'\d+', '0', text)
    return replaced_text

In [215]:
def tokenize_ja(text, lower):
    #text = replaced_text = re.sub(r"\d+", "0", text)
    node = tagger.parseToNode(str(text))
    while node:
        surface = node.surface
        if lower and node.feature.split(',')[0] in ["名詞","形容詞", "動詞", "副詞", "助動詞", "接続詞", "連体詞", "感動詞", "接頭詞"] and surface not in stop_words:
            #分かち書きで取得する品詞を指定
            yield surface
            #lowerで小文字に変換 正規化
        node = node.next

In [198]:
text = "今日はいい天気です。"
tokenize_ja(text, True)

<generator object tokenize_ja at 0x115c2bde0>

In [216]:
def tokenize(content, token_min_len, token_max_len, lower):
    return [
        str(token) for token in tokenize_ja(content, lower)
        if token_min_len <= len(token) <= token_max_len and not token.startswith('_')
    ]

In [217]:
def frequent_lunch(path):
    f_words_list = []
    f_words = []
    f_words_count = []
    review_list = [] #レビューごとに全単語を収納
    df = pd.read_csv(path)
    for i in df["lunch_review"]:
        txt = tokenize(i, 0, 10000, True) #全単語を収納したリスト
        review_list.append(txt)
        for t in txt:
            f_words_list.append(t)
    counter = Counter(f_words_list)
    for word, count in counter.most_common(20000):
        f_words.append(word)
        f_words_count.append(count)
    return f_words, f_words_count, len(f_words_list), review_list

In [218]:
def frequent_dinner(path):
    f_words_list = []
    f_words = []
    f_words_count = []
    review_list = []
    df = pd.read_csv(path)
    for i in df["dinner_review"]:
        txt = tokenize(i, 0, 10000, True)
        review_list.append(txt)
        for t in txt:
            f_words_list.append(t)
    counter = Counter(f_words_list)
    for word, count in counter.most_common(20000):
        f_words.append(word)
        f_words_count.append(count)
    return f_words, f_words_count, len(f_words_list), review_list

In [219]:
many_words_lunch = []
many_count_lunch = []
word_times_lunch = [] #レビューの全単語数
review_list_lunch = [] #各レビューの全単語
for path in lunch_file:
    many_word, word_count, word_times, review_list = frequent_lunch(path)
    many_words_lunch.append(many_word)
    many_count_lunch.append(word_count)
    word_times_lunch.append(word_times)
    review_list_lunch.append(review_list)

In [220]:
many_words_dinner = []
many_count_dinner = []
word_times_dinner = [] #レビューの全単語数
review_list_dinner = []
for path in dinner_file:
    many_word, word_count, word_times, review_list = frequent_dinner(path)
    many_words_dinner.append(many_word)
    many_count_dinner.append(word_count)
    word_times_dinner.append(word_times)
    review_list_dinner.append(review_list)

In [175]:
len(review_list_lunch[0])

113

In [204]:
for i in range(len(many_words_lunch)):
    a = []
    for j in range(len(many_words_lunch[i])):
        s = 0 #many_words_lunch[i][j]が含まれるレビューの数
        for review in review_list_lunch[i]:
            if many_words_lunch[i][j] in review:
                s += 1
        
        a.append([many_words_lunch[i][j], many_count_lunch[i][j], 100 * (many_count_lunch[i][j] / word_times_lunch[i]), 
                  100 * (s / (len(review_list_lunch[i]) + 1)), many_count_lunch[i][j] / (len(review_list_lunch[i]) + 1)])
    df = pd.DataFrame(a, columns = ["頻出単語", "登場回数", "全単語(" + str(word_times_lunch[i]) + ")%"
                                    , "登場レビュー率 /(" +  str(len(review_list_lunch[i])) + ")本", "レビューあたりの出現回数 /回"])
    df.to_csv(store_name[i] + "_lunch.csv", index=False)

In [242]:
for i in range(len(many_words_lunch)):
    a = []
    for j in range(len(many_words_lunch[i])):
        s = 0 #many_words_lunch[i][j]が含まれるレビューの数
        for review in review_list_lunch[i]:
            if many_words_lunch[i][j] in review:
                s += 1
        
        a.append([many_words_lunch[i][j], many_count_lunch[i][j], int(10000 * (many_count_lunch[i][j] / word_times_lunch[i])) / 100, 
                  int(10000 * (s / (len(review_list_lunch[i]) + 1))) / 100, int(100 * many_count_lunch[i][j] / (len(review_list_lunch[i]) + 1)) / 100])
    df = pd.DataFrame(a, columns = ["頻出単語", "登場回数", "全単語(" + str(word_times_lunch[i]) + ")%"
                                    , "登場レビュー率 /(" +  str(len(review_list_lunch[i])) + "本)", "レビューあたりの出現回数 /回"])
    df.to_csv(store_name[i] + "_lunch.csv", index=False)

In [239]:
for i in range(len(many_words_dinner)):
    a = []
    for j in range(len(many_words_dinner[i])):
        s = 0 #many_words_dinner[i][j]が含まれるレビューの数
        for review in review_list_dinner[i]:
            if many_words_dinner[i][j] in review:
                s += 1
        
        a.append([many_words_dinner[i][j], many_count_dinner[i][j], int(10000 * (many_count_dinner[i][j] / word_times_dinner[i])) / 100, 
                  int(10000 * (s / (len(review_list_dinner[i]) + 1))) / 100, int(100 * many_count_dinner[i][j] / (len(review_list_dinner[i]) + 1)) / 100])
    df = pd.DataFrame(a, columns = ["頻出単語", "登場回数", "全単語(" + str(word_times_dinner[i]) + ")%"
                                    , "登場レビュー率 /(" +  str(len(review_list_dinner[i])) + "本)", "レビューあたりの出現回数 /回"])
    df.to_csv(store_name[i] + "_dinner.csv", index=False)

In [243]:
df = pd.read_csv("imamura_lunch.csv")

In [244]:
df

Unnamed: 0,頻出単語,登場回数,全単語(16554)%,登場レビュー率 /(54本),レビューあたりの出現回数 /回
0,た,593,3.58,92.72,10.78
1,です,270,1.63,76.36,4.90
2,し,261,1.57,92.72,4.74
3,お,206,1.24,87.27,3.74
4,な,196,1.18,83.63,3.56
5,-,196,1.18,10.90,3.56
6,ます,181,1.09,69.09,3.29
7,ない,149,0.90,76.36,2.70
8,の,146,0.88,69.09,2.65
9,店,136,0.82,87.27,2.47


In [117]:
review_list_lunch[0][5]

['六本木',
 'ある',
 '有名',
 '寿司',
 'すし',
 '高級',
 'です',
 '実は',
 'ランチ',
 'なら',
 '行け',
 'そう',
 'という',
 'こと',
 '有志',
 '募っ',
 '予約',
 '行く',
 'こと',
 'まし',
 '落ち着い',
 '佇まい',
 '予約',
 'かかわら',
 'ちょっと',
 '入り',
 'づらい',
 '今回',
 'カウンター',
 '真ん中',
 'でし',
 'とても',
 'キレイ',
 '店内',
 'です',
 '軽く',
 '緊張',
 'ます',
 'ランチ',
 '種類',
 '予約',
 'とき',
 'もう',
 '伝え',
 'あり',
 'ます',
 'にぎり',
 'おまかせ',
 'コース',
 '蓋物',
 '付き',
 'にぎり',
 'おまかせ',
 'コース',
 '蓋物',
 '付き',
 'シャンパン',
 'ワイン',
 'メニュー',
 'いっぱい',
 'あり',
 'ます',
 'ランチ',
 'ので',
 '自重',
 '今回',
 'おまかせ',
 'コース',
 '選択',
 'さて',
 'コース',
 '始まり',
 'です',
 'マグロ',
 '赤身',
 'マグロ',
 '！#',
 '本木',
 '寿司',
 'すし',
 'ランチ',
 '東京',
 'マグロ',
 '西麻布',
 'hangover',
 'mealhttps',
 'eat',
 'hangover',
 'work',
 '最初',
 'マグロ',
 '赤身',
 '使っ',
 '寿司',
 'って',
 '食べ',
 'こと',
 'なかっ',
 'です',
 'まろやか',
 'コク',
 '旨味',
 'あり',
 'ます',
 'そして',
 '想像',
 '以上',
 'ふわっと',
 'シャリ',
 '本当に',
 '入れる',
 'ほぐれ',
 'いき',
 'ます',
 'まで',
 '寿司',
 '食べ物',
 '感じ',
 'ます',
 'ほうぼう',
 'ほうぼう',
 '握り',
 '！#',
 'ランチ',
 '本木',
 '寿司',
 'すし',
 '西麻布',
 '東京',
 'ほうぼう',
 'っ