In [28]:
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
from gensim import corpora
from gensim import models
import MeCab
from gensim.models import word2vec
from gensim.models import TfidfModel
from operator import itemgetter

In [4]:
# Read recipe inputs: open the Dataiku dataset "trip_advisor_clustered_prepared"
# and load it into the DataFrame `df`.
# Later cells read the 'cluster_labels', 'words_concat' and 'jp' columns of df.
your_trip_advisor = dataiku.Dataset("trip_advisor_clustered_prepared")
df = your_trip_advisor.get_dataframe()

In [19]:
# Location of the saved TF-IDF model: the "tf_idf" entry under the path of
# Dataiku folder "tMMk2S0T". It is loaded later with TfidfModel.load(tfidf_path).
tfidf_path = dataiku.Folder("tMMk2S0T").get_path() + "/tf_idf"

In [8]:
# Keep only the rows whose cluster label is one of the two target clusters and
# take their 'words_concat' values as a NumPy array.
target_cluster_labels = ['接客などのサービス', '具材・素材・味']
in_target_cluster = df['cluster_labels'].isin(target_cluster_labels)
list_vocabs = df.loc[in_target_cluster, 'words_concat'].values

In [11]:
# Split the comma-separated word strings of the first two selected rows and
# concatenate them into one flat word list.
# NOTE(review): only rows 0 and 1 are used; this assumes the cluster filter
# returns exactly two rows — confirm against the input dataset.
first_row_words = list_vocabs[0].split(",")
second_row_words = list_vocabs[1].split(",")
ramen_word = first_row_words + second_row_words

In [17]:
# Load the saved gensim dictionary stored under Dataiku folder "POe5uF4H" and
# encode ramen_word as a corpus containing a single bag-of-words document.
ramen_dictionary_folder = dataiku.Folder("POe5uF4H")
ramen_dictionary_path = ramen_dictionary_folder.get_path() + "/ramen_dictionary"
dictionary = corpora.Dictionary.load(ramen_dictionary_path)
corpus = [dictionary.doc2bow(ramen_word)]

In [23]:
# Load the previously saved gensim TF-IDF model from tfidf_path.
tfidf_model = TfidfModel.load(tfidf_path)

In [24]:
# Apply the TF-IDF model to the bag-of-words corpus. The result is a gensim
# TransformedCorpus wrapper (see the repr printed by the next cell), which is
# iterated document by document further down.
corpus_tfidf = tfidf_model[corpus]

In [25]:
# Display the transformed corpus object; only its repr is shown here, the
# actual weights are expanded in the following cells.
corpus_tfidf

<gensim.interfaces.TransformedCorpus at 0x7f8b3a6b0828>

In [26]:
# Convert token ids to words.
# texts_tfidf holds, for every document, its TF-IDF entries with the id
# replaced by the dictionary word: a list of [word, weight] pairs.
texts_tfidf = [
    [[dictionary[token_id], weight] for token_id, weight in weighted_doc]
    for weighted_doc in corpus_tfidf
]

In [27]:
# Inspect the [word, TF-IDF weight] pairs of every document.
# NOTE(review): this dumps the whole nested list; a slice such as
# texts_tfidf[0][:20] would keep the cell output short.
texts_tfidf

[[['10', 0.004334910083146761],
  ['10分', 0.0005285872455867774],
  ['12', 0.006766371415655174],
  ['15', 0.018210897677701232],
  ['20人', 0.022558401048155173],
  ['25', 0.025290056606975],
  ['30', 0.0141947191343707],
  ['30分', 0.004433138305269084],
  ['45', 0.03940381077145201],
  ['いい', 0.0002984279720377294],
  ['うまい', 0.013993555551083062],
  ['こだわり', 0.012668045107411523],
  ['これら', 0.022874923936817417],
  ['どちら', 0.01007160598866056],
  ['めちゃくちゃ', 0.010785702110361322],
  ['スタッフ', 0.013304853097998276],
  ['タイミング', 0.004933735205932772],
  ['チャーシュー', 7.442935664044637e-05],
  ['レストラン', 0.565768181189836],
  ['両方', 0.021065143092355267],
  ['予約', 0.014617912231222056],
  ['使用', 0.009463146089580467],
  ['全体', 0.013299414915807254],
  ['処理', 0.030047955305432167],
  ['出汁', 0.0005285872455867774],
  ['到着', 7.442935664044637e-05],
  ['味付け', 0.0017251494429547857],
  ['大きく', 0.013701431681182593],
  ['大丈夫', 0.01641614040210622],
  ['座席', 0.012835611946504188],
  ['待ち時間', 0.01034

In [29]:
# Sort each document's [word, weight] pairs by TF-IDF weight, highest first,
# and keep only the words of the 20 top-weighted entries.
# Each element of the result is therefore a list of at most 20 words for the
# corresponding document in texts_tfidf.
texts_tfidf_sorted_top20 = [
    [
        weighted_word[0]
        for weighted_word in sorted(weighted_words, key=itemgetter(1), reverse=True)[:20]
    ]
    for weighted_words in texts_tfidf
]

In [30]:
# Show the top-20 word list(s) produced by the previous cell.
texts_tfidf_sorted_top20

[['レストラン',
  '私たち',
  'あなた',
  '彼ら',
  'パリ',
  '待ち行列',
  'クール',
  '快適',
  '窮屈',
  '事項',
  'キリン',
  '共有',
  'パイ',
  '宣伝',
  '提案',
  'バー',
  'インテリア',
  'コンパクト',
  'ストック',
  '親切']]

In [0]:
# Path of Dataiku folder "0kM5kXKs", appended after MeCab's "-d" option below.
# NOTE(review): presumably this folder holds the neologd dictionary mentioned
# where the tagger is created — confirm.
wakati_folder = dataiku.Folder("0kM5kXKs").get_path()
# Despite its name, tagger_path is the full MeCab option string
# ("-Owakati -d <folder>"), not just a path.
tagger_path = '-Owakati -d ' + wakati_folder

In [0]:
# Create the MeCab tagger from the option string built above
# (original note: the tagger uses MeCab.Tagger with the neologd dictionary).
tagger = MeCab.Tagger(tagger_path)
# NOTE(review): parsing an empty string first looks like the usual
# mecab-python warm-up workaround so that node.surface is read correctly —
# confirm it is still required with the installed version.
tagger.parse('')

In [0]:
def tokenize_ja(text, lower):
    """Yield the surface forms of the nouns and adjectives MeCab finds in ``text``.

    Relies on the module-level ``tagger`` (a ``MeCab.Tagger``) being defined.

    Args:
        text: Input to analyse; it is converted with ``str()`` before parsing.
        lower: When true, each yielded surface form is lower-cased; when false,
            surface forms are yielded unchanged.

    Yields:
        str: The surface of every node whose first feature field is
        "名詞" (noun) or "形容詞" (adjective), in parse order.
    """
    node = tagger.parseToNode(str(text))
    while node:
        # The first comma-separated feature field is the part of speech;
        # only the parts of speech listed here are kept.
        part_of_speech = node.feature.split(',')[0]
        if part_of_speech in ("名詞", "形容詞"):
            # `lower` only controls case folding. It used to be and-ed into the
            # filter above, which made lower=False yield no tokens at all.
            yield node.surface.lower() if lower else node.surface
        node = node.next
def tokenize(content, token_min_len, token_max_len, lower):
    """Return the tokens of ``content`` that pass the length and prefix filters.

    Args:
        content: Text handed to ``tokenize_ja``.
        token_min_len: Minimum token length (inclusive).
        token_max_len: Maximum token length (inclusive).
        lower: Forwarded unchanged to ``tokenize_ja``.

    Returns:
        list[str]: Tokens from ``tokenize_ja(content, lower)`` whose length is
        within ``[token_min_len, token_max_len]`` and which do not start with
        an underscore, each converted with ``str()``.
    """
    kept_tokens = []
    for candidate in tokenize_ja(content, lower):
        # Length bounds are checked first, then the underscore prefix.
        if not (token_min_len <= len(candidate) <= token_max_len):
            continue
        if candidate.startswith('_'):
            continue
        kept_tokens.append(str(candidate))
    return kept_tokens

In [0]:
# Tokenize every entry of df['jp'] into one token list per entry.
# Tokens shorter than 2 characters are dropped; 10000 acts as an effectively
# unlimited upper length bound.
wakati_ramen_text = [tokenize(entry, 2, 10000, True) for entry in df['jp']]

In [0]:
# Flatten the per-entry token lists into one flat token list
# (order preserved, duplicates kept).
vocab = []
for entry_tokens in wakati_ramen_text:
    vocab.extend(entry_tokens)

In [0]:
# For every token in vocab, look up its vector in ramen_model.wv and collect
# the token and the vector in parallel lists. Tokens whose lookup raises
# KeyError are skipped.
# NOTE(review): ramen_model is not defined in any cell visible here —
# presumably a gensim Word2Vec model loaded elsewhere; confirm it is in scope
# before running on a fresh kernel.
words, vectors = [], []
for token in vocab:
    try:
        embedding = ramen_model.wv[token]
    except KeyError:
        continue
    words.append(token)
    vectors.append(embedding)

In [0]:
# Build a DataFrame with one row per collected token: the vector values in the
# leading columns plus a 'words' column holding the token itself.
vocab_df = pd.DataFrame(vectors).assign(words=words)

In [0]:
# Write vocab_df to the Dataiku dataset "trip_advisor_vocabs" via
# write_with_schema.
# NOTE(review): the variable is named *_tf_idf although the data written is
# the word/vector table built above — the name looks like a leftover.
trip_advisor_tf_idf = dataiku.Dataset("trip_advisor_vocabs")
trip_advisor_tf_idf.write_with_schema(vocab_df)

In [0]:
#ramen_dictionary_path = dataiku.Folder("POe5uF4H").get_path() + "/ramen_dictionary"
#dictionary = corpora.Dictionary.load(ramen_dictionary_path)
#corpus = list(map(dictionary.doc2bow, reviews_concat))