In [1]:
import pandas as pd
import numpy as np
import sys
!pip install mecab-python3
import MeCab
import fasttext
#!pip install gensim
from gensim.models.wrappers.fasttext import FastText
import time
import datetime



In [2]:
## 基本形で分かち書きをする関数

def analysis(text):
    # tagger = MeCab.Tagger("-Ochasen")
    tagger = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')
    tagger.parse("")
    node = tagger.parseToNode(text)
    word = ""
    pre_feature = ""
    while node:
         # 名詞、形容詞、動詞、形容動詞であるかを判定する。
        isUsed = "名詞" in node.feature
        isUsed = "形容詞" in node.feature or isUsed
        isUsed = "動詞" in node.feature or isUsed
        isUsed = "形容動詞" in node.feature or isUsed
         # 以下に該当する場合は除外する。（ストップワード）
        isUsed = (not "代名詞" in node.feature) and isUsed
        isUsed = (not "助動詞" in node.feature) and isUsed
        isUsed = (not "非自立" in node.feature) and isUsed
        isUsed = (not "数" in node.feature) and isUsed
        isUsed = (not "人名" in node.feature) and isUsed
        if isUsed:
            word += " {0}".format(node.feature.split(",")[6])
        '''
        if isUsed:
            if ("名詞接続" in pre_feature and "名詞" in node.feature) or ("接尾" in node.feature):
            word += "{0}".format(node.surface)
        else:
        word += " {0}".format(node.surface)
        #print("{0}{1}".format(node.surface, node.feature))
        '''
        pre_feature = node.feature
        node = node.next
    return word[1:]

In [3]:
# 作品ごとの単語のリストと作品タイトルのリストを出力
def preprocess_reviews(data_path, stopwords_path):
    ## csvの読み込み
    df = pd.read_csv(data_path, encoding='utf-8', dtype={'Rating Score':'float'})
    df = df.dropna(axis=0, how='all', subset=['reviews'])
    ## lists of reviews and titles
    reviews_list = df['reviews'].T.tolist()
    titles_list = df['title'].T.tolist()
    ## stopword
    stopwords_df = pd.read_csv(stopwords_path, encoding='utf-8')
    stopwords_list = stopwords_df.T.values.tolist()[0]
    ## make training data
    splitted_reviews = [analysis(reviews).split(' ') for i, reviews in enumerate(reviews_list)]
    words_list = []
    for i in range(len(splitted_reviews)):
        words_list.append([word for word in splitted_reviews[i] if word not in stopwords_list])
    return words_list, titles_list

In [4]:
# 作品ごとの単語のリスト, 作品タイトルのリスト, idのリストを出力
# レビューの長さを調整：デフォルトは100文字未満の映画を削除、3000字目以降の文字を削除
def preprocess_reviews_length(data_path, stopwords_path, lower=100, upper=3000):
    ## csvの読み込み
    df = pd.read_csv(data_path, encoding='utf-8', dtype={'Rating Score':'float'})
    df = df.dropna(axis=0, how='all', subset=['reviews'])
    ## lists of reviews and titles
    reviews_list = df['reviews'].T.tolist()
    ## stopword
    stopwords_df = pd.read_csv(stopwords_path, encoding='utf-8')
    stopwords_list = stopwords_df.T.values.tolist()[0]
    ## make training data
    splitted_reviews = [analysis(reviews).split(' ') for i, reviews in enumerate(reviews_list)]
    words_list = []
    for i in range(len(splitted_reviews)):
        words_list.append([word for word in splitted_reviews[i] if word not in stopwords_list][:upper])
    df['reviews'] = words_list
    for index, row in df.iterrows():
        df.at[index, 'len'] = len(row['reviews'])
    df = df[df['len'] >= lower]
    words_list = df['reviews'].T.tolist()
    titles_list = df['title'].T.tolist()
    id_list = df['id'].T.tolist()
    return words_list, titles_list, id_list

In [8]:
def create_traindata(data_path, stopwords_path, save_path):
    without_stopwords, titles_list, id_list = preprocess_reviews_length(data_path, stopwords_path)
    trainings = ""
    
    for id, review in zip(id_list, without_stopwords):
        trainings += " __label__" + str(id) + ", " + ' '.join(review) + "\n"
    
    with open(save_path, mode='w') as f:
       f.write(trainings)

In [6]:
def make_model(trainings_path, myFastText_path):
    # save model
    #model = fasttext.train_supervised(input=trainings_path, epoch=2000, loss="hs")
    model = fasttext.train_supervised(input=trainings_path, epoch=2000, dim=300, loss="hs")
    model.save_model(myFastText_path)

In [7]:
def myFastText(data_path, stopwords_path, myFastText_path, titles_path):
    #dt_now = datetime.datetime.now()
    #print(dt_now)
    create_traindata(data_path, stopwords_path, 'data/testdata.txt', titles_path)
    make_model('data/testdata.txt', myFastText_path)
    #dt_now = datetime.datetime.now()
    #print(dt_now)

In [20]:
myFastText('data/1005data.csv', 'data/stopwords.csv', 'data/model')

2020-01-07 22:17:24.584410
2020-01-07 22:39:03.924415


In [16]:
def predict(data_path, model_path, titles_path, word, k):
    
    df = pd.read_csv(data_path, encoding='utf-8')
    titles_list = df['title'].T.tolist()
    
    model = fasttext.load_model(model_path)
    result = model.predict(word, k)
    for i in range(min([k, len(result[0])])):
        print(result[0][i].replace("__label__" , "").replace("," , "") + " : " + str(result[1][i]))
        #label = int(result[0][i].replace("__label__" , "").replace("," , ""))
        #print(str(label) + " : " + titles_list[label] + " : " + str(result[1][i]))

In [22]:
predict('data/1005data.csv', 'data/model', "冒険", 20)

英雄は嘘がお好き : 0.5814523696899414
人生、ただいま修行中 : 0.3005840480327606
シカゴ : 0.04608610272407532
ルパン三世 : 0.0287068672478199
レインマン : 0.019582003355026245
ワイルド・スピード／スーパーコンボ : 0.010769467800855637
グーニーズ : 0.008044487796723843
ボディ・ダブル : 0.0024902045261114836
ナインスゲート : 0.0008865694981068373
小さい魔女とワルプルギスの夜 : 0.0004548046854324639
誰も守ってくれない : 0.0002640899911057204
ベスト・キッド : 0.00014972529606893659
愛の渦 : 0.00014317876775749028
秘密　THE : 0.0001225550367962569
ロッキー5／最後のドラマ : 0.00012201403296785429
ミッション：インポッシブル／ゴースト・プロトコル : 0.00011743255890905857
ピーチガール : 5.2113660785835236e-05
ある精肉店のはなし : 5.109019548399374e-05
ロビン・フッド : 2.6728876036941074e-05
黒い家 : 2.1334335542633198e-05





映画ID順に類似度の配列を返したい

In [15]:
create_traindata('data/1005data.csv', 'data/stopwords.csv', 'data/testdata.txt')