In [None]:
# 学習済みのベクトルデータの準備
import os
import gdown
if not os.path.exists('/content/data'):
    os.mkdir('/content/data')

url = 'https://drive.google.com/uc?id=1BwRe4FnPi26qzpUZ1r98x9PVkWLaA1d6'
gdown.download(url, './data/stem_vec.tar.gz', quiet=False)
!tar zxvf /content/data/stem_vec.tar.gz


In [None]:
# Look up the words most similar to a query word.
import gensim
import warnings
from gensim.models import Word2Vec

if not os.path.exists('/content/stem_vec/word2vec.gensim.egotw.model'):
    raise ValueError("ERROR: 学習済みword2vecモデルが存在しません。")

# Load the pre-trained Word2Vec model and keep its KeyedVectors (`.wv`).
# Membership tests and similarity queries are KeyedVectors operations; calling
# them on the Word2Vec model object itself no longer works in gensim 4.
wv = Word2Vec.load("/content/stem_vec/word2vec.gensim.egotw.model").wv

# Query
input_word = '学生'
if input_word not in wv:
    print("Input word [%s] is not included in the model" % input_word)
else:
    results = wv.most_similar(positive=[input_word])
    for result in results:
        print(result)

In [None]:

# 形態素解析器MeCabのラッパーライブラリfugashiをインストール
!pip install fugashi[unidic-lite]
from fugashi import Tagger

txt = '私は徳島大学の学生です。'

tagger = Tagger('-Owakati')

# Tokenize a text into words (wakati-gaki).
def wakatigaki(txt):
    """Split ``txt`` into tokens using the notebook-level ``tagger``.

    The string returned by ``tagger.parse`` is split on single spaces and the
    resulting pieces are returned as a list of strings.
    """
    return tagger.parse(txt).split(' ')

# Show the tokens of the sample sentence, one per line.
print('\n'.join(wakatigaki(txt)))

In [None]:
# コーパスの準備
# テキストデータ(livedoor news corpus)をMeCab(fugashi)で分かち書き処理
!wget https://www.rondhuit.com/download/ldcc-20140209.tar.gz
!tar zxvf ldcc-20140209.tar.gz
# テキストを1つのファイルにまとめる(すべてのファイルで3行目以降が本文)
import glob
wf = open('./all.txt', 'w')
for dir in glob.glob('/content/text/*'):
    for path in glob.glob(f'{dir}/*.txt'):
        l = 0
        for line in open(path):
            line = line.rstrip()
            if line == '':
                continue

            l += 1
            if l >= 3:
                wf.write(' '.join(wakatigaki(line)) + '\n')
wf.close


In [None]:
# Continue training the pre-trained model on the tokenized
# livedoor news corpus text.
from gensim.models import word2vec
corpus = '/content/all.txt'  # tokenized corpus file used for the extra training
newmodel = '/content/stem_vec/w2v.new.livedoor.model'  # where the updated model is saved
# NOTE(review): Text8Corpus presumably reads the file as one continuous
# whitespace-separated stream rather than one sentence per line; confirm this
# is intended (LineSentence may fit a line-per-sentence file better).
sentences = word2vec.Text8Corpus(corpus)

# Load the pre-trained model, extend its vocabulary with the new corpus
# (update=True), train for 10 epochs and save the result under `newmodel`.
model = word2vec.Word2Vec.load("/content/stem_vec/word2vec.gensim.egotw.model")
model.build_vocab(sentences, update=True)
model.train(sentences, total_examples=model.corpus_count, epochs=10)
model.save(newmodel)

In [None]:
# Load the additionally trained Word2Vec model and keep its KeyedVectors.
# Membership tests, indexing and similarity queries are KeyedVectors
# operations; calling them on the Word2Vec model object itself no longer works
# in gensim 4.
wvnew = Word2Vec.load(newmodel).wv


def _print_results(results):
    """Print every (word, similarity) pair on its own line."""
    for result in results:
        print(result)


def _print_similar_words(word, vectors):
    """Print the nearest neighbours of ``word``, or a notice if it is unknown."""
    if word not in vectors:
        print("Input word [%s] is not included in the model" % word)
    else:
        _print_results(vectors.most_similar(positive=[word]))


# Query
input_word = '中学生'
_print_similar_words(input_word, wvnew)

print("-----------")
input_word2 = 'マリオ'
_print_similar_words(input_word2, wvnew)

# The arithmetic below indexes both words directly, which raises KeyError for
# an unknown word, so it only runs when both words are in the vocabulary.
if input_word in wvnew and input_word2 in wvnew:
    ## Result of adding input_word and input_word2
    sum_vect = wvnew[input_word] + wvnew[input_word2]
    print(sum_vect)
    _print_results(wvnew.most_similar(positive=[input_word, input_word2]))

    print("---------------")
    _print_results(wvnew.similar_by_vector(sum_vect))

    print(".....................")
    ## Result of subtracting input_word2 from input_word
    sub_vect = wvnew[input_word] - wvnew[input_word2]
    print(sub_vect)
    _print_results(wvnew.most_similar(positive=[input_word], negative=[input_word2]))

    print("#######################")
    _print_results(wvnew.similar_by_vector(sub_vect))
else:
    print("Skipping vector arithmetic: both input words must be in the model")

In [None]:
# Download the WRIME corpus by cloning its GitHub repository into ./wrime.
!git clone https://github.com/ids-cv/wrime


In [None]:
# 文書ベクトルを作成する
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd

def cosine_sim(X, Y):
    """Return the pairwise cosine similarity between the rows of X and Y.

    Args:
        X: array-like of shape (n, d); ndarray or np.matrix.
        Y: array-like of shape (m, d); ndarray or np.matrix.

    Returns:
        ndarray of shape (n, m) whose entry (i, j) is the cosine similarity of
        X[i] and Y[j]. NaN entries are ignored when computing the norms. A row
        with zero norm produces nan (0 / 0).
    """
    # Work on plain ndarrays. With np.matrix, `*` is a matrix product, while
    # with ndarrays it broadcasts element-wise, so the previous formula only
    # yielded an (n, m) result when X happened to be an np.matrix.
    X = np.asarray(X)
    Y = np.asarray(Y)
    x_sq = np.nansum(np.power(X, 2), axis=1)
    y_sq = np.nansum(np.power(Y, 2), axis=1)
    # The outer product pairs every row norm of X with every row norm of Y.
    return (X @ Y.T) / np.sqrt(np.outer(x_sq, y_sq))


# Build the array of word embeddings for a vocabulary.
def make_vec(fn, model):
    """Stack the embedding of every word in ``fn`` into one array.

    Args:
        fn: iterable of words (main() passes the TF-IDF feature names).
        model: either a Word2Vec-style model exposing its vectors as
            ``model.wv``, or any mapping-like object that supports
            ``word in model`` and ``model[word]``.

    Returns:
        ndarray with one row per word of ``fn``, in the same order. A word
        missing from the model gets an all-zero row.
    """
    # Lookups belong to the keyed vectors; fall back to the object itself so
    # KeyedVectors or plain mappings can be passed directly.
    vectors = getattr(model, 'wv', model)
    # Take the zero-row width from the model when it is declared; 50 is the
    # previous hard-coded width, kept as the fallback.
    dim = getattr(vectors, 'vector_size', 50)
    rows = [vectors[word] if word in vectors else np.zeros(dim) for word in fn]
    return np.asarray(rows)

def getEmotionLabels(df):
    """Return the dominant writer emotion of every row of ``df``.

    For each row, the eight ``Writer_*`` score columns are compared and the
    Japanese label of the column with the highest score is chosen (the first
    column wins on ties). The result is a list with one label per row.
    """
    score_columns = ['Writer_Joy', 'Writer_Sadness',
                     'Writer_Anticipation', 'Writer_Surprise',
                     'Writer_Anger', 'Writer_Fear', 'Writer_Disgust', 'Writer_Trust']
    # Labels are listed in the same order as score_columns.
    labels = ['喜び', '悲しみ', '期待', '驚き', '怒り', '怖れ', '嫌悪', '信頼']
    strongest = np.argmax(df[score_columns].values, axis=1)
    return [labels[idx] for idx in strongest]


def filtering_result(sims, threshold):
    """Keep the similarities that reach ``threshold``.

    Returns a dict mapping the position of each entry of ``sims`` whose value
    is greater than or equal to ``threshold`` to that value.
    """
    return {idx: sim for idx, sim in enumerate(sims) if sim >= threshold}

# Combine several sentences by adding / subtracting their word vectors.
def make_query_vec(queries, operators, model):
    """Build one query vector as a signed sum of word embeddings.

    Every sentence of ``queries`` is tokenized with ``wakatigaki``. The vector
    of each token known to the model is multiplied by the operator paired with
    that sentence and added to a running total.

    Args:
        queries: sequence of sentences.
        operators: sequence of multipliers paired position-wise with
            ``queries`` (main() passes 1 to add and -1 to subtract).
        model: either a Word2Vec-style model exposing its vectors as
            ``model.wv``, or any mapping-like object that supports
            ``word in model`` and ``model[word]``.

    Returns:
        ndarray of shape (1, d) holding the accumulated vector. When no token
        is found in the model the result is all zeros.
    """
    vectors = getattr(model, 'wv', model)
    total = None
    for q, o in zip(queries, operators):
        for w in wakatigaki(q):
            if w in vectors:
                signed = o * vectors[w]
                # None marks "nothing accumulated yet". Testing whether the
                # running sum equals zero instead would discard the total
                # whenever its components happen to cancel out.
                total = signed if total is None else total + signed
    if total is None:
        # 50 is the previous hard-coded width, kept as the fallback.
        total = np.zeros(getattr(vectors, 'vector_size', 50))
    return np.asarray(total).reshape(1, -1)


# Main routine
def main():
    """Search the WRIME test sentences closest to a composed query.

    Steps:
      1. Load the embeddings trained on the livedoor corpus.
      2. Read the WRIME TSV, keep the rows marked 'test', tokenize each
         sentence and write the tokenized text to ./wrime.txt.
      3. Build one vector per sentence as the TF-IDF-weighted average of its
         word embeddings.
      4. Build a query vector from three sentences combined with +/- operators.
      5. Print the (at most) 30 sentences whose cosine similarity to the query
         is at least 0.6, then the count of each emotion label among them.
    """
    # Load the embedding model trained in the earlier cell. Only the keyed
    # vectors are needed for lookups, and they support `in` / indexing on
    # every gensim version.
    livedoor_model_path = '/content/stem_vec/w2v.new.livedoor.model'
    livedoor_vectors = word2vec.Word2Vec.load(livedoor_model_path).wv

    # Load the text corpus to be searched.
    df = pd.read_csv('./wrime/wrime-ver1.tsv', sep='\t')
    df = df[df['Train/Dev/Test'] == 'test']
    sents = df['Sentence'].values
    emos = getEmotionLabels(df)

    print(len(sents))

    cps = []
    with open('./wrime.txt', 'w', encoding='utf-8') as wf:
        for s in sents:
            sen = ' '.join(wakatigaki(s))
            wf.write(f'{sen}\n')
            cps.append(sen)

    # Compute TF-IDF over the corpus. The sentences are already tokenized, so
    # the analyzer just splits on spaces; token_pattern is not passed because
    # it is ignored when the analyzer is a callable.
    print(len(cps))
    vect = TfidfVectorizer(analyzer=lambda x: x.split(" "), dtype=np.float32)
    wv = vect.fit_transform(cps)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations.
    if hasattr(vect, 'get_feature_names_out'):
        fn = vect.get_feature_names_out()
    else:
        fn = vect.get_feature_names()

    vlist = make_vec(fn, livedoor_vectors)

    # Sentence vector = TF-IDF-weighted sum of word vectors / sum of weights.
    vsm = wv.sum(axis=1)
    docvec = (wv @ vlist) / vsm

    # Add and subtract three sentences.
    queries = ['今日は全身がだるくて，何もやる気が起きません！',
            'でも，ちょっと気分が良くなってきたので，集中したいと思います！',
            '今晩のメニューはすき焼きです。とても楽しみです（笑）' ]
    # Operators (1: add, -1: subtract)
    operators = [1, -1, 1]
    query = make_query_vec(queries, operators, livedoor_vectors)

    # Similarity between the query and every sentence, flattened to one
    # scalar per sentence so filtering, sorting and formatting see plain numbers.
    sims = np.asarray(cosine_sim(docvec, query)).ravel()
    # Keep only the results whose similarity is at least 0.6.
    sh = filtering_result(sims, 0.6)

    topk = 30 # show only the top 30 results
    emok = {'喜び':0, '悲しみ':0, '期待':0, '驚き':0, '怒り':0, '怖れ':0, '嫌悪':0, '信頼':0}
    for k,v in sorted(sh.items(), key=lambda x:x[1], reverse=True)[:topk]:
        print("SID:[%d], Sim:[%.3lf] <%s> ==> [%s]" % (k,v, cps[k], emos[k]) )
        emok[emos[k]] += 1

    print(emok)

if __name__ == '__main__':
    main()
    