In [1]:
with open('/home/yamanishi/project/trip_recommend/data/lda/lda.pkl' , 'rb') as f:
    topic_dict = pickle.load(f)

In [9]:
# -*- coding: utf-8 -*-
"""LDA_jp.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/github/m3yrin/NTM/blob/master/LDA_jp.ipynb

# Gensim LDA model for Japanese articles
auther : m3yrin

reference : http://tdual.hatenablog.com/entry/2018/04/09/133000

### Memo
* tdual' s LDA script is massively cited.
* janome tokenizer is used instead of Mecab.

### Dataset
livedoor ニュースコーパス / livedoor News Corpus  
https://www.rondhuit.com/download.html#ldcc  
CC BY-ND 2.1 JP  
https://creativecommons.org/licenses/by-nd/2.1/jp/
"""



import os

import pandas as pd
from urllib import request 
import logging
from pathlib import Path
import numpy as np
import re
import janome
import random
from gensim import corpora, models

from janome.tokenizer import Tokenizer
from janome import analyzer
from janome.charfilter import *
from janome.tokenfilter import *

from tqdm import tqdm
import pickle
tqdm.pandas()

# https://ohke.hateblo.jp/entry/2017/11/02/230000
class NumericReplaceFilter(TokenFilter):
    def apply(self, tokens):
        for token in tokens:
            parts = token.part_of_speech.split(',')
            if (parts[0] == '名詞' and parts[1] == '数'):
                token.surface = '0'
                token.base_form = '0'
                token.reading = 'ゼロ'
                token.phonetic = 'ゼロ'
            yield token

            
class docTokenizer:
    def __init__(self, stopwords, parser=None, include_pos=None, exclude_posdetail=None, exclude_reg=None):
    
        self.stopwords = stopwords
        self.include_pos = include_pos if include_pos else  ["名詞", "動詞", "形容詞"]
        self.exclude_posdetail = exclude_posdetail if exclude_posdetail else ["接尾", "数"]
        self.exclude_reg = exclude_reg if exclude_reg else r"$^"  # no matching reg
        
        self.char_filters = [
                        UnicodeNormalizeCharFilter(), 
                        RegexReplaceCharFilter(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+", u''), #url
                        RegexReplaceCharFilter(r"\"?([-a-zA-Z0-9.`?{}]+\.jp)\"?", u''), #*.jp
                        RegexReplaceCharFilter(self.exclude_reg, u'')
                       ]
        
        self.token_filters = [
                         NumericReplaceFilter(),
                         POSKeepFilter(self.include_pos),
                         POSStopFilter(self.exclude_posdetail), 
                         LowerCaseFilter()
                        ]
        
        self.analyzer = analyzer.Analyzer(char_filters=self.char_filters, tokenizer=Tokenizer(), token_filters=self.token_filters)
        
    def tokenize(self, text):

        tokens = self.analyzer.analyze(text)
        tokens = [re.sub(r"," ,"\t", str(i)) for i in tokens]
        l = [line.split("\t") for line in tokens]
        
        #Janome response
        #i[] : ['認め', '動詞', '自立', '*', '*', '一段', '連用形', '認める', 'ミトメ', 'ミトメ']

        res = []
        for i in l:
            if i[7] not in self.stopwords:
                res.append(i[7])
                
        return res

"""### Hyper-parameters"""

num_articles = -1
topic_num = 20
passes = 50

"""### Load stopwords"""

res = request.urlopen("http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt")
stopwords = [line.decode("utf-8").strip() for line in res]
res = request.urlopen("http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/English.txt")
stopwords += [line.decode("utf-8").strip() for line in res]

stopwords += ['*', '&', '[', ']', ')', '(', '-',':','.','/','0', '...?', '——', '!【', '"', ')、', ')。', ')」']

print("# Stopword : ", len(stopwords))

"""### Load articles"""

doc_path = "./text/"
doc_dir = Path(doc_path)
dirs = [i for i in doc_dir.iterdir() if i.is_dir()]
articles = [a for categ in dirs for a in categ.iterdir()]
random.shuffle(articles)

articles = articles[:num_articles]

tokenizer = docTokenizer(stopwords = stopwords, exclude_reg=r"\d(年|月|日)")

df_exp = pd.read_csv('/home/yamanishi/project/trip_recommend/data/jalan/spot/experience_light.csv')
df_review = pd.read_csv('/home/yamanishi/project/trip_recommend/data/jalan/review/review_all.csv')
train_spot = df_exp[df_exp['valid']>=2].loc[:,'spot_name']
valid_spot = df_exp[df_exp['valid']==1].loc[:, 'spot_name']
test_spot = df_exp[df_exp['valid']==0].loc[:, 'spot_name']
df_review_train = df_review[df_review['spot'].isin(train_spot)]
df_review_valid = df_review[df_review['spot'].isin(valid_spot)]
df_review_test = df_review[df_review['spot'].isin(test_spot)]

docs_train = []
for a in tqdm(df_review_train['review']):
    docs_train.append(tokenizer.tokenize(a))

docs_valid = []
for a in tqdm(df_review_valid['review']):
    docs_valid.append(tokenizer.tokenize(a))

docs_test = []
for a in tqdm(df_review_test['review']):
    docs_test.append(tokenizer.tokenize(a))

# Stopword :  928


  exec(code_obj, self.user_global_ns, self.user_ns)
  1%|▏         | 14363/1069153 [04:26<5:26:38, 53.82it/s] 


KeyboardInterrupt: 

In [14]:
df_exp = pd.read_csv('/home/yamanishi/project/trip_recommend/data/jalan/spot/experience_spare.csv')

In [19]:
tfidf_words = np.load('/home/yamanishi/project/trip_recommend/data/jalan/graph/tfidf_words.npy')
cooc = np.zeros((len(tfidf_words), len(tfidf_words)))

In [20]:
word_id = {word:id for id, word in enumerate(tfidf_words)}

In [31]:
valid_idx = np.load('/home/yamanishi/project/trip_recommend/data/jalan/graph/valid_idx.npy')

In [32]:
train_spots = df_exp['spot_name'].values[valid_idx>1]

In [35]:
import MeCab
tokenizer = MeCab.Tagger()
tokenizer.parse("")
df_review_train = df_review[df_review['spot'].isin(train_spots)]
def get_word(review):
    node = tokenizer.parseToNode(review)
    keywords = []
    while node:
        #if node.feature.split(",")[0] == u"名詞":
        #    keywords.append(node.surface)
        if node.feature.split(",")[0] in  [u"形容詞", u"動詞", u"名詞"]:
            keywords.append(node.feature.split(",")[6])
        #elif node.feature.split(",")[0] == u"動詞":
        #    keywords.append(node.feature.split(",")[6])
        node = node.next
    return keywords

In [37]:
from tqdm import tqdm
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)
df_review_train['words'] = df_review_train['review'].parallel_apply(get_word)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=66801), Label(value='0 / 66801')))…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [49]:
word_counts = [0]*len(tfidf_words)
for word in tqdm(df_review_train['words']):
    for w in word:
        if w in word_id:
            word_counts[word_id[w]]+=1
    comb = itertools.combinations(word,2)
    for word1, word2 in comb:
        if word1 in word_id and word2 in word_id:
            id1 = word_id[word1]
            id2 = word_id[word2]
            cooc[id1][id2]+=1

100%|██████████| 1068807/1068807 [10:50<00:00, 1641.90it/s]


In [50]:
cooc.sum()

233405254.0

In [51]:
total_counts = np.outer(np.array(word_counts), np.array(word_counts).T)
pmi = np.log2(cooc*sum(word_counts)/total_counts)

  
  


In [68]:
len(pmi[pmi>=15])

320124

In [74]:
index=np.where(pmi>15)

In [76]:
word_word = np.array([list(index[0]), list(index[1])])
np.save('/home/yamanishi/project/trip_recommend/data/jalan/graph/word_word.npy', word_word)

In [71]:
word_from = []
word_to = []
for i in tqdm(range(len(word_counts))):
    for j in range(len(word_counts)):
        if pmi[i, j]>15:
            #print(tfidf_words[i], tfidf_words[j])
            word_from.append(i)
            word_to.append(i)   
    #if i==3000:
    #    break

  0%|          | 289/68337 [00:14<55:57, 20.27it/s]  


KeyboardInterrupt: 

In [81]:
from torch.nn import LSTMCell
cell = LSTMCell(12, 12)
cell(torch.rand(50, 12), (torch.rand(50, 12), torch.rand(50, 12)))[0].size()

torch.Size([50, 12])