# BERT Japanese Pretrained model 
#### The multilingual pretrained model includes Japanese, so it is possible to use the multilingual pretrained model for Japanese tasks, but it is not appropriate that the basic unit is almost a letter. Therefore, morphological analysis was performed on the input text, and the basic unit was obtained by dividing the morpheme into subwords, and pretraining was performed using only Japanese text (using Wikipedia).

# The details of the Japanese pretrained model are shown below.
#### Input text: All Japanese Wikipedia (about 18 million sentences, normalized to half-width)
Perform morphological analysis on the input text with Juman ++ (v2.0.0-rc2), and further apply Unsupervised Word Segmentation for Neural Machine Translation and Text Generation (BPE) https://github.com/rsennrich/subword-nmt to divide into subwords
Same setting as BERT_ {BASE} (12-layer, 768-hidden, 12-heads)
30 epoch (1GPU (using GeForce GTX 1080 Ti) takes about 1 day for 1epoch, so about 30 days for pretraining)
Use the latest GPU or use a program that can use Multi-GPU.
Number of vocabularies: 32,000 (including morphemes and subwords)
max_seq_length: 128

In [16]:
from bert_juman import BertWithJumanModel
bert = BertWithJumanModel ("/home/ifte/Downloads/L12_H768_A12_E30_BPE_WWM_Transformers-Model/" )

In [39]:
import pandas as pd
df = pd.read_csv("/home/ifte/amiebot_project/MyAI/AI-system/retrieval_Model/processed_perPage_perLine.csv")
df.head()
len(df)

445

In [18]:
for index, row in df.iterrows():
    var = str(row['Data']).split('。')
    break

In [44]:
len(var)
df.iloc[117]

Data      メールMobiControl v14 ManualWindows Embedded: Exc...
PageID                                                  117
Name: 117, dtype: object

In [34]:
import numpy as np

docvec = []

for item in var:
    emb = bert.get_sentence_embedding (item)
    docvec.append(emb.tolist()) 
    

In [38]:
import pickle

with open('docvec.pkl', 'wb') as f:
    pickle.dump(docvec ,f)
    
with open('docvec.pkl', 'rb') as f:
    vector = pickle.load(f)
    
len(vector)    

115

In [None]:
from sklearn.cluster import KMeans
n_clusters = int(np.ceil(len(var)**0.3))
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans = kmeans.fit(vector)
clusters = kmeans.labels_.tolist()
clusters

In [59]:
sentences = var
sentenceDictionary = {}
for idx, sentence in enumerate(sentences):
	sentenceDictionary[idx] = {}
	sentenceDictionary[idx]['text'] = sentence
	sentenceDictionary[idx]['cluster'] = clusters[idx]

In [65]:
clusterDictionary = {}
for key, sentence in sentenceDictionary.items():
	if sentence['cluster'] not in clusterDictionary:
		clusterDictionary[sentence['cluster']] = []
	clusterDictionary[sentence['cluster']].append(sentence['text'])
	sentence['idx'] = len(clusterDictionary[sentence['cluster']]) - 1


In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = TfidfVectorizer()

####################################
# Calculate Cosine Similarity Scores
####################################		

# For each cluster of sentences,
# Find the sentence with highet cosine similarity over all sentences in cluster
maxCosineScores = {}
for key, clusterSentences in clusterDictionary.items():
	maxCosineScores[key] = {}
	maxCosineScores[key]['score'] = 0
	tfidf_matrix = vectorizer.fit_transform(clusterSentences)
	cos_sim_matrix = cosine_similarity(tfidf_matrix)
    
	for idx, row in enumerate(cos_sim_matrix):
		sum = 0
		for col in row:
			sum += col
		if sum > maxCosineScores[key]['score']:
			maxCosineScores[key]['score'] = sum
			maxCosineScores[key]['idx'] = idx



In [70]:
####################################
# Construct Document Summary
####################################	

# for every cluster's max cosine score,
# find the corresponding original sentence
resultIndices = []
i = 0
for key, value in maxCosineScores.items():
	cluster = key
	idx = value['idx']
# 	stemmedSentence = clusterDictionary[cluster][idx]
	# key corresponds to the sentences index of the original document
	# we will use this key to sort our results in order of original document
	for key, value in sentenceDictionary.items():
		if value['cluster'] == cluster and value['idx'] == idx:
			resultIndices.append(key)

resultIndices.sort()

# Iterate over sentences and construct summary output
result = ''
for idx in resultIndices:
	result += sentences[idx] + ' '
		

print(result)

1台の会社支給端末(iOS Android)を、複数の従業員で共用する場合は、必須 また、 端末の登録後に、5項のどれかを追加したり、変更することも可能 詳しくは、を参照ください  続けて、「登録ID」、「登録用URL」または「Setup.INI」ファイルに一意的に対応する端末登録ルールに、自らの端末シリアル番号などの属性情報を申告します 文書、写真、動画などのコンテンツが主体 


In [54]:
from sklearn.metrics import pairwise_distances_argmin_min
avg = []
closest = []
for j in range(n_clusters):
    idx = np.where(kmeans.labels_ == j)[0]
    #print("IDX is: ", idx)
    avg.append(np.mean(idx))
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,vector)
ordering = sorted(range(n_clusters), key=lambda k: avg[k])
#Summ = ' '.join([review[closest[idx]] for idx in ordering])
#print("Done for review # = ", rv)

In [56]:
summary = ' '.join([vector[closest[idx]] for idx in ordering])

TypeError: sequence item 0: expected str instance, list found

In [1]:
from bert_juman import JumanTokenizer
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel

mod = "/home/ifte/Downloads/L12_H768_A12_E30_BPE_WWM_Transformers-Model/" 
mod_txt = "/home/ifte/Downloads/L12_H768_A12_E30_BPE_WWM_Transformers-Model/vocab.txt"

model = BertModel.from_pretrained(mod)
bert_tokenizer = BertTokenizer(mod_txt, do_lower_case=False, do_basic_tokenize=False)

juman_tokenizer = JumanTokenizer()
text = "吾輩は猫である。"
tokens = juman_tokenizer.tokenize(text)
bert_tokens = bert_tokenizer.tokenize(" ".join(tokens))
ids = bert_tokenizer.convert_tokens_to_ids(["[CLS]"] + bert_tokens + ["[SEP]"])
tokens_tensor = torch.tensor(ids).reshape(1, -1)

In [2]:
tokens

['吾輩', 'は', '猫', 'である', '。']

In [3]:
bert_tokens

['[UNK]', 'は', '猫', 'である', '。']

In [4]:
ids

[2, 1, 9, 4816, 32, 7, 3]

In [15]:
tokens_tensor

tensor([[   2,    1,    9, 4816,   32,    7,    3]])