In [None]:
import re
import random
from pathlib import Path
import logging
from multiprocessing import Pool

import numpy as np
import MeCab as mecab
from gensim.models import Word2Vec, word2vec
from tqdm import tqdm

# Seed Python's and NumPy's global random number generators with a fixed value.
seed = 42
random.seed(seed)
np.random.seed(seed)

# Directory (relative to the working directory) that holds the downloaded inputs
# and the intermediate wiki.txt corpus.
BASE_PATH = Path('./input')


In [None]:
!mkdir input
!wget https://dumps.wikimedia.org/jawiki/latest/jawiki-latest-pages-articles.xml.bz2
!mv jawiki-latest-pages-articles.xml.bz2 input
!git clone https://github.com/Language-Media-Lab/commonsense-moral-ja.git
!cp commonsense-moral-ja/data/* input/
!rm commonsense-moral-ja -rf


In [None]:
# Install the dependencies listed in requirements.txt (%pip targets the kernel's environment).
%pip install -r requirements.txt
# wikiextractor is installed straight from GitHub, pinned to a specific commit.
%pip install git+https://github.com/attardi/wikiextractor.git@ab8988ebfa9e4557411f3d4c0f4ccda139e18875
# Run the extractor on the dump from inside input/; the following cell reads its
# output from input/text/.
!cd input && python -m wikiextractor.WikiExtractor jawiki-latest-pages-articles.xml.bz2
# Run the unidic package's download command (presumably fetches the dictionary
# MeCab uses for tokenisation - confirm against requirements.txt).
!python -m unidic download


In [None]:
!cd input && find text/ | grep wiki | awk '{system("cat "$0" >> wiki.txt")}'


In [None]:
# Clean wiki.txt: drop lines that consist solely of one <...> tag and drop empty
# lines; every other line is kept unchanged.
#
# The file is processed line by line and written to a temporary sibling that is
# renamed over the original at the end. This keeps memory usage flat, leaves the
# original intact if the cell is interrupted, and guarantees that the tag
# pattern can never match across a line break (a whole-file regex with `[^>]*`
# would swallow following lines when a line merely starts with '<').
src_path = BASE_PATH / 'wiki.txt'
tmp_path = BASE_PATH / 'wiki.txt.tmp'
tag_line = re.compile(r'<[^>]*>')

with open(src_path, 'r', encoding='utf-8') as src, \
     open(tmp_path, 'w', encoding='utf-8') as dst:
  for line in src:
    body = line.rstrip('\n')
    # Keep the line only if it is non-empty and not a tag-only line.
    if body and not tag_line.fullmatch(body):
      dst.write(line)

# Path.replace overwrites the destination if it already exists.
tmp_path.replace(src_path)


In [None]:
# Load the cleaned corpus. The encoding is given explicitly so the read does not
# depend on the platform's default locale (the file is written as UTF-8).
with open(BASE_PATH / 'wiki.txt', encoding='utf-8') as f:
  text = f.read()

# Split on newlines and on the Japanese full stop. Each non-blank fragment is
# stripped and given a trailing "。", so every stored sentence is non-empty and
# no further filtering pass is needed.
sentences = []
for s in tqdm(re.split("[\n。]", text)):
  s = s.strip()
  if s:
    sentences.append(s + "。")

print(len(sentences))

# Per-process MeCab tagger, created lazily on the first parse() call. Each pool
# worker builds its own instance once instead of once per sentence.
_TAGGER = None

def parse(text):
  """Split `text` into tokens with MeCab in wakati (space-separated) mode.

  Args:
    text: The string to tokenise.

  Returns:
    A list of token strings, in order of appearance.
  """
  global _TAGGER
  if _TAGGER is None:
    _TAGGER = mecab.Tagger('-Owakati')
  return _TAGGER.parse(text).strip().split()

def tokenize_list(text_list, chunksize=1000):
  """Tokenise every string in `text_list` in parallel, preserving input order.

  Args:
    text_list: A sized sequence of strings; each one is passed to `parse`.
    chunksize: Number of items handed to a worker per task. The default of
      `Pool.imap` is 1, which costs one inter-process round trip per sentence;
      a larger chunk amortises that overhead on long inputs.

  Returns:
    A list with one token list per input string, in the same order.
  """
  # Nothing to do: avoid starting worker processes for an empty batch.
  if len(text_list) == 0:
    return []
  with Pool() as pool:
    tokenised = pool.imap(parse, text_list, chunksize=chunksize)
    results = list(tqdm(tokenised, total=len(text_list)))
  return results

# Tokenise the corpus in consecutive slices of `batch_size` sentences and
# collect all token lists, in corpus order, in `w2v_train_data`.
batch_size = 1_000_000
w2v_train_data = []
for start in tqdm(range(0, len(sentences), batch_size)):
  w2v_train_data += tokenize_list(sentences[start:start + batch_size])

def save(word_list, filename):
  """Write tokenised sentences to `filename`, one sentence per line.

  Args:
    word_list: Iterable of token sequences; the tokens of each sequence are
      joined with single spaces.
    filename: Path of the UTF-8 text file to create or overwrite.
  """
  with open(filename, "w", encoding='utf-8') as out:
    out.writelines(" ".join(tokens) + "\n" for tokens in tqdm(word_list))

# Persist the tokenised corpus, one space-joined sentence per line. The path is
# relative, so the file lands in the working directory rather than BASE_PATH.
save(w2v_train_data, 'wiki_wakati.txt')


100%|██████████| 19088614/19088614 [00:03<00:00, 4864771.15it/s]


14590399


In [None]:
# Read the tokenised corpus back from disk through gensim's LineSentence.
w2v_train_data = word2vec.LineSentence('wiki_wakati.txt')

# Route INFO-level log records (gensim's training progress) to the cell output.
logging.basicConfig(
  format='%(asctime)s : %(levelname)s : %(message)s',
  level=logging.INFO,
)

# Train 300-dimensional vectors; options not listed here stay at gensim's defaults.
model = Word2Vec(
  w2v_train_data,
  vector_size=300,
  window=5,
  sample=1e-3,
  negative=5,
  hs=0,
)

# Export only the word vectors, in the binary word2vec format.
model.wv.save_word2vec_format('ja.bin', binary=True)


2024-11-23 15:26:11,338 : INFO : collecting all words and their counts
2024-11-23 15:26:11,339 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-11-23 15:26:11,392 : INFO : PROGRESS: at sentence #10000, processed 276905 words, keeping 20926 word types
2024-11-23 15:26:11,442 : INFO : PROGRESS: at sentence #20000, processed 540033 words, keeping 32105 word types
2024-11-23 15:26:11,486 : INFO : PROGRESS: at sentence #30000, processed 804395 words, keeping 39478 word types
2024-11-23 15:26:11,535 : INFO : PROGRESS: at sentence #40000, processed 1091090 words, keeping 47363 word types
2024-11-23 15:26:11,580 : INFO : PROGRESS: at sentence #50000, processed 1357790 words, keeping 53630 word types
2024-11-23 15:26:11,625 : INFO : PROGRESS: at sentence #60000, processed 1621118 words, keeping 59968 word types
2024-11-23 15:26:11,665 : INFO : PROGRESS: at sentence #70000, processed 1838036 words, keeping 63854 word types
2024-11-23 15:26:11,711 : INFO : PROGRESS: