# Vocab generator

In [2]:
import scipy as sc, seaborn as sns, pandas as pd

from collections import Counter
import os
import subprocess as sp
from multiprocessing.dummy import Pool as ThreadPool
from multiprocessing import Pool

from jNlp.jTokenize import jTokenize
import MeCab as mc

import json

## Count words

In [53]:
def is_useful(x):
    #for w in '名詞 動詞 副詞'.split():
    for w in '助詞 記号 助動詞 助詞 接続詞 感動詞 連体詞 EOS'.split():
        if w in x:
            return False
    return True

In [54]:
c = Counter()
fn_read = []

In [56]:
def count_words_in_files(filenames, fn_read=None):
    if not fn_read:
        fn_read = []
    c = Counter()
    for fn in filenames:
        with open(fn) as f:
            for line in f:
                c.update(
                    x.split(',')[-3]
                    for x in mc.Tagger().parse(line).split('\n') 
                    if is_useful(x) and len(x)>3
                )
            print(f'Finished reading {fn}, most common word is {c.most_common(1)}')
            fn_read.append(fn)
    return c
            

In [None]:
c = count_words_in_files([x for x in os.listdir('.') if r'.txt' in x and x not in fn_read])

In [62]:
c.most_common()

[('*', 930963),
 ('する', 5311),
 ('ん', 3556),
 ('俺', 2629),
 ('通信', 2601),
 ('の', 2443),
 ('いる', 2404),
 ('てる', 1924),
 ('なる', 1885),
 ('こと', 1868),
 ('たち', 1575),
 ('ある', 1502),
 ('私', 1468),
 ('団', 1425),
 ('さん', 1353),
 ('ら', 1329),
 ('ない', 1325),
 ('お前', 1241),
 ('れる', 1226),
 ('それ', 1214),
 ('いい', 1208),
 ('やる', 1171),
 ('何', 1108),
 ('今', 973),
 ('華', 925),
 ('分かる', 919),
 ('昭弘', 913),
 ('くれる', 892),
 ('これ', 890),
 ('鉄', 861),
 ('くる', 861),
 ('人', 860),
 ('言う', 802),
 ('お', 719),
 ('ここ', 715),
 ('員', 689),
 ('く', 674),
 ('できる', 619),
 ('名瀬', 617),
 ('行く', 601),
 ('せる', 592),
 ('一', 576),
 ('なん', 564),
 ('三日月', 559),
 ('もう', 559),
 ('火星', 546),
 ('来る', 544),
 ('雪', 520),
 ('音', 519),
 ('丞', 518),
 ('地球', 515),
 ('ふる', 501),
 ('ため', 495),
 ('思う', 483),
 ('三', 483),
 ('手', 482),
 ('団長', 482),
 ('もの', 450),
 ('様', 443),
 ('あ', 443),
 ('あんた', 437),
 ('え', 424),
 ('いく', 420),
 ('見る', 413),
 ('られる', 413),
 ('話', 400),
 ('部下', 400),
 ('仕事', 387),
 ('あいつ', 377),
 ('出る', 371),
 ('力', 369),


with open('word-freq.json', 'w') as f:
    json.dump(c,f)

In [45]:
with open('word-freq.json', 'r') as f:
    c = json.load(f)
c = Counter(c)

In [46]:
c.most_common()[-100]

('火除', 1)

## Prune less common words and short words

In [135]:
d = [x for x in c.most_common(3000) if x[1] > 0 and len(x[0])>1]

In [136]:
len(d)

2632

In [137]:
d

[('する', 5311),
 ('通信', 2601),
 ('いる', 2404),
 ('てる', 1924),
 ('なる', 1885),
 ('こと', 1868),
 ('たち', 1575),
 ('ある', 1502),
 ('さん', 1353),
 ('ない', 1325),
 ('お前', 1241),
 ('れる', 1226),
 ('それ', 1214),
 ('いい', 1208),
 ('やる', 1171),
 ('分かる', 919),
 ('昭弘', 913),
 ('くれる', 892),
 ('これ', 890),
 ('くる', 861),
 ('言う', 802),
 ('ここ', 715),
 ('できる', 619),
 ('名瀬', 617),
 ('行く', 601),
 ('せる', 592),
 ('なん', 564),
 ('三日月', 559),
 ('もう', 559),
 ('火星', 546),
 ('来る', 544),
 ('地球', 515),
 ('ふる', 501),
 ('ため', 495),
 ('思う', 483),
 ('団長', 482),
 ('もの', 450),
 ('あんた', 437),
 ('いく', 420),
 ('見る', 413),
 ('られる', 413),
 ('部下', 400),
 ('仕事', 387),
 ('あいつ', 377),
 ('出る', 371),
 ('日月', 362),
 ('死ぬ', 352),
 ('終わる', 347),
 ('待つ', 346),
 ('よい', 336),
 ('あれ', 333),
 ('こいつ', 316),
 ('こっち', 314),
 ('蒔苗', 314),
 ('あなた', 307),
 ('生きる', 307),
 ('みんな', 306),
 ('聞く', 304),
 ('もらう', 296),
 ('くださる', 293),
 ('知る', 290),
 ('とき', 285),
 ('いう', 282),
 ('あと', 280),
 ('石動', 277),
 ('ちまう', 273),
 ('自分', 261),
 ('戻る', 255),
 ('そっ', 252),
 (

## Get translations and prune too long/missing translations

In [138]:
qa = []
qq = set()

In [139]:
def query_dict(word):
    return (word, sp.run(f'myougiden --human {word[0]}'.split(), stdout=sp.PIPE, encoding='utf8').stdout)

In [140]:
n=0
m=0
k=0
for word in d:
    if word[0] in qq or len(word[0])==1: continue
    answer = sp.run(f'myougiden --human {word[0]}'.split(), stdout=sp.PIPE, encoding='utf8').stdout
    if answer and len(answer) < 1000:
        qa.append((word[0], answer))
        qq.add(word[0])
    elif answer: 
        k += 1
        print(f'{word[0]} has too long definition')
    else: 
        m += 1
        print(f'{word[0]} has no definition')
    n += 1
    if n%100 == 0: print(f'{n} words processed. {k} too long definitions. {m} no definitions.')

する has too long definition
こと has too long definition
さん has too long definition
やる has too long definition
昭弘 has no definition
くる has too long definition
名瀬 has no definition
出る has too long definition
蒔苗 has no definition
石動 has no definition
そっ has too long definition
ミカ has too long definition
もん has too long definition
100 words processed. 9 too long definitions. 4 no definitions.
シノ has too long definition
おう has too long definition
昌弘 has no definition
とく has too long definition
つく has too long definition
てめぇ has no definition
200 words processed. 13 too long definitions. 6 no definitions.
アン has too long definition
まっ has too long definition
あげる has too long definition
キン has too long definition
かける has too long definition
やれる has no definition
死ねる has no definition
300 words processed. 18 too long definitions. 8 no definitions.
せい has too long definition
上がる has too long definition
でる has too long definition
出せる has no definition
よう has too long definition
上げる has too long de

In [141]:
len(qa)

2509

## Export to Anki importable format

In [142]:
pd.DataFrame(qa).to_csv('anki/vocab.txt', index=None, header=None)

## Misc stats

In [14]:
qa2 = {k:v for (k,v) in qa}

In [15]:
qa_long = [x[0] for x in sorted(qa, key=lambda x: -len(x[1]))[:[x[0] for x in sorted(qa, key=lambda x: -len(x[1]))].index('ラオ')]]

ValueError: 'ラオ' is not in list

In [None]:
qa2 = [(k,v) for (k,v) in qa2.items() if k not in qa_long]

In [None]:
len(qa2)

In [None]:
qa[::-1]

In [143]:
qaa = sc.array([len(x[1]) for x in qa])

In [144]:
qaa

array([ 99, 715, 229, ...,  58,  38, 603])

In [145]:
qaa.mean()

144.31367078517337

In [146]:
sc.percentile(qaa, 99)

766.3600000000006

In [27]:
sorted(qa, key=lambda x: -len(x[1]))

[('エリ',
  'インディアンジュエリー、インディアン・ジュエリー\n1. [n] Indian jewellery (jewelry)\n\n※ エリート\n1. [n,adj-no] elite\n\n※ エリア、エリヤ\n1. [n] area\n\nエリアマーケティング、エリア・マーケティング\n1. [n] area marketing\n\nエリキシル、エリクシア\n1. [n] elixir\n\nエリシウム\n1. [n] Elysium\n\nエリスロポエチン、エリスロポイエチン\n1. [n] erythropoietin\n\nエリスロマイシン\n1. [n] erythromycin\n\n※ サービスエリア、サービス・エリア\n1. [n] service area; toll road rest stop; rest area\n\n※ ジュエリー、ジューリー、ジュウリー\n1. [n] jewelry; jewellery\n\nドゥエリング\n1. [n] dwelling\n\nドエリング\n1. [n] dwelling\n\nパーキングエリア、パーキング・エリア\n1. [n] parking area; parking lot\n2. rest stop; service area\n\nパワーエリート、パワー・エリート\n1. [n] power elite\n\nブランケットエリア、ブランケット・エリア\n1. [n] blanket area\n\nマニエリスム\n1. [n] Mannerism (art)\n\nローカルエリアネットワーク、ローカル・エリア・ネットワーク\n1. [n;comp] local area network; LAN\n\n共通エリア（きょうつうエリア）\n1. [n] common area\n\n襟巻蜥蜴；襟巻き蜥蜴（えりまきとかげ、＊エリマキトカゲ）\n1. [n;uk] frilled lizard (Chlamydosaurus kingii); frill-necked lizard\n\nエリート意識（エリートいしき）\n1. [n] elitism\n\nエリキシル剤（エリキシルざい）\n1. [n] elixir\n\nエリスリトール\n1. [n] erythrito