In [11]:
import glob
import csv
import numpy as np
from cihai.core import Cihai
import string

# Hand-written stroke counts for characters handled without a Unihan lookup:
# hiragana, katakana, the iteration mark "々" and the prolonged sound mark "ー".
# Conventions visible in the table:
#   * small kana carry the same count as their full-size form;
#   * dakuten forms are listed as base + 2, handakuten forms as base + 1.
hiraganaKatakanaToStrokes={
    # --- Hiragana ---
    "ぁ":3,
    "あ":3,
    "ぃ":2,
    "い":2,
    "ぅ":2,
    "う":2,
    "ぇ":2,
    "え":2,
    "ぉ":3,
    "お":3,
    "か":3,
    "が":5,
    "き":3,
    "ぎ":5,
    "く": 1,
    "ぐ":3,
    "け":3,
    "げ":5,
    "こ":2,
    "ご":4,
    "さ":2,
    "ざ":4,
    "し":1,
    "じ":3,
    "す":2,
    "ず":4,
    "せ":3,
    "ぜ":5,
    "そ":1,
    "ぞ":3,
    "た":4,
    "だ":6,
    "ち":2,
    "ぢ":4,
    "っ":1,
    "つ":1,
    "づ":3,
    "て":1,
    "で":3,
    "と":2,
    "ど":4,
    "な":4,
    "に":3,
    "ぬ":2,
    "ね":2,
    "の":1,
    "は":3,
    "ば":5,
    "ぱ":4,
    "ひ":1,
    "び":3,
    "ぴ":2,
    "ふ":4,
    "ぶ":6,
    "ぷ":5,
    "へ":1,
    "べ":3,
    "ぺ":2,
    "ほ":4,
    "ぼ":6,
    "ぽ":5,
    "ま":3,
    "み":1,  # NOTE(review): み is commonly counted as 2 strokes — confirm this value
    "む":3,
    "め":2,
    "も":3,
    "ゃ":3,
    "や":3,
    "ゅ":2,
    "ゆ":2,
    "ょ":2,
    "よ":2,
    "ら":2,
    "り":2,
    "る":1,
    "れ":2,
    "ろ":1,
    "ゎ":2,
    "わ":2,
    "ゐ":1,
    "ゑ":1,
    "を":3,
    "ん":1,
    "ゔ":4,
    "ゕ":3,
    "ゖ":3,
    # --- Katakana ---
    "ァ":2,
    "ア":2,
    "ィ":2,
    "イ":2,
    "ゥ":3,
    "ウ":3,
    "ェ":3,
    "エ":3,
    "ォ":3,
    "オ":3,
    "カ":2,
    "ガ":4,
    "キ":3,
    "ギ":5,
    "ク":2,
    "グ":4,
    "ケ":3,
    "ゲ":5,
    "コ":2,
    "ゴ":4,
    "サ":3,
    "ザ":5,
    "シ":3,
    "ジ":5,
    "ス":2,
    "ズ":4,
    "セ":2,
    "ゼ":4,
    "ソ":2,
    "ゾ":4,
    "タ":3,
    "ダ":5,
    "チ":3,
    "ヂ":5,
    "ッ":3,
    "ツ":3,
    "ヅ":5,
    "テ":3,
    "デ":5,
    "ト":2,
    "ド":4,
    "ナ":2,
    "ニ":2,
    "ヌ":2,
    "ネ":4,
    "ノ":1,
    "ハ":2,
    "バ":4,
    "パ":3,
    "ヒ":2,
    "ビ":4,
    "ピ":3,
    "フ":1,
    "ブ":3,
    "プ":2,
    "ヘ":1,
    "ベ":3,
    "ペ":2,
    "ホ":4,
    "ボ":6,
    "ポ":5,
    "マ":2,
    "ミ":3,
    "ム":2,
    "メ":2,
    "モ":3,
    "ャ":2,
    "ヤ":2,
    "ュ":2,
    "ユ":2,
    "ョ":3,
    "ヨ":3,
    "ラ":2,
    "リ":2,
    "ル":2,
    "レ":1,
    "ロ":3,
    "ヮ":2,
    "ワ":2,
    "ヲ":3,
    "ン":2,
    "ヴ":5,
    "ヵ":2,
    "ヶ":3,
    "ヷ":4,
    "ヺ":5,
    # --- Iteration mark and prolonged sound mark ---
    "々":3,
    "ー":1
    # Disabled entries: these symbols are not in the table, so words containing
    # them go through the Unihan lookup branch of the stroke-count loop instead.
    #"＝":2,
    #"〜":1,
    #"℃":2,
    #"ⅲ":6

}
    
    
# Cihai handle used for the Unihan lookups (c.unihan.lookup_char) in the
# stroke-count export further down in this cell.
c = Cihai()
if not c.unihan.is_bootstrapped:  # download and install Unihan to db
    c.unihan.bootstrap()
    
# Exclusion list for the romaji export in this cell: a word containing any of
# these characters is skipped. Only membership is tested, so the repeated
# characters in the string have no extra effect.
transTable="＝〜℃ⅲ＆１２２１６／α•／"

        
# Export jpn_pud_strokes.csv: for every word of jpn_pud.csv, write the word
# (twice, filling the unnamed first column and "word"), its frequency (third
# input column) and the summed stroke count of its characters.
# Characters listed in hiraganaKatakanaToStrokes use that table; all others are
# looked up in Unihan through cihai. A word is dropped entirely as soon as one
# of its characters has no usable stroke count.
with open('./../../../data/non_filtered/corpora/pud/jpn_pud.csv', newline='',encoding="utf-8") as csvfile:
    with open("./../../../data/non_filtered/corpora/pud/jpn_pud_strokes.csv", 'w', encoding='UTF8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["","word","frequency","n_characters"])

        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(spamreader)  # skip the header row of the input file
        for row in spamreader:
            word=row[0]
            strokeSum=0
            bad=False
            for char in word:
                if char in hiraganaKatakanaToStrokes:
                    strokeSum+=hiraganaKatakanaToStrokes[char]
                    continue

                glyph = c.unihan.lookup_char(char).first()
                rawStrokes = glyph.kTotalStrokes if glyph is not None else None

                # Unihan's kTotalStrokes may hold several space-separated counts
                # (e.g. "8 9"); int() on the whole field would raise ValueError,
                # and a missing field would raise TypeError. Use the first count
                # and treat "no count" like "character not found".
                # NOTE(review): the first value is the one Unihan documents as
                # preferred for zh-Hans; confirm it is the right one for Japanese.
                if rawStrokes is None:
                    strokeCounts = []
                elif isinstance(rawStrokes, str):
                    strokeCounts = rawStrokes.split()
                else:
                    strokeCounts = [rawStrokes]

                if not strokeCounts:
                    bad=True
                    break
                strokeSum+=int(strokeCounts[0])
            if not bad:
                writer.writerow([word,word,row[2],strokeSum])
    
import cutlet

# Romanizer built with the 'kunrei' system; use_foreign_spelling is turned off.
nkatu = cutlet.Cutlet('kunrei')
nkatu.use_foreign_spelling = False

# Export jpn_pud_romaji.csv: for every word of jpn_pud.csv that contains none of
# the characters in transTable and whose romanization contains no "?", write the
# word (twice), its frequency, the length of the romanized form and the
# romanized form itself.
with open('./../../../data/non_filtered/corpora/pud/jpn_pud.csv', newline='',encoding="utf-8") as source, \
        open("./../../../data/non_filtered/corpora/pud/jpn_pud_romaji.csv", 'w', encoding='UTF8', newline='') as target:
    out = csv.writer(target)
    out.writerow(["","word","frequency","n_characters","romanized_form"])

    records = csv.reader(source, delimiter=',', quotechar='"')
    next(records)  # skip the header row of the input file
    for record in records:
        token = record[0]

        # Skip words that contain any excluded symbol.
        if any(symbol in token for symbol in transTable):
            continue

        romanized = nkatu.romaji(token).strip()
        # Skip words whose romanized form contains a "?".
        if "?" in romanized:
            continue

        out.writerow([token, token, record[2], len(romanized), romanized])
    

    


In [12]:
# If the Chinese version is desired, this cell works as-is to produce the Chinese CSVs.
from cihai.core import Cihai
import csv

# Cihai handle used for the Unihan lookups (c.unihan.lookup_char) in the two
# exports of this cell.
c = Cihai()
if not c.unihan.is_bootstrapped:  # download and install Unihan to db
    c.unihan.bootstrap()
    

# NOTE(review): transTable is not read by the code visible in this cell;
# presumably kept for parity with the Japanese cell — confirm before removing.
transTable="＝〜℃ⅲ＆１２２１６／α•／"

        
# Export zho_pud_strokes.csv: for every word of zho_pud.csv, write the word
# (twice, filling the unnamed first column and "word"), its frequency (third
# input column) and the summed Unihan stroke count of its characters.
# A word is dropped entirely as soon as one of its characters is unknown to
# Unihan or has no usable stroke count.
with open('./../../../data/non_filtered/corpora/pud/zho_pud.csv', newline='',encoding="utf-8") as csvfile:
    with open("./../../../data/non_filtered/corpora/pud/zho_pud_strokes.csv", 'w', encoding='UTF8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["","word","frequency","n_characters"])

        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(spamreader)  # skip the header row of the input file
        for row in spamreader:
            word=row[0]
            strokeSum=0
            bad=False
            for char in word:
                glyph = c.unihan.lookup_char(char).first()
                rawStrokes = glyph.kTotalStrokes if glyph is not None else None

                # kTotalStrokes holds one or more space-separated counts
                # (e.g. "12" or "8 9"). Indexing a string with [0] yields only
                # its first *character*, so "12" would be counted as 1 stroke;
                # split on whitespace and take the first *value* instead.
                # A value that is already a sequence is used as-is.
                # NOTE(review): the first value is the one Unihan documents as
                # preferred for zh-Hans, the second for zh-Hant; confirm which
                # one matches this corpus.
                if rawStrokes is None:
                    strokeCounts = []
                elif isinstance(rawStrokes, str):
                    strokeCounts = rawStrokes.split()
                else:
                    strokeCounts = rawStrokes

                if not strokeCounts:
                    bad=True
                    break
                strokeSum+=int(strokeCounts[0])
            if not bad:
                writer.writerow([word,word,row[2],strokeSum])
                
    
# Export zho_pud_pinyin.csv: for every word of zho_pud.csv, write the word
# (twice), its frequency (third input column), the length of its pinyin
# romanization and the romanization itself, built by concatenating the
# kMandarin reading of each character.
# A word is dropped entirely as soon as one of its characters is unknown to
# Unihan or has no Mandarin reading.
with open('./../../../data/non_filtered/corpora/pud/zho_pud.csv', newline='',encoding="utf-8") as csvfile:
    with open("./../../../data/non_filtered/corpora/pud/zho_pud_pinyin.csv", 'w', encoding='UTF8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["","word","frequency","n_characters","romanized_form"])

        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(spamreader)  # skip the header row of the input file
        for row in spamreader:
            word=row[0]
            strokeSum=0  # despite the name, accumulates the romanized length
            bad=False
            romWord=""
            for char in word:
                glyph = c.unihan.lookup_char(char).first()
                rawReading = glyph.kMandarin if glyph is not None else None

                # kMandarin can be missing (len(None) would raise TypeError) or
                # hold two space-separated readings; appending the raw field
                # would then put a space and a second reading into the
                # romanized form and inflate its length. Use the first reading.
                # NOTE(review): the first value is the one Unihan documents as
                # preferred for zh-Hans, the second for zh-Hant; confirm which
                # one matches this corpus.
                readings = rawReading.split() if rawReading else []
                if not readings:
                    bad=True
                    break
                strokeSum+=len(readings[0])
                romWord+=readings[0]
            if not bad:
                writer.writerow([word,word,row[2],strokeSum,romWord])
