# Initialisation

In [None]:
%pip install fugashi[unidic-lite]
%pip install anki

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting unidic-lite (from fugashi[unidic-lite])
  Downloading unidic-lite-1.0.8.tar.gz (47.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.4/47.4 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: unidic-lite
  Building wheel for unidic-lite (setup.py) ... [?25l[?25hdone
  Created wheel for unidic-lite: filename=unidic_lite-1.0.8-py3-none-any.whl size=47658818 sha256=1461cb4789b771ded4696c32d23e7bd6fa766db0fe8c1946604f78ee580c645e
  Stored in directory: /root/.cache/pip/wheels/89/e8/68/f9ac36b8cc6c8b3c96888cd57434abed96595d444f42243853
Successfully built unidic-lite
Installing collected packages: unidic-lite
Successfully installed unidic-lite-1.0.8
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import re
from fugashi import Tagger
from anki import collection
import zipfile
import json

In [56]:
def import_frequencies(filename,word_col,freq_col,separator,start_line,ascending):
  """Read a delimited frequency list and return a lemma -> frequency dict.

  Args:
    filename: path of a UTF-8 text file with one word per line.
    word_col: index of the column holding the word (after splitting on separator).
    freq_col: index of the column holding the numeric frequency.
    separator: column delimiter passed to str.split.
    start_line: 1-based number of the first line to read; earlier lines are skipped.
    ascending: if true, the accumulated values are re-assigned to the keys in
      reverse order (first key gets the last value, and so on).

  Returns:
    A dict mapping lemma to summed frequency, ordered by value, highest first.

  Raises:
    ValueError: if a frequency cell cannot be converted to float.
    IndexError: if a non-blank line has fewer columns than word_col/freq_col need.
  """
  unicode_ranges = [
    r'\u3005-\u3006',   # Kanji punctuation
    r'\u3040-\u309F',   # Hiragana
    r'\u30A0-\u30FA',   # Katakana
    r'\u30FC',          # Katakana prolonged sound mark (ー), needed for words such as コーヒー
    r'\u4E00-\u9FAF',   # Kanji
  ]
  # compiled once; named so that it does not shadow the builtin filter()
  invalid_chars = re.compile('[^' + ''.join(unicode_ranges) + ']')

  frequencies = {}
  tagger = Tagger()

  with open(filename, 'r', encoding="utf8") as file: # closed even if a line fails to parse
    for line_number, line in enumerate(file,1):
      if(line_number < start_line or not line.strip()): # skip header lines and blank lines
        continue

      columns = line.split(separator)
      # strip only the line terminator: slicing off the last character would
      # damage the word whenever the word column is not the last one
      word = columns[word_col].rstrip("\r\n")
      frequency = float(columns[freq_col])

      if(not word or invalid_chars.search(word) is not None): # filter out empty words and words with invalid characters
        continue

      tokens = tagger(word)
      if(not tokens): # nothing to lemmatise
        continue

      lemma = str(tokens[0].feature.lemma) # lemmatise word to standardise form
      hyphen = lemma.find("-")
      if(hyphen != -1):
        lemma = lemma[:hyphen] # drop the "-english" suffix so keys match the lemmas looked up by generate_recommendations
      frequencies[lemma] = frequencies.get(lemma, 0) + frequency # add value to key rather than overriding (in the case of multiple lemma occurrences)

  if(ascending): # reverse association of key to value if frequency list ascending
    reversed_values = list(reversed(list(frequencies.values()))) # built once instead of once per key
    frequencies = {key: reversed_values[i] for i, key in enumerate(frequencies)}
  return dict(sorted(frequencies.items(), key=lambda item: item[1], reverse=True)) # return sorted by value

In [62]:
def import_knowledge(filename,format,start_line=None,col=None,separator=None,deck=None,field=None):
  """Collect the set of lemmas found in a file of already-known vocabulary.

  Args:
    filename: path of the Anki collection (format "anki") or UTF-8 text file.
    format: "anki", "tabular" (one column of a delimited file) or "full"
      (every line in its entirety).
    start_line: 1-based number of the first line to read for the text formats;
      None means start at the first line.
    col: column index to read when format is "tabular".
    separator: column delimiter when format is "tabular".
    deck: Anki deck name to search when format is "anki".
    field: note field to read when format is "anki".

  Returns:
    A set with the lemma of every token the tagger produced.

  Raises:
    ValueError: if format is not one of the supported values.
  """
  knowledge = set()
  tagger = Tagger()

  def add_lemmas(text): # add every lemma of text to knowledge
    for word in tagger(text):
      lemma = word.feature.lemma
      if(isinstance(lemma, str) and "-" in lemma):
        lemma = lemma[:lemma.find("-")] # drop the "-english" suffix so entries match the lemmas compared in generate_recommendations
      knowledge.add(lemma) # a set ignores repeated additions

  if(format=="anki"):
    anki_collection = collection.Collection(filename) # separate name: col is the tabular column parameter
    try:
      note_ids = anki_collection.find_notes(f"deck:{deck}") # gets note ids of deck
      for note_id in note_ids:
        note = anki_collection.get_note(note_id) # get note of note id
        if field in note:
          add_lemmas(note[field])
    finally:
      anki_collection.close() # release the collection even if a note fails to load
  elif(format in ("tabular", "full")):
    first_line = 1 if start_line is None else start_line
    with open(filename, 'r', encoding="utf8") as file:
      for line_number, line in enumerate(file,1):
        if(line_number < first_line): # start_line is 1-based, as in import_frequencies
          continue
        if(format=="tabular"): # only focus on given column if tabular data
          text = line.split(separator)[col]
        else:
          text = line
        add_lemmas(text)
  else:
    raise ValueError(f"unsupported knowledge format: {format!r}")

  return knowledge

In [None]:
def import_content(filename=None, content_string=None):
  """Split text into sentences on newlines and the Japanese full stop "。".

  Args:
    filename: path of a UTF-8 text file to read, or None.
    content_string: text to use directly, or None. When both arguments are
      given the file is still read but content_string takes precedence.

  Returns:
    A list of sentence strings in source order. Empty strings produced by
    blank lines or a trailing "。" are kept.

  Raises:
    ValueError: if neither filename nor content_string is given.
  """
  if(filename is None and content_string is None):
    raise ValueError("import_content requires either filename or content_string")

  text = None
  if(filename is not None): # opens file if passed in
    with open(filename, 'r', encoding="utf8") as file: # closed as soon as it has been read
      text = file.read()
  if(content_string is not None): # reads string if passed in
    text = content_string

  return [sentence for line in text.split("\n") for sentence in line.split("。")]

In [None]:
def import_dictionary(zipname):
  """Load term entries from a zipped dictionary into a lookup dict.

  Every archive member whose name starts with "term" is parsed as a JSON
  array of entries; each entry is itself an array from which the term
  (index 0), reading (index 1) and definition (index 5) are taken.

  Args:
    zipname: path of the dictionary zip archive.

  Returns:
    A dict mapping each term to a list of {"content": definition,
    "reading": reading} records, without repeated records.
  """
  term_col = 0
  def_col = 5
  reading_col = 1

  dictionary = {}

  with zipfile.ZipFile(zipname, 'r') as z:
    for filename in z.namelist():
      if(not filename.startswith('term')): # only term json files are read; other members are never opened
        continue
      with z.open(filename) as f:
        json_data = json.loads(f.read())

      for entry in json_data:
        if(len(entry) <= def_col): # entry too short to carry a definition - presumably a metadata bank; skip it
          continue
        record = {"content":entry[def_col],"reading":entry[reading_col]}
        records = dictionary.setdefault(entry[term_col],[])
        if(record not in records): # prevent duplicate entries: compare whole records, not a bare definition against records
          records.append(record)

  return dictionary

In [None]:
def generate_recommendations(filter_katakana,pos_filter,frequencies,knowledge,content,dictionary):
  """Find unknown, defined words in the content and rank them by frequency.

  Args:
    filter_katakana: if true, skip words whose surface contains katakana.
    pos_filter: collection of top-level part-of-speech tags to skip.
    frequencies: dict mapping lemma to frequency; missing lemmas count as 0.
    knowledge: collection of lemmas that are already known and must be skipped.
    content: iterable of sentence strings.
    dictionary: dict mapping lemma to a list of definition records; words
      without an entry are skipped.

  Returns:
    A dict ordered by frequency, highest first, mapping each lemma to
    {"freq": frequency divided by the highest frequency (left as-is when that
    is 0), "sent": {sentence: [(start, end), ...]}, "pos": top-level
    part-of-speech tag, "def": definition records from dictionary}.
  """
  recommendations = {}
  tagger = Tagger()
  katakana_pattern = re.compile(r"[\u30A0-\u30FA]") # compiled once rather than per token

  for sentence in content:
    cursor = 0 # end of the previous token; tokens come in sentence order
    for word in tagger(sentence):
      surface = str(word)
      # locate every token, not only recommended ones, so that a surface is
      # never matched inside an earlier token of the sentence
      start = sentence.find(surface, cursor)
      if(start == -1):
        span = None # surface not present verbatim; never record a -1 index
      else:
        span = (start, start + len(surface))
        cursor = span[1]

      if(filter_katakana and katakana_pattern.search(surface) is not None): # filter out words with katakana
        continue

      pos = word.pos.split(",")[0] # filter out words whose part-of-speech tags are blacklisted
      if(pos in pos_filter):
        continue

      lemma = str(word.feature.lemma)
      if(lemma.find("-")!=-1):
        lemma = lemma[:lemma.find("-")] # unidic adds english translation after "-" to certain katakana lemmas- remove this
      if(lemma == "None" or lemma in knowledge):
        continue

      definition = dictionary.get(lemma,[])
      if(not definition):
        continue

      if(lemma not in recommendations): # create recommendation entry for word
        recommendations[lemma] = {"freq": frequencies.get(lemma,0), "sent":{}, "pos":pos, "def":definition}

      spans = recommendations[lemma]["sent"].setdefault(sentence, []) # create list of word start and end occurrence indices for sentence if nonexistent
      if(span is not None and span not in spans): # a sentence repeated in content must not add its spans twice
        spans.append(span)

  recommendations = dict(sorted(recommendations.items(), key=lambda item: item[1]["freq"], reverse=True)) # sort by frequency
  if(recommendations):
    max_frequency = next(iter(recommendations.values()))["freq"] # max frequency is first value
    if(max_frequency!=0):
      for recommendation in recommendations.values(): # normalise frequencies
        recommendation["freq"] = recommendation["freq"] / max_frequency

  return recommendations

In [48]:
def print_recommendations(recommendations, content, display):
  """Print each recommendation with the sections enabled in display.

  Args:
    recommendations: dict mapping lemma to a record with the keys "freq",
      "sent" ({sentence: [(start, end), ...]}), "pos" and "def" (list of
      {"content": [...], "reading": str}).
    content: not used by this function; kept for interface compatibility.
    display: dict of booleans with the keys "Frequency Ranking",
      "Part of Speech", "Definition" and "Source Sentences".
  """
  for r, (lemma, rec) in enumerate(recommendations.items(),1):
    print(str(r) + " \033[1m" + lemma+"\033[0m")

    if(display["Frequency Ranking"]):
      print("\033[1m"+"Frequency Ranking: "+"\033[0m"+str(format(rec["freq"],".3f")))

    if(display["Part of Speech"]):
      pos = rec["pos"]
      # a plain string must be printed as-is: joining it would put a comma between its characters
      pos_text = pos if isinstance(pos, str) else ",".join(pos)
      print("\033[1m"+"Part of Speech: "+"\033[0m"+ pos_text)

    if(display["Definition"]):
      print("\033[1m"+"Definition:"+"\033[0m")
      for d, definition in enumerate(rec["def"],1):
        if(definition["reading"]!=""):
          reading = "(" + definition["reading"] + ")  "
        else:
          reading = ""
        print(" " + str(d) + " " + reading + ", ".join(definition["content"]))

    if(display["Source Sentences"]):
      print("\033[1m"+"Source Sentences:"+"\033[0m")

      for s, sentence in enumerate(rec["sent"],1):
        sent_string = " " + str(s) + " "
        string_index = 0
        for start, end in sorted(set(rec["sent"][sentence])): # left-to-right, without repeats
          if(start < string_index): # not-found (-1) or overlapping span: highlighting it would garble the sentence
            continue
          sent_string += sentence[string_index:start] + "\033[1m\033[94m" + sentence[start:end] + "\033[0m"
          string_index = end
        sent_string += sentence[string_index:]

        print(sent_string)

    print("")

# Execution

Execution Instructions:

Upload the frequency, knowledge, content, and dictionary files, then pass each filename to its respective import function in the cell below.

Sample knowledge and content files are provided.

Run all cells.

In [66]:
# Output sections to print for every recommendation
display = {
    "Frequency Ranking":True,
    "Part of Speech":False,
    "Source Sentences":True,
    "Definition":True,
}

# Top-level part-of-speech tags to exclude: interjections, supplementary symbols, particles, symbols
pos_filter = ["感動詞","補助記号","助詞","記号"]
filter_katakana = False

# Load the four inputs
frequencies = import_frequencies(
    filename="sample_data/frequencies.txt",
    word_col=2,
    freq_col=1,
    separator=" ",
    start_line=5,
    ascending=False,
)
knowledge = import_knowledge(
    filename="sample_data/knowledge.txt",
    format="tabular",
    start_line=1,
    col=0,
    separator=",",
)
content = import_content(filename="content.txt")
dictionary = import_dictionary("jmdict_english.zip")

# Rank the unknown words of the content and print them
recommendations = generate_recommendations(
    filter_katakana=filter_katakana,
    pos_filter=pos_filter,
    frequencies=frequencies,
    knowledge=knowledge,
    content=content,
    dictionary=dictionary,
)

print_recommendations(recommendations, content, display)

1 [1m好き[0m
[1mFrequency Ranking: [0m1.000
[1mDefinition:[0m
 1 (すき)  liked, well-liked, favourite, favorite
 2 (すき)  in love (with), loved, romantically interested (in)
 3 (すき)  faddism, eccentricity
 4 (すき)  the way one likes, (as) it suits one
 5 (すき)  refined taste, elegant pursuits
 6 (ずき)  love of, affection for, enthusiast for, lover of, fan, -phile
 7 (ずき)  being attractive to, being liked by
[1mSource Sentences:[0m
 1 猫が[1m[94m好き[0mですが犬が[1m[94m好き[0mではない

2 [1m猫[0m
[1mFrequency Ranking: [0m0.311
[1mDefinition:[0m
 1 (ねこ)  cat (esp. the domestic cat, Felis catus)
 2 (ねこ)  shamisen
 3 (ねこ)  geisha
 4 (ねこ)  wheelbarrow
 5 (ねこ)  clay bed-warmer
 6 (ねこ)  bottom, submissive partner of a homosexual relationship
 7 (ねこま)  cat
[1mSource Sentences:[0m
 1 これは[1m[94m猫[0mです
 2 [1m[94m猫[0mが好きですが犬が好きではない

