In [1]:
!apt-get install mecab mecab-ipadic-utf8 libmecab-dev swig
!pip install mecab-python3==0.7
!pip install furigana
!pip install pykakasi

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libmecab2 mecab-ipadic mecab-jumandic mecab-jumandic-utf8 mecab-utils
  swig3.0
Suggested packages:
  swig-doc swig-examples swig3.0-examples swig3.0-doc
The following NEW packages will be installed:
  libmecab-dev libmecab2 mecab mecab-ipadic mecab-ipadic-utf8 mecab-jumandic
  mecab-jumandic-utf8 mecab-utils swig swig3.0
0 upgraded, 10 newly installed, 0 to remove and 39 not upgraded.
Need to get 30.1 MB of archives.
After this operation, 282 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libmecab2 amd64 0.996-5 [257 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libmecab-dev amd64 0.996-5 [308 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 mecab-utils amd64 0.996-5 [4,856 B]
Get:4 http://archive.ubuntu.com/ubuntu bionic/universe amd64 mecab-juma

In [2]:
import pandas as pd
from collections import Counter
import sys
import io
from furigana.furigana import print_html
from nltk import ngrams

# these csv files are passed into the TextDataFrame...
df = pd.read_csv("/content/Real Japanese Sentences - FROM_ONLINE_JP_TO_ENGLISH_DICTIONARY.csv")
one_words = pd.read_csv("/content/one_words.csv")

# Normalise the phrase list: strip punctuation, lower-case, de-duplicate.
# regex=True is spelled out because pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave punctuation in place.
phrases = pd.read_csv("/content/Real Japanese Sentences - phrases.csv")
phrases["no_punct"] = (
  phrases["phrases"]
  .str.replace(r"[^\w\s]", "", regex=True)
  .str.lower()
)
phrases = set(phrases["no_punct"])

print(phrases)

# adding in verb sentences
df2 = pd.read_csv("/content/Real Japanese Sentences - VERBS.csv")

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
df = pd.concat([df, df2], ignore_index=True)
len(df["japanese"])

{'i was thinking', 'ran over', 'dont know how', 'they were', 'look at how', 'how are you doing', 'it', 'tomorrow', 'look at', 'someone will', 'to use the', 'such a', 'would be', 'this is', 'before', 'buy a new', 'we are like', 'losing', 'what about this', 'afraid of', 'im still', 'go ahead', 'picture', 'always wanted', 'still coming', 'as long as', 'those are', 'do you have to', 'how many', 'hold this', 'being who', 'i got a', 'how could', 'go to', 'going to', 'did not', 'did you find', 'couldnt be', 'come in', 'what about', 'time to go', 'the greatest', 'we are here', 'you are a', 'do you have', 'giving', 'did you have', 'wear', 'who could', 'found', 'how could i', 'this is called', 'it was', 'want to watch', 'it seems', 'have to', 'maybe', 'was in the', 'used to be', 'see what', 'i would', 'i think im going to', 'everything into', 'somebody will', 'there could be', 'wait in the', 'not my', 'on my', 'there are', 'we have', 'did you', 'actually i', 'want', 'im not', 'its ok to', 'go ge

56466

In [3]:
class TextDataFrame:
  """
  Sentence table filtered by a word list, with n-gram and phrase helpers.

  After construction:
    self.df         -- DataFrame of the kept rows with three extra columns:
                       'no_punct' (english text, punctuation removed,
                       lower-cased), 'furigana' and 'id' (0..n-1).
    self.one_words  -- set of normalised words taken from the word list.
  """

  # Every character that is neither a word character nor whitespace.
  _PUNCT_PATTERN = r"[^\w\s]"

  def __init__(self, df, one_words):
    """
    df        -- DataFrame; its 'english' and 'japanese' columns are read.
    one_words -- DataFrame; its 'words' column is read.

    Neither argument is modified; the prepared copies live on the instance.
    """
    self.df = df
    self.one_words = one_words
    self.df_setup()

  def df_setup(self):
    """
    Normalise the word list and the sentences, then keep only useful rows.

    Steps: build the word set, derive 'no_punct' from 'english', keep rows
    where at least one word of the sentence is in the word set, replace the
    full-width question mark in 'japanese', drop duplicate 'japanese'
    sentences (first one wins), then add the 'furigana' and 'id' columns.
    """
    # regex=True is explicit: pandas >= 2.0 defaults to a literal match,
    # which would leave all punctuation in place.
    vocabulary = (
      self.one_words["words"]
      .dropna()
      .str.replace(self._PUNCT_PATTERN, "", regex=True)
      .str.lower()
    )
    self.one_words = set(vocabulary)
    known_words = self.one_words

    # Missing english text is treated as an empty sentence (never kept).
    no_punct = (
      self.df["english"]
      .fillna("")
      .str.replace(self._PUNCT_PATTERN, "", regex=True)
      .str.lower()
    )
    keep = no_punct.map(
      lambda sentence: any(word in known_words for word in sentence.split())
    ).astype(bool)

    # assign() builds new frames, so the caller's DataFrame stays untouched.
    # The mask is applied positionally, so any index on the input works.
    self.df = (
      self.df.assign(no_punct=no_punct)
      .loc[keep.to_numpy()]
      # replacing all ？ with 。 as question marks don't actually exist...
      .assign(japanese=lambda frame: frame["japanese"].str.replace("？", "。", regex=False))
      .drop_duplicates(subset="japanese", keep="first")
      .reset_index(drop=True)
    )
    self.add_furigana_column()

    self.df["id"] = range(len(self.df))

  def filtered_df(self, filter_word):
    """
    Return the rows whose 'no_punct' sentence contains filter_word as a
    whole whitespace-separated word. An empty result keeps all columns.
    """
    hits = self.df["no_punct"].map(
      lambda sentence: filter_word in sentence.lower().split()
    ).astype(bool)
    return self.df.loc[hits.to_numpy()]

  def top_n_grams(self, num_n_grams=100, gram_num=2, filter_word=None):
    """
    Return the num_n_grams most common n-grams as (tuple, count) pairs.

    gram_num    -- size of each n-gram.
    filter_word -- when given, only sentences containing that word count.

    N-grams are counted sentence by sentence, so no n-gram straddles the
    end of one sentence and the start of the next.
    """
    source = self.filtered_df(filter_word) if filter_word else self.df

    ngram_counts = Counter()
    for sentence in source["no_punct"]:
      ngram_counts.update(ngrams(sentence.split(), gram_num))

    return ngram_counts.most_common(num_n_grams)

  def sentences_containing(self, phrase, max_num_words=100):
    """
    Return rows whose 'no_punct' text matches phrase (case-insensitive)
    and has at most max_num_words words.

    phrase is handed to Series.str.contains unchanged, so it is matched as
    a substring / regular expression, not as whole words.
    """
    matches = self.df.loc[self.df["no_punct"].str.contains(phrase, case=False)]
    word_counts = matches["no_punct"].str.split().str.len()

    return matches.loc[(word_counts <= max_num_words).to_numpy()]

  def sentence_to_furigana(self, text):
    """
    note this only converts a single sentence into furigana

    print_html writes to stdout, so stdout is captured while it runs and
    the captured ruby markup is rewritten as kanji(reading) text.
    """
    captured = io.StringIO()
    previous_stdout = sys.stdout
    sys.stdout = captured
    try:
      print_html(text)
    finally:
      # Always restore stdout; otherwise one failing sentence would leave
      # every later print in the session redirected.
      sys.stdout = previous_stdout

    output = captured.getvalue()
    output = output.replace("</rb><rt>", "(").replace("</rt></ruby>", ")").replace("<ruby><rb>", "").replace("\r", "").replace("\n", "")

    return output

  def add_furigana_column(self):
    """
    Add a 'furigana' column built from the 'japanese' column.

    Best effort: a sentence that cannot be converted gets None instead of
    aborting the whole column.
    """
    readings = []
    for sentence in self.df["japanese"]:
      try:
        readings.append(self.sentence_to_furigana(sentence))
      except Exception:
        readings.append(None)

    self.df["furigana"] = readings

In [4]:
# Build the filtered sentence table and preview its first five rows.
text_df = TextDataFrame(df=df, one_words=one_words)
text_df.df.head()

Unnamed: 0,japanese,kana,english,no_punct,furigana,id
0,何するの。,なに する の？,What are you doing?,what are you doing,何(なに)するの。,0
1,勉強する。,べんきょう する。,I will study.,i will study,勉強(べんきょう)する。,1
2,頭痛がする。,ずつう が する。,I have a headache.,i have a headache,頭痛(ずつう)がする。,2
3,約束するよ。,やくそく する よ。,I give you my word.,i give you my word,約束(やくそく)するよ。,3
4,心配するな。,しんぱい する な。,Don't worry about it!,dont worry about it,心配(しんぱい)するな。,4


In [5]:
# Five most common trigrams among the sentences that contain the word "what".
text_df.top_n_grams(
  filter_word="what",
  gram_num=3,
  num_n_grams=5,
)

[(('what', 'do', 'you'), 87),
 (('what', 'are', 'you'), 50),
 (('what', 'did', 'you'), 30),
 (('do', 'you', 'want'), 26),
 (('do', 'you', 'think'), 24)]

In [6]:
# The 100 most common bigrams, each stored as one space-separated string.
top_100_bigram = {
  " ".join(gram)
  for gram, _count in text_df.top_n_grams(num_n_grams=100, gram_num=2)
}

In [7]:
# Ten most common trigrams over every sentence (no word filter).
text_df.top_n_grams(
  gram_num=3,
  num_n_grams=10,
)

[(('i', 'want', 'to'), 188),
 (('a', 'lot', 'of'), 186),
 (('do', 'you', 'have'), 168),
 (('i', 'have', 'a'), 152),
 (('he', 'is', 'a'), 137),
 (('id', 'like', 'to'), 95),
 (('what', 'do', 'you'), 87),
 (('tom', 'is', 'a'), 85),
 (('you', 'have', 'a'), 84),
 (('there', 'is', 'a'), 84)]

In [8]:
# Page through the short "would" sentences, 60 rows per page; i is the page number.
i = 0
text_df.sentences_containing("would", max_num_words=6).iloc[i * 60:(i + 1) * 60]



Unnamed: 0,japanese,kana,english,no_punct,furigana,id
158,彼に会いたいものだ。,かれ に あいたい もの だ。,I would like to meet him.,i would like to meet him,彼(かれ)に会(あ)いたいものだ。,158
373,２時でいい。,２じ で いい？,Would two o'clock be all right?,would two oclock be all right,２時(じ)でいい。,373
923,私に教えてくれる。,わたし に おしえてくれる？,Would you teach me?,would you teach me,私(わたし)に教(おし)えてくれる。,923
3255,一緒に遊ばない。,いっしょ に あそばない？,Would you play with me?,would you play with me,一緒(いっしょ)に遊(あそ)ばない。,3255
3264,一緒に遊びませんか。,いっしょ に あそびません か。,Would you play with me?,would you play with me,一緒(いっしょ)に遊(あそ)びませんか。,3264
3795,トムは煙草をやめると言った。,トム は たばこ を やめる と いった。,Tom said he would stop smoking.,tom said he would stop smoking,トムは煙草(たばこ)をやめると言(い)った。,3795
4019,どちらへ行きたいですか。,どちら へ いきたいです か。,Where would you like to go?,where would you like to go,どちらへ行(い)きたいですか。,4019
4150,理解していただきたい。,りかい していただきたい。,I would like you to understand.,i would like you to understand,理解(りかい)していただきたい。,4150
4216,彼らは二度と会うことはなかった。,かれら は にどと あう こと は なかっ た。,They would never meet again.,they would never meet again,彼(かれ)らは二(に)度(ど)と会(あ)うことはなかった。,4216
4423,お前だったら何て言う。,おまえ だったら なに て いう？,What would you say then?,what would you say then,お前(まえ)だったら何(なに)て言(い)う。,4423


In [9]:
# Hand-picked numbers, de-duplicated and sorted ascending.
# NOTE(review): presumably row positions in text_df.df (see the export below) -- confirm.
s = sorted({
  int(token)
  for token in (
    "0 20 880 891 1047 1056 1332 1700 1703 1829 1830 2797 2807 3756 6471 7474 7818 8008 8569 8711 11934 15730 23628 24758"
    " 406 2267 3418 3682 3938 4498 4674 5753 6167 6174 6319 7679 8567 8667 8676 13405 13956 14229 14883 15174 15797 16647 20369 22185 26912 26913 25342 22613"
    " 356 1385 3186 19923 19925 19927 20037 25528 191 53 292 415 781 3066 1693 1695 6722 7011 10493 11921 14230 17984 18376 5128 365 395 559 1528 1840 1987 2221"
    " 5375 7590 7792 8671 9766 11779 11783 12182 12417 13121 12695 14191 16713 18762 24401 5370 5373 5419 5439 5503 5824 5939 5940 6400 6401 6461 6469 6874"
    " 6898 7246 7404 7784 8097 8103 8134 8573 8952 9499 9553 10711 11781 11787 259 262 269 272 9305 14213 17135 19293 26512 260 271 18764"
    " 361 3100 5411  5414 5860 7938 8548 14537 18911 20817 27285 3327 14922 124 2076 2081 123 4447 3926 3923 7468 11242 11240 16229 16860 21898 22338 190"
    " 84 461 2599 2113 4679 6132 14386 730 3264 6045 8033 11986 11536 13908 15039 16672 18910 18911 18913 18915 18917 18921 18923 19607 19863 19865 19867"
    " 20861 22784 23525 24238 24239 5473 5712 5713 6873 8093 8394 18766 13122 11085"
  ).split()
})

# text_df.df.iloc[s].to_csv("out.csv", index=False)

In [10]:

# a

In [11]:
# Lift pandas' display limits so frames are shown with every row and column.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [12]:
# text_df.sentences_containing("this is the", max_num_words=6)


In [14]:
# Rows whose Japanese sentence contains the kanji 物.
temp = text_df.df.loc[text_df.df["japanese"].str.contains("物")]
temp



Unnamed: 0,japanese,kana,english,no_punct,furigana,id
207,この品物は安い。,この しなもの は やすい。,This article is cheap.,this article is cheap,この品物(しなもの)は安(やす)い。,207
358,それは、わたしの好物です。,それ は、わたし の こうぶつ です。,It's my favorite food.,its my favorite food,それは、わたしの好物(こうぶつ)です。,358
493,昔の物語によると猫が九性有りと言われています。,むかし の ものがたり に よると ねこ が きゅうせい あり と いわれています。,"According to folktales, cats have nine lives.",according to folktales cats have nine lives,昔(むかし)の物語(ものがたり)によると猫(ねこ)が九(きゅう)性(せい)有(あ)りと言(い...,493
582,甘い物にはつい手が出てしまう。,あまい もの には つい て が でてしまう。,I can't resist sweet things.,i cant resist sweet things,甘(あま)い物(もの)にはつい手(て)が出(で)てしまう。,582
650,人は交わる友によってその人物がわかる。,ひと は まじわる とも によって その じんぶつ が わかる。,A man is known by the company he keeps.,a man is known by the company he keeps,人(ひと)は交(まじ)わる友(とも)によってその人物(じんぶつ)がわかる。,650
663,人間は言葉を持つ唯一の動物である。,にんげん は ことば を もつ ゆいいつ の どうぶつ である。,Man is the only animal that possesses language.,man is the only animal that possesses language,人間(にんげん)は言葉(ことば)を持(も)つ唯一(ゆいいつ)の動物(どうぶつ)である。,663
690,物事をじっくり考える慎重派です。,ものごと を じっくり かんがえる しんちょうは です。,I'm the type who likes to think things over ve...,im the type who likes to think things over ver...,物事(ものごと)をじっくり考(かんが)える慎重(しんちょう)派(は)です。,690
827,食物は生きるうえでなくてはならない。,しょくもつ は いきる うえ でなく てはならない。,Food is essential to life.,food is essential to life,食物(しょくもつ)は生(い)きるうえでなくてはならない。,827
914,博物館は午前九時から開いている。,はくぶつかん は ごぜん きゅうじ から ひらいている。,The museum is open from 9 a.m.,the museum is open from 9 am,博物館(はくぶつかん)は午前(ごぜん)九(きゅう)時(じ)から開(ひら)いている。,914
928,洗濯物干してくれる。,せんたく ものほし てくれる？,Could you hang up the laundry?,could you hang up the laundry,洗濯(せんたく)物(も)干(の)してくれる。,928


In [13]:
"""
話す

3160
3161
3163
私は日本語を話さない。
3165

"""

"""
聞く

俺に聞くなよ。
"""

"""
読む

1809
2753
2754
2761
2762

2784
2791
6656
21635

"""

"""
食べる

70
1035
1162
2473
2474
2483
11677
13117
13120
15174
20037
25528
"""

"""
行く

68
1694
16078
27383
27387
"""

'\n行く\n\n68\n1694\n16078\n27383\n27387\n'