In [64]:
import pandas as pd
import numpy as np
from janome.tokenizer import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [82]:
# 元の文章　青空文庫より　吾輩は猫であるの冒頭を抜粋
sentence = "\
吾輩は猫である。名前はまだ無い。\
どこで生れたかとんと見当がつかぬ。\
何でも薄暗いじめじめした所でニャーニャー泣いていた事だけは記憶している。\
吾輩はここで始めて人間というものを見た。\
しかもあとで聞くとそれは書生という人間中で一番獰悪な種族であったそうだ。\
この書生というのは時々我々を捕つかまえて煮にて食うという話である。\
"

In [83]:
target = sentence.split("。")

In [None]:
tokenizer = Tokenizer()

In [84]:
def get_token(text):
    t = Tokenizer()
    tokens = t.tokenize(text)
    word = ""
    for token in tokens:
        part_of_speech = token.part_of_speech.split(",")[0]
        if part_of_speech == "名詞":
            word +=token.surface + " "
        if part_of_speech == "動詞":
            word +=token.base_form+ " "
        if part_of_speech == "形容詞":
            word +=token.base_form+ " "
        if part_of_speech == "形容動詞":
            word +=token.base_form+ " "
    return word

In [85]:
corpus=[]
for item in target:
    token=get_token(item)
    corpus.append(token)

In [89]:
print(corpus)

['吾輩 猫 ', '名前 無い ', 'どこ 生れる 見当 つく ', '何 薄暗い する 所 ニャーニャー 泣く いた事 記憶 する いる ', '吾輩 ここ 始める 人間 もの 見る ', 'あと 聞く それ 書生 人間 中 一番 獰悪 種族 そう ', '書生 の 我々 捕る つかまえる 煮る 食う 話 ', '']


In [87]:
# TF-IDFのベクトル処理
vectorizer = TfidfVectorizer(use_idf=True)
tfidf = vectorizer.fit_transform(corpus)
 
print(tfidf.toarray())

[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         1.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.70710678 0.         0.         0.
  0.         0.         0.         0.70710678 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.5        0.5        0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.5        0.         0.         0.         0.         0.5
  0.         0.        ]
 [0.         0.31622777 

In [88]:
# DataFrame型に変換して見やすく処理
import pandas as pd
display(pd.DataFrame(tfidf.toarray(), columns=vectorizer.get_feature_names(), index=corpus))

Unnamed: 0,あと,いた事,いる,ここ,する,そう,それ,つかまえる,つく,どこ,...,煮る,獰悪,生れる,種族,聞く,薄暗い,見る,見当,記憶,食う
吾輩 猫,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
名前 無い,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
どこ 生れる 見当 つく,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,...,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0
何 薄暗い する 所 ニャーニャー 泣く いた事 記憶 する いる,0.0,0.316228,0.316228,0.0,0.632456,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.316228,0.0,0.0,0.316228,0.0
吾輩 ここ 始める 人間 もの 見る,0.0,0.0,0.0,0.430142,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.430142,0.0,0.0,0.0
あと 聞く それ 書生 人間 中 一番 獰悪 種族 そう,0.344935,0.0,0.0,0.0,0.0,0.344935,0.344935,0.0,0.0,0.0,...,0.0,0.344935,0.0,0.344935,0.344935,0.0,0.0,0.0,0.0,0.0
書生 の 我々 捕る つかまえる 煮る 食う 話,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.418767,0.0,0.0,...,0.418767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.418767
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
