In [1]:
"""
TF-IDF term weighting : TfidfVectorizer
  1. Word tokenizer : sentences -> words
  2. Word dictionary : (word, unique index)
  3. Sparse matrix : weights derived from how often each word occurs [type - TF, TF-IDF]
    1] TF : weight = term frequency
    2] TF-IDF : weight = term frequency x inverse document frequency
    - tf-idf(d,t) = tf(d,t) x idf(t) [d(document), t(term)]
    - tf(d,t) : term frequency - how often term t occurs in document d
    - idf(t) : inverse document frequency - inverse of the number of documents containing term t
       -> TF-IDF = tf(d, t) x log( n/df(t) ) : n/df(t) is the inverse document frequency

  NOTE(review): the formula above is the textbook definition. scikit-learn's
  default settings are documented to use a smoothed idf,
  ln((1 + n) / (1 + df(t))) + 1, followed by L2 normalisation of each row -
  confirm against the TfidfVectorizer / TfidfTransformer documentation.
"""# The log is applied to dampen (normalise) the raw ratio n/df(t)

from sklearn.feature_extraction.text import TfidfVectorizer # class

In [2]:
# Sample corpus: three short English documents, one string per document
sentences = [
    (
        "Mr. Green killed Colonel Mustard in the study with the candlestick. "
        "Mr. Green is not a very nice fellow."
    ),
    "Professor Plum has a green plant in his study.",
    (
        "Miss Scarlett watered Professor Plum's green plant "
        "while he was away from his office last week."
    ),
]

In [3]:
# 1. Word tokenizer : sentences -> words
tfidf = TfidfVectorizer() # create the vectorizer object
tfidf_fit = tfidf.fit(sentences) # apply the sentences; the result of fit() is kept as tfidf_fit
print(tfidf_fit) # object info

TfidfVectorizer()


In [4]:
# 2. Word dictionary - {'word': unique index}; indices follow the
#    ascending alphabetical order of the words
voca = tfidf_fit.vocabulary_
vocab_size = len(voca)  # number of distinct words learned from the sentences
print('word size=', vocab_size)  # word size= 31
print(voca)  # full {'word': index} mapping

word size= 31
{'mr': 14, 'green': 5, 'killed': 11, 'colonel': 2, 'mustard': 15, 'in': 9, 'the': 24, 'study': 23, 'with': 30, 'candlestick': 1, 'is': 10, 'not': 17, 'very': 25, 'nice': 16, 'fellow': 3, 'professor': 21, 'plum': 20, 'has': 6, 'plant': 19, 'his': 8, 'miss': 13, 'scarlett': 22, 'watered': 27, 'while': 29, 'he': 7, 'was': 26, 'away': 0, 'from': 4, 'office': 18, 'last': 12, 'week': 28}


In [5]:
# 3. Sparse matrix : text analysis - DTM (rows: documents, columns: terms)
spart_tfidf = tfidf.fit_transform(sentences)
type(spart_tfidf) # scipy.sparse.csr.csr_matrix

scipy.sparse.csr.csr_matrix

In [6]:
# Show the matrix dimensions first, then its stored (row, column) -> weight entries.
# The shape was previously a bare expression in the middle of the cell, so its
# value was computed and silently discarded; print it explicitly instead.
print(spart_tfidf.shape) # (3, 31) - (Docs, Term)
print(spart_tfidf)

  (0, 3)	0.2205828828763741
  (0, 16)	0.2205828828763741
  (0, 25)	0.2205828828763741
  (0, 17)	0.2205828828763741
  (0, 10)	0.2205828828763741
  (0, 1)	0.2205828828763741
  (0, 30)	0.2205828828763741
  (0, 23)	0.1677589680512606
  (0, 24)	0.4411657657527482
  (0, 9)	0.1677589680512606
  (0, 15)	0.2205828828763741
  (0, 2)	0.2205828828763741
  (0, 11)	0.2205828828763741
  (0, 5)	0.26055960805891015
  (0, 14)	0.4411657657527482
  (1, 8)	0.3464378827197198
  (1, 19)	0.3464378827197198
  (1, 6)	0.4555241832708016
  (1, 20)	0.3464378827197198
  (1, 21)	0.3464378827197198
  (1, 23)	0.3464378827197198
  (1, 9)	0.3464378827197198
  (1, 5)	0.2690399207469689
  (2, 28)	0.27054287522550385
  (2, 12)	0.27054287522550385
  (2, 18)	0.27054287522550385
  (2, 4)	0.27054287522550385
  (2, 0)	0.27054287522550385
  (2, 26)	0.27054287522550385
  (2, 7)	0.27054287522550385
  (2, 29)	0.27054287522550385
  (2, 27)	0.27054287522550385
  (2, 22)	0.27054287522550385
  (2, 13)	0.27054287522550385
  (2, 8)	0.205

In [7]:
# scipy sparse matrix -> dense numpy array
arr_sparse = spart_tfidf.toarray()
type(arr_sparse) # numpy.ndarray

numpy.ndarray

In [9]:
# Print the dense document-term weights, then display the array's shape
print(arr_sparse)
arr_sparse.shape #  (3, 31) : can serve as input x for a model (x -> model -> y')


[[0.         0.22058288 0.22058288 0.22058288 0.         0.26055961
  0.         0.         0.         0.16775897 0.22058288 0.22058288
  0.         0.         0.44116577 0.22058288 0.22058288 0.22058288
  0.         0.         0.         0.         0.         0.16775897
  0.44116577 0.22058288 0.         0.         0.         0.
  0.22058288]
 [0.         0.         0.         0.         0.         0.26903992
  0.45552418 0.         0.34643788 0.34643788 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.34643788 0.34643788 0.34643788 0.         0.34643788
  0.         0.         0.         0.         0.         0.
  0.        ]
 [0.27054288 0.         0.         0.         0.27054288 0.15978698
  0.         0.27054288 0.20575483 0.         0.         0.
  0.27054288 0.27054288 0.         0.         0.         0.
  0.27054288 0.20575483 0.20575483 0.20575483 0.27054288 0.
  0.         0.         0.27054288 0.27054288 0.27054288 0.27054288
  0.        ]]

(3, 31)