In [8]:
# Bag of Words (BoW): a model that extracts feature values by assigning a
# frequency count to every word, ignoring context and word order.
# Build a BoW with the CountVectorizer class.
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['you know I want your love. because I love you']

In [9]:
# Create a vectorizer with no arguments (library defaults).
cvector = CountVectorizer()
# Sanity check: the corpus is passed as a list of document strings.
type(corpus)

list

In [16]:
cvector.fit(corpus) # fit: learn the vocabulary from the corpus
cvector.transform(corpus).toarray() # transform: build the dense count matrix (result is discarded; only the cell's last expression is displayed)
# Learned mapping of term -> column index.
cvector.vocabulary_

{'you': 4, 'know': 1, 'want': 3, 'your': 5, 'love': 2, 'because': 0}

In [18]:
# fit_transform performs fit and transform in one call; toarray() densifies the result for display.
output = cvector.fit_transform(corpus).toarray()
# Show the count matrix together with its shape (documents x vocabulary terms).
output, output.shape

(array([[1, 1, 2, 1, 2, 1]], dtype=int64), (1, 6))

In [22]:
# BoW with stop words removed.
# Use a user-defined stop-word list.
text = ['Family is not an important thing. It\'s everything.']
vect = CountVectorizer(stop_words = ['the', 'a', 'an', 'is', 'not'])
print(vect.fit_transform(text).toarray())
print(vect.vocabulary_)

[[1 1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 4, 'it': 3, 'everything': 0}


In [23]:
# Use the built-in English stop-word list that CountVectorizer (scikit-learn) provides.
vect = CountVectorizer(stop_words = 'english')
print(vect.fit_transform(text).toarray())
print(vect.vocabulary_)

[[1 1 1]]
{'family': 0, 'important': 1, 'thing': 2}


In [26]:
# Use the English stop-word list provided by NLTK.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available
# locally (e.g. via nltk.download('stopwords')) - confirm for a fresh environment.
import nltk
from nltk.corpus import stopwords
sw = stopwords.words('english')
vect = CountVectorizer(stop_words = sw)
print(vect.fit_transform(text).toarray())
print(vect.vocabulary_)

[[1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 3, 'everything': 0}


In [27]:
def get_word(index, voca):
    """Return the term that ``voca`` maps to ``index``.

    ``voca`` is a term -> index mapping (for example a fitted vectorizer's
    ``vocabulary_``). Entries are scanned in the mapping's iteration order and
    the first term whose value equals ``index`` is returned. ``None`` is
    returned when no entry matches.
    """
    matching_terms = (term for term, position in voca.items() if position == index)
    return next(matching_terms, None)

In [32]:
# Reverse lookup: which term did the fitted vectorizer assign to column index 1?
get_word(1, vect.vocabulary_)

'family'

In [39]:
# N-gram
# Two sentences that use the same set of words in a different order; they are
# used below to compare unigram features with N-gram features.
text1 = ['Machine learning is fun and not boring']
text2 = ['Machine is boring and learning is not fun']

In [40]:
# Unigram baseline: default CountVectorizer fitted on text1.
vect = CountVectorizer()
print(vect.fit_transform(text1).toarray())
print(vect.vocabulary_)

[[1 1 1 1 1 1 1]]
{'machine': 5, 'learning': 4, 'is': 3, 'fun': 2, 'and': 0, 'not': 6, 'boring': 1}


In [41]:
# Re-fit the same unigram vectorizer on text2. The vocabulary is the same set of
# words as for text1, so word order is not captured by these features.
print(vect.fit_transform(text2).toarray())
print(vect.vocabulary_)

[[1 1 1 2 1 1 1]]
{'machine': 5, 'is': 3, 'boring': 1, 'and': 0, 'learning': 4, 'not': 6, 'fun': 2}


In [43]:
# N-gram range (1, 2): count unigrams and bigrams together.
# Each fit_transform call re-learns the vocabulary, so the two sentences are
# vectorized independently and now yield different feature sets.
vect = CountVectorizer(ngram_range=(1, 2))
for sample in (text1, text2):
    counts = vect.fit_transform(sample)
    print(counts.toarray())
    print(vect.vocabulary_)

[[1 1 1 1 1 1 1 1 1 1 1 1 1]]
{'machine': 9, 'learning': 7, 'is': 5, 'fun': 3, 'and': 0, 'not': 11, 'boring': 2, 'machine learning': 10, 'learning is': 8, 'is fun': 6, 'fun and': 4, 'and not': 1, 'not boring': 12}
[[1 1 1 1 1 2 1 1 1 1 1 1 1 1]]
{'machine': 10, 'is': 5, 'boring': 2, 'and': 0, 'learning': 8, 'not': 12, 'fun': 4, 'machine is': 11, 'is boring': 6, 'boring and': 3, 'and learning': 1, 'learning is': 9, 'is not': 7, 'not fun': 13}


In [45]:
# Inspect the parameter settings of the current vectorizer.
vect.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 2),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [49]:
# DTM (document-term matrix): one row per document, one column per vocabulary term.
# Note: this rebinds `corpus` to a three-document list.
corpus = [
    'you know I want your love',
    'I like you',
    'what should I do'
    ]
vect = CountVectorizer()
print(vect.fit_transform(corpus).toarray())
print(vect.vocabulary_)

[[0 1 0 1 0 1 0 1 1]
 [0 0 1 0 0 0 0 1 0]
 [1 0 0 0 1 0 1 0 0]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [54]:
# TF-IDF vectorization: a DTM weighted by TF-IDF instead of raw counts.
# Uses the three-document `corpus` defined in the DTM cell.
from sklearn.feature_extraction.text import TfidfVectorizer
tvect = TfidfVectorizer()
# Values are rounded to two decimals for display only.
print(tvect.fit_transform(corpus).toarray().round(2))
print(tvect.vocabulary_)

[[0.   0.47 0.   0.47 0.   0.47 0.   0.36 0.47]
 [0.   0.   0.8  0.   0.   0.   0.   0.61 0.  ]
 [0.58 0.   0.   0.   0.58 0.   0.58 0.   0.  ]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [56]:
# TF-IDF over unigrams and bigrams, with scikit-learn's built-in English stop words removed.
# In the recorded output no term from the third document survives the filter,
# so its row is all zeros.
tvect = TfidfVectorizer(ngram_range = (1, 2), stop_words = 'english')
print(tvect.fit_transform(corpus).toarray().round(2))
print(tvect.vocabulary_)
# Inspect the parameter settings of this vectorizer.
tvect.get_params()

[[0.45 0.45 0.   0.45 0.45 0.45]
 [0.   0.   1.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.  ]]
{'know': 0, 'want': 4, 'love': 3, 'know want': 1, 'want love': 5, 'like': 2}


{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 2),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': 'english',
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}