# Bag of Words
- Bag of Words란 단어들의 순서는 전혀 고려하지 않고, 단어들의 출현 빈도(frequency)에만 집중하는 텍스트 데이터의 수치화 표현 방법입니다.
- 문장에 있는 단어들을 가방(Bag) 안에 넣어 각 단어의 빈도수를 분석하는 방식입니다.

## CountVectorizer

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['you know I want your love. because I love you.'] # must be passed as a list of strings.

In [2]:
cvector = CountVectorizer() # create the vectorizer object

In [3]:
cvector.fit(corpus) # learn the vocabulary from corpus (fit)
cvector.transform(corpus).toarray() # convert corpus into a dense array of counts (transform)

array([[1, 1, 2, 1, 2, 1]], dtype=int64)

In [4]:
# Learned vocabulary: maps each token to its column index in the count array.
# The single-character token "I" does not appear in the recorded output.
cvector.vocabulary_

{'you': 4, 'know': 1, 'want': 3, 'your': 5, 'love': 2, 'because': 0}

In [5]:
# fit_transform() learns the vocabulary and produces the count array in one call.
output = cvector.fit_transform(corpus).toarray()
output

array([[1, 1, 2, 1, 2, 1]], dtype=int64)

In [6]:
# Shape is (number of documents, number of vocabulary terms).
output.shape

(1, 6)

In [7]:
# User-defined stop words: tokens in the list are left out of the vocabulary.
# NOTE: "Familty" looks like a typo for "Family"; the string is left unchanged
# so that it still matches the recorded output of this cell.
text = ["Familty is not an important thing. It's everything."]
vect = CountVectorizer(stop_words=["the","a","an","is","not"])
print(vect.fit_transform(text).toarray())
print(vect.vocabulary_)

[[1 1 1 1 1]]
{'familty': 1, 'important': 2, 'thing': 4, 'it': 3, 'everything': 0}


In [8]:
# Download the NLTK 'stopwords' package so nltk.corpus.stopwords can be used.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\igksj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Stop words provided by NLTK
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

text=["Family is not an important thing. It's everything."]
sw = stopwords.words("english")  # NLTK's English stop-word list
vect = CountVectorizer(stop_words =sw)
print(vect.fit_transform(text).toarray()) 
print(vect.vocabulary_)

[[1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 3, 'everything': 0}


In [10]:
def get_word(index, voca):
    """Look up the word whose vocabulary index equals ``index``.

    Walks the ``voca`` mapping (word -> index) in its iteration order and
    returns the first word whose stored value compares equal to ``index``.
    Returns ``None`` when no entry matches.
    """
    matches = (word for word, position in voca.items() if position == index)
    return next(matches, None)

In [11]:
# Reverse lookup: find the word stored at column index 2 of the vocabulary.
get_word(2, vect.vocabulary_)

'important'

### N-gram

In [12]:
# Two sentences that use the same set of words in a different order.
text1 = ['Machine learning is fun and not boring']
text2 = ['Machine is boring and learning is not fun']

In [13]:
# With default settings the vocabulary contains single words only.
vect = CountVectorizer()
print(vect.fit_transform(text1).toarray())
print(vect.vocabulary_)

[[1 1 1 1 1 1 1]]
{'machine': 5, 'learning': 4, 'is': 3, 'fun': 2, 'and': 0, 'not': 6, 'boring': 1}


In [14]:
# Refit the same vectorizer on text2; compared with text1, only the count
# for 'is' differs, because it occurs twice.
print(vect.fit_transform(text2).toarray())
print(vect.vocabulary_)

[[1 1 1 2 1 1 1]]
{'machine': 5, 'is': 3, 'boring': 1, 'and': 0, 'learning': 4, 'not': 6, 'fun': 2}


In [15]:
# ngram_range=(1, 2): the vocabulary holds single words and two-word sequences.
vect = CountVectorizer(ngram_range=(1,2))
print(vect.fit_transform(text1).toarray())
print(vect.vocabulary_)

[[1 1 1 1 1 1 1 1 1 1 1 1 1]]
{'machine': 9, 'learning': 7, 'is': 5, 'fun': 3, 'and': 0, 'not': 11, 'boring': 2, 'machine learning': 10, 'learning is': 8, 'is fun': 6, 'fun and': 4, 'and not': 1, 'not boring': 12}


In [16]:
# Refit on text2: the word order differs, so the two-word entries differ
# from those learned from text1.
print(vect.fit_transform(text2).toarray())
print(vect.vocabulary_)

[[1 1 1 1 1 2 1 1 1 1 1 1 1 1]]
{'machine': 10, 'is': 5, 'boring': 2, 'and': 0, 'learning': 8, 'not': 12, 'fun': 4, 'machine is': 11, 'is boring': 6, 'boring and': 3, 'and learning': 1, 'learning is': 9, 'is not': 7, 'not fun': 13}


In [17]:
# ngram_range=(1, 3) with the built-in English stop-word list: as the recorded
# output shows (e.g. 'learning fun'), stop words are dropped before the
# one- to three-word sequences are formed.
vect = CountVectorizer(ngram_range=(1,3), stop_words='english')
print(vect.fit_transform(text1).toarray())
print(vect.vocabulary_)

[[1 1 1 1 1 1 1 1 1]]
{'machine': 6, 'learning': 3, 'fun': 1, 'boring': 0, 'machine learning': 7, 'learning fun': 4, 'fun boring': 2, 'machine learning fun': 8, 'learning fun boring': 5}


### Hyperparameters

In [18]:
# Show the parameter settings of the current vectorizer.
vect.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 3),
 'preprocessor': None,
 'stop_words': 'english',
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

## DTM(Document-Term Matrix)

In [20]:
# Three documents: each row of the printed matrix is one document and each
# column is one vocabulary term.
corpus = ['you know I want your love. because I love you.','I like you','What should I do']
vect = CountVectorizer()
print(vect.fit_transform(corpus).toarray())
print(vect.vocabulary_)

[[1 0 1 0 2 0 1 0 2 1]
 [0 0 0 1 0 0 0 0 1 0]
 [0 1 0 0 0 1 0 1 0 0]]
{'you': 8, 'know': 2, 'want': 6, 'your': 9, 'love': 4, 'because': 0, 'like': 3, 'what': 7, 'should': 5, 'do': 1}


## TF-IDF Vectorizer

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weights for corpus, rounded to two decimals for display.
tvect = TfidfVectorizer()
print(tvect.fit_transform(corpus).toarray().round(2))
print(tvect.vocabulary_)

[[0.31 0.   0.31 0.   0.62 0.   0.31 0.   0.47 0.31]
 [0.   0.   0.   0.8  0.   0.   0.   0.   0.61 0.  ]
 [0.   0.58 0.   0.   0.   0.58 0.   0.58 0.   0.  ]]
{'you': 8, 'know': 2, 'want': 6, 'your': 9, 'love': 4, 'because': 0, 'like': 3, 'what': 7, 'should': 5, 'do': 1}


In [25]:
# TF-IDF over single words and two-word sequences with English stop words removed.
# In the recorded output the third document's row is all zeros.
tvect = TfidfVectorizer(ngram_range=(1,2),stop_words='english')
print(tvect.fit_transform(corpus).toarray().round(2))
print(tvect.vocabulary_)

[[0.33 0.33 0.   0.67 0.33 0.33 0.33]
 [0.   0.   1.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.  ]]
{'know': 0, 'want': 5, 'love': 3, 'know want': 1, 'want love': 6, 'love love': 4, 'like': 2}
