# Bag of words
### 1. CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Sample sentence used as a one-document corpus in the cells that follow.
text = 'The SMS Spam Collection is a set of SMS tagged messages that have been collected for SMS Spam research.'

In [None]:
# Build a CountVectorizer with its default settings and display them.
cv = CountVectorizer()
cv.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [None]:
# fit_transform expects an iterable of documents, not a bare string,
# so the single sentence is wrapped in a list (one document -> one row).
output = cv.fit_transform([text])
output

<1x15 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [None]:
output.toarray() # Convert the sparse count matrix into a dense array

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1]])

In [None]:
cv.vocabulary_ # Learned mapping from each term to its column index in the count matrix

{'been': 0,
 'collected': 1,
 'collection': 2,
 'for': 3,
 'have': 4,
 'is': 5,
 'messages': 6,
 'of': 7,
 'research': 8,
 'set': 9,
 'sms': 10,
 'spam': 11,
 'tagged': 12,
 'that': 13,
 'the': 14}

 - 불용어 처리

In [None]:
# 1. Remove stop words using a hand-written list
cv = CountVectorizer(stop_words=['a', 'the', 'is', 'that', 'for', 'been', 'of', 'have'])
print(cv.fit_transform([text]).toarray())
print(cv.vocabulary_)

[[1 1 1 1 1 3 2 1]]
{'sms': 5, 'spam': 6, 'collection': 1, 'set': 4, 'tagged': 7, 'messages': 2, 'collected': 0, 'research': 3}


In [None]:
# 2. Use the stop-word list that ships with scikit-learn
# scikit-learn does not provide a Korean stop-word list
cv = CountVectorizer(stop_words='english')
print(cv.fit_transform([text]).toarray())
print(cv.vocabulary_)

[[1 1 1 1 1 3 2 1]]
{'sms': 5, 'spam': 6, 'collection': 1, 'set': 4, 'tagged': 7, 'messages': 2, 'collected': 0, 'research': 3}


In [None]:
# NLTK stop-word corpus: download the 'stopwords' package before using it
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load NLTK's English stop words and show how many there are and their container type.
from nltk.corpus import stopwords
sw = stopwords.words('english')
len(sw), type(sw)

(179, list)

In [None]:
# 3. Remove stop words using the NLTK English list loaded into `sw`
cv = CountVectorizer(stop_words=sw)
print(cv.fit_transform([text]).toarray())
print(cv.vocabulary_)

[[1 1 1 1 1 3 2 1]]
{'sms': 5, 'spam': 6, 'collection': 1, 'set': 4, 'tagged': 7, 'messages': 2, 'collected': 0, 'research': 3}


 - 인덱스에 해당하는 단어가 무엇인지 알려주는 함수 만들기

In [None]:
# Print every vocabulary term next to its column index.
voca = cv.vocabulary_
for k, value in voca.items():
  print(k, value)

sms 5
spam 6
collection 1
set 4
tagged 7
messages 2
collected 0
research 3


In [None]:
# 문장 내에서 해당 인덱스의 단어 찾아주는 함수 정의
def get_word(index, voca):
  """Return the term that ``voca`` maps to ``index``.

  Args:
    index: Column index to look up.
    voca: Mapping of term -> column index (e.g. ``CountVectorizer.vocabulary_``).

  Returns:
    The first term whose stored index equals ``index``, or ``None`` when no
    term maps to it.
  """
  # Reverse lookup: scan the term -> index pairs and stop at the first match.
  matches = (term for term, position in voca.items() if position == index)
  return next(matches, None)

In [None]:
# Look up which term occupies column 3 of the current vocabulary.
get_word(3, cv.vocabulary_)

'research'

### 2. N-gram

In [None]:
# Limitation of bag of words: different sentences built from the same words
# produce identical count vectors and cannot be told apart.
txt = ['I work at google', 'I google at work']
cv = CountVectorizer()
print(cv.fit_transform(txt).toarray())
print(cv.vocabulary_)

[[1 1 1]
 [1 1 1]]
{'work': 2, 'at': 0, 'google': 1}


In [None]:
# Using n-grams (unigrams + bigrams) separates the two sentences;
# the drawback is that the number of features grows very quickly.
cv = CountVectorizer(ngram_range=(1,2))
print(cv.fit_transform(txt).toarray())
print(cv.vocabulary_)

[[1 1 0 1 0 1 1]
 [1 0 1 1 1 1 0]]
{'work': 5, 'at': 0, 'google': 3, 'work at': 6, 'at google': 1, 'google at': 4, 'at work': 2}


 - TfidfVectorizer (Term Frequency–Inverse Document Frequency)

In [None]:
# Two-document corpus used to compare TF-IDF weights with raw counts.
article = ['The SMS Spam Collection is a set of SMS tagged messages that have been collected for SMS Spam research.' 
, 'It contains one set of SMS messages in English of 5,574 messages, tagged acording being ham (legitimate) or spam.']

In [None]:
# Compute TF-IDF weights for the two documents with English stop words removed.
from sklearn.feature_extraction.text import TfidfVectorizer
tvt = TfidfVectorizer(stop_words='english')
print(tvt.fit_transform(article).toarray())

[[0.         0.         0.30015142 0.30015142 0.         0.
  0.         0.         0.21356021 0.30015142 0.21356021 0.64068062
  0.42712041 0.21356021]
 [0.31544091 0.31544091 0.         0.         0.31544091 0.31544091
  0.31544091 0.31544091 0.44887761 0.         0.2244388  0.2244388
  0.2244388  0.2244388 ]]


In [None]:
# Raw term counts for the same two documents, plus the term -> column mapping,
# for comparison with the TF-IDF weights.
cv = CountVectorizer(stop_words='english')
print(cv.fit_transform(article).toarray())
print(cv.vocabulary_)

[[0 0 1 1 0 0 0 0 1 1 1 3 2 1]
 [1 1 0 0 1 1 1 1 2 0 1 1 1 1]]
{'sms': 11, 'spam': 12, 'collection': 3, 'set': 10, 'tagged': 13, 'messages': 8, 'collected': 2, 'research': 9, 'contains': 4, 'english': 5, '574': 0, 'acording': 1, 'ham': 6, 'legitimate': 7}
