### 베르누이 나이브베이즈
- 데이터 : https://www.kaggle.com/team-ai/spam-text-message-classification

In [2]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
data = pd.read_csv("spam.csv")
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
data["label"] = data["Category"].map({"spam":1, "ham":0})
data.head()

Unnamed: 0,Category,Message,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
X = data["Message"]
y = data["label"]

x_train, x_test, y_train, y_test = train_test_split(X, y,
                                                   test_size = 0.3,
                                                   random_state = 103)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(3900,) (1672,) (3900,) (1672,)


In [8]:
cv = CountVectorizer(max_features=1000, binary=True)
x_train_cv = cv.fit_transform(x_train)

In [10]:
encoded = x_train_cv.toarray()
encoded

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
cv.inverse_transform(encoded[0])

[array(['and', 'come', 'down', 'face', 'feel', 'for', 'have', 'heart',
        'into', 'life', 'loved', 'making', 'me', 'my', 'on', 'smile',
        'sun', 'the', 'you'], dtype='<U15')]

In [12]:
cv.get_feature_names()

['000',
 '04',
 '0800',
 '08000839402',
 '08000930705',
 '08712460324',
 '10',
 '100',
 '1000',
 '10p',
 '11',
 '12hrs',
 '150',
 '150p',
 '150ppm',
 '16',
 '18',
 '1st',
 '20',
 '200',
 '2000',
 '2003',
 '250',
 '2day',
 '2lands',
 '2nd',
 '30',
 '350',
 '50',
 '500',
 '5000',
 '50p',
 '750',
 '800',
 '8007',
 '86688',
 '87066',
 'able',
 'about',
 'abt',
 'ac',
 'account',
 'across',
 'actually',
 'address',
 'aft',
 'after',
 'afternoon',
 'again',
 'age',
 'age16',
 'ago',
 'ah',
 'aight',
 'all',
 'almost',
 'alone',
 'already',
 'alright',
 'also',
 'always',
 'am',
 'amp',
 'an',
 'and',
 'angry',
 'another',
 'ans',
 'answer',
 'any',
 'anyone',
 'anything',
 'anytime',
 'anyway',
 'apply',
 'ard',
 'are',
 'area',
 'around',
 'as',
 'asap',
 'ask',
 'askd',
 'asked',
 'ass',
 'at',
 'attempt',
 'auction',
 'available',
 'await',
 'award',
 'awarded',
 'away',
 'awesome',
 'b4',
 'babe',
 'baby',
 'back',
 'bad',
 'bak',
 'balance',
 'bank',
 'bath',
 'bb',
 'bcoz',
 'be',
 'be

In [13]:
len(cv.get_feature_names())

1000

#### 베르누이 나이브베이즈 분류

In [14]:
nb_clf = BernoulliNB()

nb_clf.fit(x_train_cv, y_train)

BernoulliNB()

In [15]:
x_test_cv = cv.fit_transform(x_test)

In [17]:
print(x_test_cv)

  (0, 840)	1
  (0, 252)	1
  (0, 823)	1
  (0, 832)	1
  (0, 834)	1
  (0, 603)	1
  (0, 82)	1
  (0, 318)	1
  (0, 312)	1
  (0, 236)	1
  (0, 773)	1
  (0, 90)	1
  (0, 61)	1
  (0, 465)	1
  (0, 589)	1
  (1, 940)	1
  (1, 786)	1
  (1, 615)	1
  (2, 789)	1
  (2, 214)	1
  (2, 859)	1
  (2, 970)	1
  (2, 187)	1
  (2, 858)	1
  (2, 652)	1
  :	:
  (1669, 45)	1
  (1669, 508)	1
  (1669, 434)	1
  (1669, 643)	1
  (1669, 529)	1
  (1669, 237)	1
  (1669, 451)	1
  (1670, 897)	1
  (1670, 610)	1
  (1670, 609)	1
  (1670, 923)	1
  (1670, 273)	1
  (1671, 834)	1
  (1671, 90)	1
  (1671, 422)	1
  (1671, 148)	1
  (1671, 599)	1
  (1671, 532)	1
  (1671, 995)	1
  (1671, 607)	1
  (1671, 957)	1
  (1671, 571)	1
  (1671, 69)	1
  (1671, 138)	1
  (1671, 943)	1


In [16]:
encoded2 = x_test_cv.toarray()
encoded2

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [18]:
pred = nb_clf.predict(x_test_cv)

In [20]:
accuracy_score(y_test, pred) #불용어를 제거하면 정확도가 올라갈 수 있다.

0.8086124401913876

### < 과제 >
- 베르누이 나이브베이즈 분류 모델을 사용하여 스팸 메세지 분류
- 정제, 필터링 작업 후 분류하여 성능확인

In [57]:
from nltk.tokenize import sent_tokenize
data=sent_tokenize(data)
print(text)

TypeError: expected string or bytes-like object

In [58]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

vocab=Counter() #파이썬의 Counter 모듈을 이용하면 단어의 모든 빈도를 쉽게 계산할 수 있습니다.

sentences = []
stop_words = set(stopwords.words('english'))

for i in pd.read_csv("spam.csv"):
    sentence=word_tokenize(i) # 단어 토큰화를 수행합니다.
    result = []
    
    for word in sentence:
        word=word.lower() #모든 단어를 소문자화하여 단어의 개수를 줄입니다.
        if word not in stop_words: #단어 토큰화 된 결과에 대해서 불용어를 제거합니다.
            if len(word) >2: # 단어 길이가 2이하인 경우에 대하여 추가로 단어를 제거합니다(영어의 경우).
                result.append(word)
                vocab[word]=vocab[word]+1 #각 단어의 빈도를 Count 합니다.
    sentences.append(result)
print(sentences)

[['category'], ['message']]


In [None]:
#잘 모르곘습니다