# 나이브베이즈
---

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
spam = pd.read_csv('./dataset/spam.csv', encoding='latin-1')
spam.shape

(5572, 5)

In [4]:
spam.head(3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


In [5]:
spam.v1.value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [9]:
c = ' '.join(spam.loc[spam['v1'] == 'ham','v2'])

In [10]:
l = c.split(' ')
l[:10]

['Go',
 'until',
 'jurong',
 'point,',
 'crazy..',
 'Available',
 'only',
 'in',
 'bugis',
 'n']

In [32]:
from collections import Counter

counter = Counter(l)


In [31]:
list(counter.items())

[('Go', 10),
 ('until', 21),
 ('jurong', 1),
 ('point,', 1),
 ('crazy..', 1),
 ('Available', 1),
 ('only', 102),
 ('in', 734),
 ('bugis', 4),
 ('n', 128),
 ('great', 70),
 ('world', 17),
 ('la', 2),
 ('e', 71),
 ('buffet...', 1),
 ('Cine', 1),
 ('there', 109),
 ('got', 200),
 ('amore', 1),
 ('wat...', 12),
 ('Ok', 97),
 ('lar...', 18),
 ('Joking', 1),
 ('wif', 26),
 ('u', 645),
 ('oni...', 2),
 ('U', 229),
 ('dun', 46),
 ('say', 72),
 ('so', 272),
 ('early', 24),
 ('hor...', 2),
 ('c', 41),
 ('already', 44),
 ('then', 138),
 ('say...', 3),
 ('Nah', 7),
 ('I', 1436),
 ("don't", 103),
 ('think', 112),
 ('he', 127),
 ('goes', 26),
 ('to', 1530),
 ('usf,', 2),
 ('lives', 2),
 ('around', 51),
 ('here', 72),
 ('though', 15),
 ('Even', 15),
 ('my', 619),
 ('brother', 9),
 ('is', 638),
 ('not', 320),
 ('like', 209),
 ('speak', 11),
 ('with', 248),
 ('me.', 90),
 ('They', 30),
 ('treat', 11),
 ('me', 537),
 ('aids', 1),
 ('patent.', 1),
 ('As', 27),
 ('per', 11),
 ('your', 373),
 ('request', 6)

In [33]:
df = pd.DataFrame(list(counter.items()))

In [34]:
df.columns = ['words in non-spam', 'count']
df.head()

Unnamed: 0,words in non-spam,count
0,Go,10
1,until,21
2,jurong,1
3,"point,",1
4,crazy..,1


In [35]:
s = ' '.join(spam.loc[spam['v1'] == 'spam','v2'])
l = s.split(' ')
counter = Counter(l)
df2 = pd.DataFrame(list(counter.items()))
df2.columns = ['words in spam', 'count']
df2.head()

Unnamed: 0,words in spam,count
0,Free,35
1,entry,25
2,in,64
3,2,169
4,a,358


In [37]:
from sklearn import feature_extraction

fe = feature_extraction.text.CountVectorizer(stop_words='english') # stopwords 필요없는 단어들

In [38]:
X = fe.fit_transform(spam['v2'])
X.shape

(5572, 8404)

In [43]:
pd.DataFrame(X).head()

Unnamed: 0,0
0,"(0, 4224)\t1\n (0, 5741)\t1\n (0, 2271)\t1..."
1,"(0, 5343)\t1\n (0, 4385)\t1\n (0, 4192)\t1..."
2,"(0, 3265)\t1\n (0, 2875)\t2\n (0, 8185)\t1..."
3,"(0, 2738)\t1\n (0, 6450)\t2\n (0, 2757)\t1..."
4,"(0, 5092)\t1\n (0, 2651)\t1\n (0, 7443)\t1..."


In [44]:
y = spam['v1'].map({'spam':1,'ham':0})
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: v1, Length: 5572, dtype: int64

In [45]:
# 데이터 분리
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=2022, stratify=y)


In [46]:
X_train.shape, X_test.shape

((3900, 8404), (1672, 8404))

In [56]:
from sklearn.naive_bayes import MultinomialNB  # 범주형 데이터에 사용
                                # 가우시안NB  연속형 데이터에 사용

mnb = MultinomialNB()
mnb.fit(X_train, y_train)


MultinomialNB()

In [57]:
y_pred = mnb.predict(X_test)

In [58]:
from sklearn.metrics import accuracy_score, precision_score

print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))

0.9808612440191388
0.9033613445378151


In [52]:
from sklearn import svm

clf = svm.SVC()
clf.fit(X_train, y_train)

SVC()

In [53]:
y_pred = clf.predict(X_test)

In [55]:
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))

0.9766746411483254
1.0
