In [48]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB,MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import math
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

In [22]:
# Read the training and test CSV files (paths are relative to the working
# directory) into two DataFrames.
train_input, test_input = (
    pd.read_csv(csv_name)
    for csv_name in ('SentimentSentence_train.csv', 'SentimentSentence_test.csv')
)

In [26]:
# For both frames, column 0 supplies the model inputs (x) and column 1 the
# targets (y); each is copied out as a NumPy array.
x_train, y_train = (np.array(train_input.iloc[:, col]) for col in (0, 1))
x_test, y_test = (np.array(test_input.iloc[:, col]) for col in (0, 1))

In [27]:
# TF-IDF vectorizer; it is fitted on the training inputs only.
tfidf = TfidfVectorizer()
# fit_transform() fits the vectorizer and vectorizes the training inputs in a
# single call; the sparse result is converted to a dense array.
# NOTE: x_train / x_test are overwritten here, so re-running this cell requires
# re-running the cell that builds them from the DataFrames first.
x_train = tfidf.fit_transform(x_train).toarray()
# The test inputs are transformed with the already-fitted vectorizer.
x_test = tfidf.transform(x_test).toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [29]:
# Display the dimensions of the dense TF-IDF training matrix as a sanity check.
x_train.shape

(800, 1774)

In [11]:
# Three Naive Bayes variants, all trained on the same TF-IDF features.
model1, model2, model3 = GaussianNB(), MultinomialNB(), BernoulliNB()

# For each model in turn: fit on the training data (fit() returns the
# estimator itself), then predict labels for the test data.
pred1, pred2, pred3 = (
    classifier.fit(x_train, y_train).predict(x_test)
    for classifier in (model1, model2, model3)
)

In [15]:
# Root-mean-squared error of each classifier's test-set predictions.
# scikit-learn metrics are declared as (y_true, y_pred); the arguments are
# passed by keyword so the ground truth and the predictions cannot be swapped
# (MSE is symmetric, so the printed values are unaffected by the order).
# With the 0/1 labels shown in this notebook's output, the squared error of a
# sample is 1 for a misclassification and 0 otherwise, so each value equals
# sqrt(1 - accuracy).
rmse1, rmse2, rmse3 = (
    math.sqrt(mean_squared_error(y_true=y_test, y_pred=nb_pred))
    for nb_pred in (pred1, pred2, pred3)
)

# Print in model order: GaussianNB, MultinomialNB, BernoulliNB.
for rmse_value in (rmse1, rmse2, rmse3):
    print("%.6f" % rmse_value)

0.500000
0.412311
0.430116


In [49]:
# Predictions in model order: GaussianNB, MultinomialNB, BernoulliNB.
nb_predictions = (pred1, pred2, pred3)

# Accuracy for each model, fixed to six decimal places.
for nb_pred in nb_predictions:
    print("%.6f" % accuracy_score(y_true=y_test, y_pred=nb_pred))

# Precision, then recall, then F1 -- each reported for the three models in
# turn, using binary averaging and the default float representation.
for binary_metric in (precision_score, recall_score, f1_score):
    for nb_pred in nb_predictions:
        print(binary_metric(y_true=y_test, y_pred=nb_pred, average='binary'))


0.750000
0.830000
0.815000
0.7295081967213115
0.8673469387755102
0.822429906542056
0.839622641509434
0.8018867924528302
0.8301886792452831
0.7807017543859649
0.8333333333333334
0.8262910798122067


In [31]:
# Number of test samples.
step = len(y_test)

# Indices where the first model's prediction (pred1) disagrees with the label.
mismatch_indices = [idx for idx in range(step) if y_test[idx] != pred1[idx]]

# Report every disagreement as "<index> - y_test <label> y_pred : <prediction>".
for idx in mismatch_indices:
    print(idx, " - y_test ", y_test[idx], " y_pred : ", pred1[idx], sep="")

# Total number of misclassified samples.
count = len(mismatch_indices)

2 - y_test 1 y_pred : 0
11 - y_test 0 y_pred : 1
18 - y_test 0 y_pred : 1
25 - y_test 0 y_pred : 1
28 - y_test 0 y_pred : 1
29 - y_test 0 y_pred : 1
30 - y_test 0 y_pred : 1
32 - y_test 0 y_pred : 1
34 - y_test 1 y_pred : 0
36 - y_test 1 y_pred : 0
43 - y_test 1 y_pred : 0
47 - y_test 0 y_pred : 1
48 - y_test 0 y_pred : 1
51 - y_test 0 y_pred : 1
54 - y_test 0 y_pred : 1
56 - y_test 0 y_pred : 1
59 - y_test 0 y_pred : 1
71 - y_test 0 y_pred : 1
72 - y_test 0 y_pred : 1
74 - y_test 1 y_pred : 0
77 - y_test 1 y_pred : 0
78 - y_test 0 y_pred : 1
83 - y_test 0 y_pred : 1
99 - y_test 0 y_pred : 1
101 - y_test 1 y_pred : 0
110 - y_test 0 y_pred : 1
126 - y_test 0 y_pred : 1
128 - y_test 1 y_pred : 0
138 - y_test 0 y_pred : 1
140 - y_test 0 y_pred : 1
141 - y_test 1 y_pred : 0
142 - y_test 0 y_pred : 1
145 - y_test 0 y_pred : 1
149 - y_test 1 y_pred : 0
153 - y_test 0 y_pred : 1
155 - y_test 0 y_pred : 1
156 - y_test 1 y_pred : 0
157 - y_test 1 y_pred : 0
158 - y_test 0 y_pred : 1
159 - y_tes

In [32]:
# Summarize how many of the test samples were misclassified.
print("total : ", step, " diff : ", count, sep="")

total : 200 diff : 50


In [40]:
# Class probabilities from the second model (model2) for test sample 2.
# predict_proba() expects a 2-D input, so the single row gets a leading axis.
sample_row = x_test[2][np.newaxis, :]
print(model2.predict_proba(sample_row))

[[0.51485492 0.48514508]]
