-
Notifications
You must be signed in to change notification settings - Fork 1
/
NaiveBayes.py
58 lines (41 loc) · 2.66 KB
/
NaiveBayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
def naive_bayes(real_vectorizer,real_word_matrix,real_frequency_dict,fake_vectorizer,fake_word_matrix,fake_frequency_dict,test_line_list,BoW_gram,stopWords=None):
#####################################################
# GETTING UNIQ WORDS FOR CALCULATE TOTAL UNIQ WORDS #
#####################################################
realwords_set =real_vectorizer.get_feature_names()
fakewords_set =fake_vectorizer.get_feature_names()
######################################################
# NAIVE BAYES PART #
######################################################
real_words_count = real_word_matrix.sum() # WORDS COUNT IN REAL NEWS
fake_words_count = fake_word_matrix.sum() # WORDS COUNT IN FAKE NEWS
p_real=real_words_count/(real_words_count+fake_words_count)# REAL NEW LİKELİHOOD
p_fake=fake_words_count/(real_words_count+fake_words_count)# FAKE NEW LİKELİHOOD
uniq_words = set(realwords_set + fakewords_set)
uniq_words_count = len(uniq_words) # UNIQ WORDS COUNT IN FAKE AND REAL AND TEST NEWS
# computing bayes theorem with laplace smoothing #
test_classifier_result=[]
for line in test_line_list:
test_vectorizer = CountVectorizer(ngram_range=(BoW_gram,BoW_gram),stop_words=stopWords,analyzer='word')
line_list= []
line_list.append(line)
test_word_matrix = test_vectorizer.fit_transform(line_list)
probabilty_real=0
probabilty_fake=0
# calculating conditional probabiltys here and if word not in uniq set applying smoothing.
for item,value in test_vectorizer.vocabulary_.items():
probabilty_real += test_word_matrix.toarray()[0,value]*np.log10((real_frequency_dict.get(item,0)+1)/(real_words_count + uniq_words_count)) #sum of probabilties real
probabilty_fake += test_word_matrix.toarray()[0,value]*np.log10((fake_frequency_dict.get(item,0)+1)/(fake_words_count + uniq_words_count)) #sum of probabilties fake
probabilty_real= probabilty_real + np.log10(p_real)# probabilty of news is real
probabilty_fake= probabilty_fake + np.log10(p_fake)# probabilty of news is fake
if probabilty_real > probabilty_fake:
test_classifier_result.append("real")
elif probabilty_real < probabilty_fake:
test_classifier_result.append("fake")
elif p_real>=p_fake:
test_classifier_result.append("real")
else:
test_classifier_result.append("fake")
return test_classifier_result