# Наивный Байесовский классификатор 

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')
import matplotlib.pyplot as plt
from math import log, sqrt
import pandas as pd
import numpy as np
import re
%matplotlib inline

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MASHA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MASHA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
df = pd.read_csv('spam.csv', encoding = 'latin-1', 
                 usecols=['v1', 'v2'])
df.rename(columns={'v1': 'target', 'v2': 'message'}, inplace=True)
df.target.replace({'spam': 1, 'ham': 0}, inplace=True)
df.head()

Unnamed: 0,target,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [42]:
def preprocess_message(m):
    # split into tokens
    words = word_tokenize(m)
    
    # filter out words with length <= 2
    words = [w for w in words if len(w) > 2]
    
    # remove stop-words
    sw = stopwords.words('english')
    words = [word for word in words if word not in sw]
    
    # stem
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    return ' '.join(words)

In [43]:
df.message = df.message.apply(preprocess_message)
df.head()

Unnamed: 0,target,message
0,0,jurong point crazy.. Avail bugi great world bu...
1,0,lar ... Joke wif oni ...
2,1,Free entri wkli comp win Cup final tkt 21st Ma...
3,0,dun say earli hor ... alreadi say ...
4,0,Nah n't think goe usf live around though


In [44]:
df.target.value_counts()

0    4825
1     747
Name: target, dtype: int64

### Формула которую нам придется запрограммировать

$$ 
P(spam | message ) = \frac{P(message | spam)  P(spam)}{P(message)}
$$

Чтобы получить ответ, нужно перемножить/разделить 3 величины. Чтобы их посчитать, придется расписать формулу чуть поподробнее:

$$ 
\frac{P(message | spam)  P(spam)}{P(message)} = \frac{P(word_1 \cap word_2 \cap word_3 ... \cap word_n | spam) P(spam)}{P(word_1 \cap word_2 \cap ... \cap word_n)}
$$

Считать совместную вероятность вхождения слов в сообщение - это сложно. Вот тут-то и появляется "наивность". Заключается она в том что мы, для упрощения вычислений, сделаем грубое предположение: появление разных слов в предложении это события независимые. 

В таком случае вероятность одновременного появления слов превратится в перемножение вероятностей встретить каждое слово по-отдельности.

$$ 
P(word_1 \cap word_2 \cap word_3 ... \cap word_n | spam) \approx P(word_1|spam)P(word_2|spam) ... P(word_n|spam) 
$$

Вот и весь смысл наивного Байесовского классификатора. Теперь осталось посчитать все формулы по-отдельности, а затем скомбинировать.

In [45]:
from sklearn.model_selection import train_test_split
X_train, X_valid = train_test_split(df)
X_train.shape, X_valid.shape

((4179, 2), (1393, 2))

In [46]:
from sklearn.feature_extraction.text import CountVectorizer

In [47]:
class SpamDetector():
    
    def __init__(self):
        pass
    
    # typical machine learning model interface
    def fit(self, dataframe):
        self.__checkdf(dataframe)
        self.p_spam = self.__p_spam(dataframe) # probability of pam messages in df
        self.words_if_spam = self._words_if_spam(dataframe) #dictionary: word - count_word in spam
        self.words_if_ham = self._words_if_ham(dataframe) #dictionary: word - count_word in ham
        self.words = self._words(dataframe) #dictionary: word - count_word 
    
    def predict_proba(self, dataframe):
        self.__checkdf(dataframe)
        summ_spam = sum(self.words_if_spam.values())
        summ_ham = sum(self.words_if_ham.values())
        res=np.zeros((dataframe.shape[0], 2))
        for i, mes in enumerate(dataframe.message):
            for j in mes.split():
                res[i][0] += np.log(self.words_if_ham[j] / summ_ham) if j in self.words_if_ham else 0
                res[i][1] += np.log(self.words_if_spam[j] / summ_spam) if j in self.words_if_spam else 0
                res[i][0] -= np.log(self.words[j] / (summ_ham + summ_spam)) if j in self.words else 0
                res[i][1] -= np.log(self.words[j] / (summ_ham + summ_spam)) if j in self.words else 0
        res[:,0] += np.log(1 - self.p_spam)
        res[:,1] += np.log(self.p_spam)
        res[:,0] = np.exp(res[:,0]) 
        res[:,1] = np.exp(res[:,1])
        return res
    
    def predict(self, dataframe):
        self.__checkdf(dataframe)
        res = self.predict_proba(dataframe)
        return res[:, 0] > res[:,1]
    
    # helper functions
    def __checkdf(self, dataframe):
        assert all([c in ['target', 'message'] for c in dataframe.columns])
        
    def __p_spam(self, dataframe):
        return dataframe.target.mean()

    def _words_if_spam(self, dataFrame):
        countvec = CountVectorizer() 
        countvec.fit(dataFrame[dataFrame.target == 1].message)
        counts = countvec.transform(dataFrame[dataFrame.target == 1].message).sum(axis = 0).tolist()[0]
        all_word_counts = dict(zip(countvec.get_feature_names(), counts)) #dictionary: word - count_word in spam
        return all_word_counts
        
    def _words_if_ham(self, dataFrame):
        countvec2 = CountVectorizer()
        countvec2.fit(dataFrame[dataFrame.target == 0].message)
        counts = countvec2.transform(dataFrame[dataFrame.target == 0].message).sum(axis = 0).tolist()[0]
        all_word_counts = dict(zip(countvec2.get_feature_names(), counts)) #dictionary: word - count_word in ham
        return all_word_counts
    
    def _words(self, dataFrame):
        countvec3 = CountVectorizer()
        countvec3.fit(dataFrame.message)
        counts = countvec3.transform(dataFrame.message).sum(axis = 0).tolist()[0]
        all_word_counts = dict(zip(countvec3.get_feature_names(), counts)) #dictionary: word - count_word in ham
        return all_word_counts

In [48]:
sfilter = SpamDetector()
sfilter.fit(X_train)


In [49]:
y_hat_p = sfilter.predict_proba(X_valid)
y_hat = sfilter.predict(X_valid)
y_hat_p

array([[6.14350407e+00, 9.46020007e+25],
       [8.23001766e+00, 2.57744894e+35],
       [3.60118948e+00, 2.66389520e+12],
       ...,
       [2.67851765e+00, 2.04375095e+08],
       [5.92980750e+00, 5.38750438e+12],
       [2.26666987e+00, 7.22075917e+04]])

In [50]:
from sklearn.metrics import classification_report
print(classification_report(y_hat, X_valid.target))

             precision    recall  f1-score   support

      False       0.75      0.93      0.83       961
       True       0.69      0.32      0.43       432

avg / total       0.73      0.74      0.71      1393

