# 1. Imports

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
import pandas as pd
import numpy as np

In [3]:
%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib as mpl

In [4]:
import io
from os import listdir
from os.path import isfile, isdir, join

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 2. Data
## 2.1. Data Import

In [52]:
PATH = '../data/Bayes/pu1/'

In [53]:
parts = ['part%s' % i for i in range(1, 11)]

In [54]:
txt_lsts = [listdir(PATH + part) for part in parts]

In [55]:
part_txt_lst = [join(part, txt) for txt_lst, part in zip(txt_lsts, parts) for txt in txt_lst]

In [56]:
path_txt_lst = [PATH + part_txt for part_txt in part_txt_lst]

In [57]:
data = []
part_ind_arr = [path_txt.split(sep='/')[4] for path_txt in path_txt_lst]
for path_txt, part_ind in zip(path_txt_lst, part_ind_arr):
    with io.open(path_txt, mode="r", encoding="utf-8") as fd:
        context = fd.read()
        spam_indicator = int('spmsg' in path_txt)
        data.append((context, spam_indicator, part_ind))
        
# data = np.array(data)

In [63]:
# Очистка данных
data_pure = []
for context, spam_ind, part_ind in data:
    # Уберем надпись 'Subject: ' вначале писем
    contex_temp = context.replace('Subject: ','')
    # Письмо делится на `Subject` и `Body` с помощью '\n\n'
    # Разделим письмо на эти 2 части
    contex_temp = contex_temp.split(sep='\n\n')
    # В конце каждого числа стоит перенос строки '\n' - удалим
    contex_temp = [contex_temp[0], contex_temp[1].replace('\n','')]
    # Запишим в переменную
    data_pure.append(contex_temp + [spam_ind] + [int(part_ind[4:])])
    
# data_pure = np.array(data_pure)

In [75]:
subject_arr = np.array([arr[0] for arr in data_pure])
body_arr = np.array([arr[1] for arr in data_pure])
spam_ind_arr = np.array([arr[2] for arr in data_pure])
part_ind_arr = np.array([arr[3] for arr in data_pure])

In [78]:
df = pd.DataFrame([subject_arr, body_arr, spam_ind_arr, part_ind_arr]).T
df.columns = ['subject', 'body', 'spam', 'part']
df.head()

Unnamed: 0,subject,body,spam,part
0,5573 47,3677 22660 15981 9594 5573 2130 16502 22064 15...,0,1
1,5581 2130 2005 47,1791 13383 80 8962 2130 15184 17345 9131 2176 ...,0,1
2,1368 15860 14338 5915 82 101 3124,1835 23758 17345 16531 16502 7634 17753 20408 ...,0,1
3,7265 131 14950 7721 47 47 2176 13757 18745 47 ...,22180 11245 14338 2649 13406 1124 47 47 47 47 ...,1,1
4,167 190 82 3115 171 5652 2221 118 18679 95 84 88,82 82 82 82 82 82 82 82 82 82 82 82 82 82 82 8...,0,1


## Train/test split

In [83]:
X_train, X_test, y_train, y_test = train_test_split(df['body'], spam_ind_arr, test_size=0.2, random_state=112358)

In [84]:
X_train.shape
X_test.shape
y_train.shape
y_test.shape

(872,)

(218,)

(872,)

(218,)

# Modeling

$c_{map}=\arg\max\limits_{c \in C} \big[\log P(c) + \sum_{i=1}^n{\log P(w_i|c)}\big]$

$c_{map}=\arg\max\limits_{c \in C} \bigg[\log \frac{D_c}{D} + \sum_{i \in Q}{\log \frac{W_{ic} + 1}{|V| + \sum_{i' \in V} W_{i'c}}}\bigg]$

### fit

In [19]:
vocab_len = X_train.shape[1]

In [20]:
class_arr, doc_counts = np.unique(y_train, return_counts=True)
class_arr
doc_counts

array([0, 1])

array([496, 376])

In [21]:
masks_cl = np.array([y_train == cl for cl in class_arr])

masks_cl.shape 

(2, 872)

In [22]:
total_num_words = np.array([X_train[mask].sum().sum() for mask in masks_cl])
total_num_words

array([318021, 355150], dtype=int64)

### predict

In [23]:
def get_word_indices(txt_lst, cnt_vectorizer):
    word_inds = []
    for txt in txt_lst:
        temp_lst = []
        unknown_cnt = 0
        for word in txt.split(' '):
            try:
                temp_lst.append(cnt_vectorizer.vocabulary_[word])
            except KeyError:
                unknown_cnt += 1
        word_inds.append([temp_lst, unknown_cnt])
    return word_inds

def get_word_frequencies(X_train_by_cl, word_inds):
    freq_lst = []
    for word_ind in word_inds:
        temp_lst = []
        for X_train_cl in  X_train_by_cl:
            # лапласовское смещение
            # print(X_train_cl)
            # print(word_ind)
            freq_smooth = X_train_cl[:, word_ind[0]].sum(axis=0) + 1
            # добавляем единицы, соответствующие неизвестным словам
            freq_smooth = np.append(freq_smooth, np.ones(word_ind[1], dtype=int))
            # 
            temp_lst.append(freq_smooth)
        # 
        freq_lst.append(np.array(temp_lst).T)
    return freq_lst

In [24]:
doc_fracs = doc_counts / doc_counts.sum()
log_probs_doc = np.log(doc_fracs)
log_probs_doc

array([-0.5642135 , -0.84120028])

In [25]:
# txt_lst = ['NA 6818 80 284 13383 80 127 93', 'NA 127 4988 84 16502 3960 12162 1594 4651 NA', 'NA']
txt_lst = body_arr[:]

In [26]:
word_inds = get_word_indices(txt_lst, vectorizer)

In [27]:
X_train_by_cl = [X_train[mask].toarray() for mask in masks_cl]
# X_train_by_cl

In [28]:
freq_lst = get_word_frequencies(X_train_by_cl, word_inds)

In [29]:
denominator = vocab_len + total_num_words

In [30]:
word_smoothed_fracs = [word_freq / denominator for word_freq in freq_lst]
# word_smoothed_fracs

In [31]:
log_probs_words = [np.log(frac).sum(axis=0) for frac in word_smoothed_fracs]

In [32]:
log_probs = log_probs_doc + log_probs_words
# log_probs

In [33]:
y_pred_1 = log_probs.argmax(axis=1)

In [34]:
np.unique(y_pred_1, return_counts=True)

(array([0, 1]), array([591, 499]))

In [35]:
accuracy_score(y_pred_1, spam_ind_arr)

0.9697247706422019

In [49]:
y_pred_2 = np.array([log_probs_doc[0] / log_probs_doc[1] + np.log(frac[:, 0] / frac[:, 1]).sum() for frac in word_smoothed_fracs])

In [50]:
y_pred_2

array([ 112.61194729,   -7.84231476,   28.00478326, ..., 1238.22674907,
        131.97043453,   51.36904185])

In [37]:
y_pred_2[y_pred_2 > 0] = 0
y_pred_2[y_pred_2 < 0] = 1

In [38]:
np.unique(y_pred_2, return_counts=True)

(array([0., 1.]), array([591, 499]))

In [39]:
accuracy_score(y_pred_2, spam_ind_arr)

0.9697247706422019

In [40]:
log_probs_doc[0] / log_probs_doc[1]

0.6707243331348977

In [41]:
log_probs_words

[array([-2089.63356707, -2201.57479002]),
 array([-356.90791598, -348.39487689]),
 array([-204.03923257, -231.37329149]),
 array([-3261.73669217, -2817.55339574]),
 array([-7536.46919496, -8511.40855788]),
 array([-4756.2693619 , -5430.07276545]),
 array([-5239.4974254 , -4797.61949476]),
 array([-84350.44136682, -97658.46068529]),
 array([-10204.0225749 ,  -9669.58025809]),
 array([-1064.66155012, -1127.71073679]),
 array([-9639.50254674, -8587.94541473]),
 array([-13676.48400798, -15576.83196639]),
 array([-1326.00425684, -1435.21692896]),
 array([-12283.83752772, -11289.9070117 ]),
 array([-2013.03409666, -1804.7285035 ]),
 array([-4486.93901177, -4069.967195  ]),
 array([-865.73588829, -862.54589443]),
 array([-2824.5271829 , -2623.99381675]),
 array([-909.40341134, -843.77850339]),
 array([-1409.10419575, -1373.34891641]),
 array([-3508.2551854 , -4173.71624486]),
 array([-2064.22118549, -1924.31963085]),
 array([-3059.24346895, -3209.60392163]),
 array([-5081.46722613, -6555.4903

In [42]:
from BayesClassifier import BayesClassifier

In [43]:
BC = BayesClassifier()

In [44]:
BC.fit(body_arr, spam_ind_arr)

In [45]:
y_pred = BC.predict(body_arr)

[-0.58047402 -0.82014687]


In [46]:
accuracy_score(y_pred, spam_ind_arr)

0.9678899082568807

In [80]:
X_train

<872x24575 sparse matrix of type '<class 'numpy.int64'>'
	with 198796 stored elements in Compressed Sparse Row format>

In [85]:
BC.fit(X_train, y_train)

In [86]:
BC.score(X_train, y_train)

[-0.5642135  -0.84120028]


0.9736238532110092

In [87]:
BC.score(X_test, y_test)

[-0.5642135  -0.84120028]


0.9541284403669725

In [None]:
X_train
X_test
y_train
y_test

### Hide-input mode

In [47]:
from IPython.display import HTML

In [48]:
# HTML('''<script>
# code_show=true; 
# function code_toggle() {
#  if (code_show){
#  $('div.input').hide();
#  } else {
#  $('div.input').show();
#  }
#  code_show = !code_show
# } 
# $( document ).ready(code_toggle);
# </script>
# The raw code for this IPython notebook is by default hidden for easier reading.
# To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')