# 1. Imports

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
import pandas as pd
import numpy as np

In [3]:
%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib as mpl

In [4]:
import io
from os import listdir
from os.path import isfile, isdir, join

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

# 2. Data
## 2.1. Data Import

In [6]:
PATH = '../data/Bayes/pu1/'

In [7]:
parts = ['part%s' % i for i in range(1, 11)]

In [8]:
txt_lsts = [listdir(PATH + part) for part in parts]

In [9]:
part_txt_lst = [join(part, txt) for txt_lst, part in zip(txt_lsts, parts) for txt in txt_lst]

In [10]:
path_txt_lst = [PATH + part_txt for part_txt in part_txt_lst]

In [11]:
data = []
part_ind_arr = [path_txt.split(sep='/')[4] for path_txt in path_txt_lst]
for path_txt, part_ind in zip(path_txt_lst, part_ind_arr):
    with io.open(path_txt, mode="r", encoding="utf-8") as fd:
        context = fd.read()
        spam_indicator = int('spmsg' in path_txt)
        data.append((context, spam_indicator, part_ind))
        
# data = np.array(data)

In [12]:
# Очистка данных
data_pure = []
for context, spam_ind, part_ind in data:
    # Уберем надпись 'Subject: ' вначале писем
    contex_temp = context.replace('Subject: ','')
    # Письмо делится на `Subject` и `Body` с помощью '\n\n'
    # Разделим письмо на эти 2 части
    contex_temp = contex_temp.split(sep='\n\n')
    # В конце каждого числа стоит перенос строки '\n' - удалим
    contex_temp = [contex_temp[0], contex_temp[1].replace('\n','')]
    # Запишим в переменную
    data_pure.append(contex_temp + [spam_ind] + [int(part_ind[4:])])
    
# data_pure = np.array(data_pure)

In [13]:
subject_arr = np.array([arr[0] for arr in data_pure])
body_arr = np.array([arr[1] for arr in data_pure])
spam_ind_arr = np.array([arr[2] for arr in data_pure])
part_ind_arr = np.array([arr[3] for arr in data_pure])

In [14]:
df = pd.DataFrame([subject_arr, body_arr, spam_ind_arr, part_ind_arr]).T
df.columns = ['subject', 'body', 'spam', 'part']
df['spam'] = df['spam'].astype('int8')
df['part'] = df['part'].astype(int)
df.head()

Unnamed: 0,subject,body,spam,part
0,5573 47,3677 22660 15981 9594 5573 2130 16502 22064 15...,0,1
1,5581 2130 2005 47,1791 13383 80 8962 2130 15184 17345 9131 2176 ...,0,1
2,1368 15860 14338 5915 82 101 3124,1835 23758 17345 16531 16502 7634 17753 20408 ...,0,1
3,7265 131 14950 7721 47 47 2176 13757 18745 47 ...,22180 11245 14338 2649 13406 1124 47 47 47 47 ...,1,1
4,167 190 82 3115 171 5652 2221 118 18679 95 84 88,82 82 82 82 82 82 82 82 82 82 82 82 82 82 82 8...,0,1


In [15]:
df.dtypes

subject    object
body       object
spam         int8
part        int64
dtype: object

In [16]:
df['spam'].value_counts()

0    610
1    480
Name: spam, dtype: int64

In [17]:
y = df['spam']

In [18]:
X = df['body']

In [19]:
X = df['subject']

In [20]:
X = df['subject'] + ' ' + df['body']

# Modeling

$$c_{map}=\arg\max\limits_{c \in C} \big[\log P(c) + \sum_{i=1}^n{\log P(w_i|c)}\big]$$

$$c_{map}=\arg\max\limits_{c \in C} \bigg[\log \frac{D_c}{D} + \sum_{i \in Q}{\log \frac{W_{ic} + 1}{|V| + \sum_{i' \in V} W_{i'c}}}\bigg]$$

In [21]:
from BayesClassifier import BayesClassifier

In [22]:
BC = BayesClassifier()

In [23]:
BC.fit(X, y)

In [24]:
BC.score(X, y)

0.9770642201834863

In [25]:
cv = []
for uniq in df['part'].unique():
    mask = (df['part'] == uniq)
    cv.append([mask[~mask].index, mask[mask].index])
cv[0]

[Int64Index([ 109,  110,  111,  112,  113,  114,  115,  116,  117,  118,
             ...
             1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089],
            dtype='int64', length=981),
 Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
             ...
              99, 100, 101, 102, 103, 104, 105, 106, 107, 108],
            dtype='int64', length=109)]

In [26]:
%%time
cv_score = cross_val_score(BC, X, y, scoring='f1', cv=cv, n_jobs=-1)

CPU times: user 175 ms, sys: 181 ms, total: 357 ms
Wall time: 11.6 s


In [27]:
cv_score.mean()

0.9514426840853663

### Hide-input mode

In [47]:
from IPython.display import HTML

In [48]:
# HTML('''<script>
# code_show=true; 
# function code_toggle() {
#  if (code_show){
#  $('div.input').hide();
#  } else {
#  $('div.input').show();
#  }
#  code_show = !code_show
# } 
# $( document ).ready(code_toggle);
# </script>
# The raw code for this IPython notebook is by default hidden for easier reading.
# To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')