In [134]:
import pandas as pd
import re
from collections import defaultdict
import math

In [23]:
df = pd.read_csv('spam.csv', encoding = '437')
df = df[['v1','v2']]

In [51]:
df['v1'].loc[df['v1'] == 'ham'] = 0
df['v1'].loc[df['v1'] == 'spam'] = 1
df = df[['v2','v1']]


In [114]:
df = df.rename(columns={"v2": "text", "v1": "spam"})
test = df.tail(571)
df = df.head(5000)

In [117]:
df

Unnamed: 0,text,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
4995,Happy new year. Hope you are having a good sem...,0
4996,Esplanade lor. Where else...,0
4997,Can you talk with me..,0
4998,"Hmph. Go head, big baller.",0


In [4]:
# возвращает тупо список слов в сообщении
def tokenize(message):
    message = message.lower()
    all_words = re.findall("[a-z0-9]+", message)
    return set(all_words)

In [118]:
# считаем частотность слов в спамных и не спамных сообщениях
def count_words(data):
    counts = defaultdict(lambda: [0,0])
    for index, row in data.iterrows():
        for word in tokenize(row['text']):
            counts[word][0 if row['spam'] else 1] +=1
    return counts

In [119]:
count_words(df)

defaultdict(<function __main__.count_words.<locals>.<lambda>()>,
            {'available': [3, 15],
             'e': [10, 73],
             'la': [0, 6],
             'got': [6, 203],
             'in': [63, 661],
             'there': [16, 179],
             'until': [5, 21],
             'crazy': [3, 9],
             'world': [1, 33],
             'bugis': [0, 6],
             'wat': [0, 93],
             'n': [10, 103],
             'great': [9, 88],
             'cine': [0, 7],
             'buffet': [0, 2],
             'point': [0, 12],
             'amore': [0, 1],
             'jurong': [0, 1],
             'only': [73, 112],
             'go': [27, 207],
             'u': [120, 631],
             'wif': [0, 24],
             'ok': [5, 248],
             'oni': [0, 3],
             'lar': [0, 32],
             'joking': [0, 3],
             'cup': [5, 3],
             'tkts': [4, 0],
             'entry': [18, 0],
             'receive': [27, 5],
             'a': [267, 791],


In [78]:
# преобразуем частотности в вероятности сглаживанием
def prob(counts, all_spam, all_not_spam, k = 1):
    return [ (word, 
            (frequency[0] + k) / (all_spam + 2*k),
            (frequency[1] + k) / (all_not_spam + 2*k))
            for word, frequency in counts.items()]

In [122]:
all_spam = df['spam'].loc[df['spam'] == 1].count()
all_not_spam = df['spam'].loc[df['spam'] == 0].count()
freq = count_words(df)
all_not_spam

4327

In [125]:
result = prob(freq, all_spam, all_not_spam)
result

[('available', 0.005925925925925926, 0.003696003696003696),
 ('e', 0.016296296296296295, 0.017094017094017096),
 ('la', 0.0014814814814814814, 0.001617001617001617),
 ('got', 0.01037037037037037, 0.04712404712404712),
 ('in', 0.09481481481481481, 0.15292215292215292),
 ('there', 0.025185185185185185, 0.04158004158004158),
 ('until', 0.008888888888888889, 0.005082005082005082),
 ('crazy', 0.005925925925925926, 0.00231000231000231),
 ('world', 0.002962962962962963, 0.007854007854007854),
 ('bugis', 0.0014814814814814814, 0.001617001617001617),
 ('wat', 0.0014814814814814814, 0.021714021714021713),
 ('n', 0.016296296296296295, 0.024024024024024024),
 ('great', 0.014814814814814815, 0.02055902055902056),
 ('cine', 0.0014814814814814814, 0.001848001848001848),
 ('buffet', 0.0014814814814814814, 0.000693000693000693),
 ('point', 0.0014814814814814814, 0.003003003003003003),
 ('amore', 0.0014814814814814814, 0.000462000462000462),
 ('jurong', 0.0014814814814814814, 0.000462000462000462),
 ('o

In [83]:
# Теперь посмотрим вероятность спама
def spam_probability(word_probs, message):
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0
    for word, prob_if_spam, prob_if_not_spam in word_probs:
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)
    prob_if_spam = math.exp(log_prob_if_spam)
    prob_if_not_spam = math.exp(log_prob_if_not_spam)
    return prob_if_spam/(prob_if_spam+prob_if_not_spam)
            

In [90]:
df

Unnamed: 0,v2,v1
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,1
5568,Will ╠_ b going to esplanade fr home?,0
5569,"Pity, * was in mood for that. So...any other s...",0
5570,The guy did some bitching but I acted like i'd...,0


In [102]:
for index, row in df.iterrows():
    df2 = df2.append({'text': row['v2'],'prob': spam_probability(r, row['v2'])}, ignore_index=True)


In [100]:
df2 = pd.DataFrame(columns = ['text','spam','prob'])

In [105]:
df2 = df2[['text','prob']]

Unnamed: 0,text,prob
0,"Go until jurong point, crazy.. Available only ...",2.009888e-10
1,Ok lar... Joking wif u oni...,5.792960e-12
2,Free entry in 2 a wkly comp to win FA Cup fina...,1.000000e+00
3,U dun say so early hor... U c already then say...,3.376797e-13
4,"Nah I don't think he goes to usf, he lives aro...",1.546296e-14
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,1.000000e+00
5568,Will ╠_ b going to esplanade fr home?,4.715439e-11
5569,"Pity, * was in mood for that. So...any other s...",3.366445e-10
5570,The guy did some bitching but I acted like i'd...,1.858099e-13


In [106]:
def func(row):
    if row['prob']>0.5:
        return 1
    else:
        return 0

In [108]:
df2['smap'] = df2.apply(func, axis = 1)
df2.to_excel('output.xlsx')

In [137]:
def final(word_probs, message):
#     Также обработываются сообщения
    message_words = tokenize(message)
#     изначально вероятности спама и неспама равны нулю
    spam_prob = not_spam_prob = 0.0
#     наращиваем вероятности в циклах
    for word, prob_if_spam, prob_if_not_spam in word_probs:
#         если слово в сообщении используем посчитанную вероятность
        if word in message_words:
            spam_prob += math.log(prob_if_spam)
            not_spam_prob += math.log(prob_if_not_spam)
#             если слова нет в сообщении используем вероятность противоположного события, т.е 1 - посчитанная вероятность
        else:
            spam_prob += math.log(1.0 - prob_if_spam)
            not_spam_prob += math.log(1.0 - prob_if_not_spam)
    e_spam_prob = math.exp(spam_prob)
    e_not_spam_prob = math.exp(not_spam_prob)
#     непосредственно формула вероятность по теореме Баейса
    return e_spam_prob/(e_spam_prob+e_not_spam_prob)
            

In [128]:
df_test = pd.DataFrame(columns = ['text','spam_','prob'])

In [138]:
for index, row in test.iterrows():
    df_test = df_test.append({'text': row['text'],'prob': final(result, row['text']), 'spam_': row['spam']}, ignore_index=True)

In [139]:
df_test

Unnamed: 0,text,spam_,prob
0,You still around? Looking to pick up later,0,1.023103e-11
1,CDs 4u: Congratulations ur awarded σú500 of CD...,1,1.000000e+00
2,There's someone here that has a year &lt;#&gt...,0,2.454262e-18
3,Guess which pub im in? Im as happy as a pig in...,0,1.637098e-09
4,ILL B DOWN SOON,0,2.719961e-11
...,...,...,...
566,This is the 2nd time we have tried 2 contact u...,1,1.000000e+00
567,Will ╠_ b going to esplanade fr home?,0,5.719964e-11
568,"Pity, * was in mood for that. So...any other s...",0,3.398890e-11
569,The guy did some bitching but I acted like i'd...,0,3.329873e-13


In [156]:
def func2(row):
    if row['prob']>0.6:
        return 1
    else:
        return 0

In [157]:
df_test['smap_final2'] = df_test.apply(func2, axis = 1)
df_test

Unnamed: 0,text,spam_,prob,smap_final,smap_final2
0,You still around? Looking to pick up later,0,1.023103e-11,0,0
1,CDs 4u: Congratulations ur awarded σú500 of CD...,1,1.000000e+00,1,1
2,There's someone here that has a year &lt;#&gt...,0,2.454262e-18,0,0
3,Guess which pub im in? Im as happy as a pig in...,0,1.637098e-09,0,0
4,ILL B DOWN SOON,0,2.719961e-11,0,0
...,...,...,...,...,...
566,This is the 2nd time we have tried 2 contact u...,1,1.000000e+00,1,1
567,Will ╠_ b going to esplanade fr home?,0,5.719964e-11,0,0
568,"Pity, * was in mood for that. So...any other s...",0,3.398890e-11,0,0
569,The guy did some bitching but I acted like i'd...,0,3.329873e-13,0,0


In [151]:
from sklearn.metrics import accuracy_score,  f1_score

In [147]:
df_test['spam_']

0      0
1      1
2      0
3      0
4      0
      ..
566    1
567    0
568    0
569    0
570    0
Name: spam_, Length: 571, dtype: object

In [154]:
# print("\naccuracy:", accuracy_score(df_test['spam_'], df_test['smap_final']))
# print("\nF1:", f1_score(df_test['spam_'], df_test['smap_final']))

In [158]:
df_test.to_excel('out2.xlsx')