In [9]:
import pandas as pd
df = pd.read_csv('spam.csv', encoding='cp1252')

In [10]:
df.shape

(5572, 5)

In [14]:
df= df[['v1','v2']]
df[:10]

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


# TFIDF Features

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

In [35]:
vect = CountVectorizer()

pipeline = Pipeline([
    ('vect', vect),
    ('tfidf', TfidfTransformer())
])

In [82]:
list(vect.vocabulary_.keys())

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat',
 'ok',
 'lar',
 'joking',
 'wif',
 'oni',
 'free',
 'entry',
 'wkly',
 'comp',
 'to',
 'win',
 'fa',
 'cup',
 'final',
 'tkts',
 '21st',
 'may',
 '2005',
 'text',
 '87121',
 'receive',
 'question',
 'std',
 'txt',
 'rate',
 'apply',
 '08452810075over18',
 'dun',
 'say',
 'so',
 'early',
 'hor',
 'already',
 'then',
 'nah',
 'don',
 'think',
 'he',
 'goes',
 'usf',
 'lives',
 'around',
 'here',
 'though',
 'freemsg',
 'hey',
 'darling',
 'it',
 'been',
 'week',
 'now',
 'and',
 'no',
 'word',
 'back',
 'like',
 'some',
 'fun',
 'you',
 'up',
 'for',
 'still',
 'tb',
 'xxx',
 'chgs',
 'send',
 '50',
 'rcv',
 'even',
 'my',
 'brother',
 'is',
 'not',
 'speak',
 'with',
 'me',
 'they',
 'treat',
 'aids',
 'patent',
 'as',
 'per',
 'your',
 'request',
 'melle',
 'oru',
 'minnaminunginte',
 'nurungu',
 'vettam',
 'has',
 's

In [36]:
tfidf_features = pipeline.fit_transform( df['v2'].values)

In [65]:
tfidf_features = tfidf_features.toarray()

In [74]:
# Character vectorizer
import numpy as np
def get_n_char_vectorizer(values, n):
    all_text = ' '.join(values).lower();
    dictionary={}
    for i in range(0, (len(all_text)-n+1)):
        ch_gram = all_text[i:i+n]
        if ch_gram not in dictionary:
            dictionary[ch_gram] = 0
        dictionary[ch_gram] = dictionary[ch_gram]+1
    
    
        
    
    features_arr = []
    for value in values:
        feature = np.zeros(len(dictionary))
        val=value.lower()
        for i in range(0, (len(value)-n+1)):
            ch_gram = val[i:i+n]
            index = list(dictionary.keys()).index(ch_gram)
            feature[index]=feature[index]+1
        features_arr.append(feature)
    
    return features_arr, list(dictionary.keys())

In [75]:
char_features, char_dict = get_n_char_vectorizer(df['v2'].values,3)

In [80]:
char_dict[:5]

['go ', 'o u', ' un', 'unt', 'nti']

In [76]:
len(char_features)

5572

In [77]:
features = np.hstack((tfidf_features,char_features))

In [78]:
len(features)

5572

In [87]:
feature_names = list(vect.vocabulary_.keys()) + char_dict

In [126]:
data_df = pd.DataFrame(data=features, columns = feature_names)
data_df = data_df.loc[:, ~data_df.columns.duplicated()]

In [312]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_df[feature_names].values, df['v1'],
                                                    stratify=df['v1'], 
                                                    test_size=0.25, shuffle=True, )

In [313]:
def calc_likelihood(data, y):
    prob_spam = data[y == 'ham'].sum(axis=0)
    prob_ham = data[y == 'spam'].sum(axis=0)
    prob_spam /= np.sum(prob_spam)
    prob_ham /= np.sum(prob_ham)
    return np.vstack((prob_ham, prob_spam)).T

In [314]:
def predict(X_test, priors, likelihood): 
    prob = X_test[:, :, np.newaxis] * likelihood[np.newaxis, :, :]
    prob[prob==0] = 1
    score = np.exp(np.log(prob).sum(axis=1)) * priors
    score[score.max(axis=1)==0] = eps
    return score / score.sum(axis=1)[:, np.newaxis]

In [315]:
eps = 1e-15
priors = y_train.value_counts(normalize=True).values

In [316]:
priors

array([0.86599665, 0.13400335])

In [317]:
likelihood = calc_likelihood(X_train, y_train)
likelihood[likelihood==0] = eps
likelihood[likelihood==1] = 1 - eps
pred_probs = predict(X_test, priors, likelihood)
proba = pred_probs[:, 1]
pred = np.zeros_like(proba)
pred[proba>=0.5] = 1

In [331]:
y= [0 if y=='spam' else 1 for y in  y_test]

In [333]:
from sklearn.metrics import f1_score
f1_score( y, pred)  

0.927906976744186

In [334]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)

y_predicted = clf.predict(X_test)

In [335]:
clf.score(X_test,y_test)

0.9468772433596554

In [336]:
df[df['v1']=='ham'].shape[0]/df['v1'].shape[0]

0.8659368269921034

In [337]:
from sklearn.metrics import f1_score
f1_score(y_test, y_predicted, average='macro')  

0.8946086650689917