## Notebook Imports

In [49]:
import pandas as pd 
import numpy as np 

## Constants

In [109]:
VOCAB_SIZE = 2500

TRAINING_DATA_FILE = '/home/goodluck/Desktop/Data/Classification/training-data.txt'
TEST_DATA_FILE = '/home/goodluck/Desktop/Data/Classification/test-data.txt'

TOKEN_SPAM_PROB_FILE = '/home/goodluck/Desktop/Data/Classification/prob-spam.txt'
TOKEN_HAM_PROB_FILE = '/home/goodluck/Desktop/Data/Classification/prob-ham.txt'
TOKEN_ALL_PROB_FILE = '/home/goodluck/Desktop/Data/Classification/prob-all-tokens.txt'

TEST_FEATURES_MATRIX = '/home/goodluck/Desktop/Data/Classification/testing-features.txt'
TEST_TARGET_FILE = '/home/goodluck/Desktop/Data/Classification/testing-target.txt'

## Read and Load features from .txt Files into Numpy Array

In [51]:
sparse_train_data = np.loadtxt(TRAINING_DATA_FILE, delimiter= ' ', dtype=int)
sparse_test_data = np.loadtxt(TEST_DATA_FILE, delimiter= ' ', dtype=int)

In [52]:
print('Number of rows in the training file:', sparse_train_data.shape[0])
print('Number of rows in the testing file:', sparse_test_data.shape[0])

Number of rows in the training file: 290320
Number of rows in the testing file: 131590


In [53]:
print('the number of emails in the training file:', np.unique(sparse_train_data[:, 0]).size)

the number of emails in the training file: 4781


In [54]:
print('the number of emails in the Testing file:', np.unique(sparse_test_data[:, 0]).size)

the number of emails in the Testing file: 2056


### How to create an empty data Frame

In [55]:
column_names = ['DOC_ID'] + ['CATEGORY'] + list(range(0, VOCAB_SIZE))
column_names[:5]

['DOC_ID', 'CATEGORY', 0, 1, 2]

In [56]:
len(column_names)

2502

In [57]:
index_names = np.unique(sparse_train_data[:, 0])
index_names

array([   0,    1,    2, ..., 6893, 6894, 6895])

In [58]:
full_train_data=pd.DataFrame(index=index_names, columns=column_names)
full_train_data.fillna(value=0)

Unnamed: 0,DOC_ID,CATEGORY,0,1,2,3,4,5,6,7,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6888,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6892,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6893,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6894,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Create a full Matrix from a Sparse Matrix

In [59]:
def make_full_matrix (sparse_matrix, nr_words, doc_idx = 0, word_idx=1, cat_idx=2, frequency_idx =3):
    """
    form a full matrix from a sparse matrix, return a pandas dataframe
    
    keyword and Argument:
    nr_words -- size of the voucblary . Total  number of tokens.
    doc_idx -- position of the document id in the sparse matrix. defaultis 1st colun
    word_idx -- position of the word id in the sparse matrix, default secon column
    cat_idx -- psoition of the label (spamis 1 o is for non spam)
    freq_idx -- position of correctness of word in sparse matrix, default is fourth column 
    """
    column_names = ['DOC_ID'] + ['CATEGORY'] + list(range(0, VOCAB_SIZE))
    doc_id_names = np.unique(sparse_matrix[:, 0])
    full_matrix=pd.DataFrame(index=doc_id_names, columns=column_names)
    full_matrix.fillna(value=0)
    
    for i in range (sparse_matrix.shape[0]):
        doc_nr = sparse_matrix[i][doc_idx]
        word_id = sparse_matrix[i][word_idx]
        label = sparse_matrix[i][cat_idx]
        occurance = sparse_matrix[1][frequency_idx]
        
        full_matrix.at[doc_nr, 'DOC_ID'] = doc_nr
        full_matrix.at[doc_nr, 'CATEGORY'] = label
        full_matrix.at[doc_nr, word_id] = occurance
        
    full_matrix.set_index('DOC_ID', inplace = True)
    return full_matrix

In [60]:
%%time
full_train_data = make_full_matrix(sparse_train_data,VOCAB_SIZE)

CPU times: user 15.1 s, sys: 239 ms, total: 15.4 s
Wall time: 15.4 s


### Training the Naive Bayes classifier
#### Calculating the probability of a spam

In [61]:
prob_spam = full_train_data.CATEGORY.sum()/ full_train_data.CATEGORY.size

In [62]:
print('The probabilty of a spam email is:', round(prob_spam, 2))

The probabilty of a spam email is: 0.26


## Total Number of Words / Tokens

In [63]:
full_train_features = full_train_data.loc[:, full_train_data.columns != 'CATEGORY']
full_train_features.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,2.0,2.0,2.0,2.0,2.0,2.0,,,...,,,,,,,,,,
1,2.0,,2.0,,2.0,,2.0,,,,...,,,,,,,,,,
2,,,,2.0,,,,2.0,,2.0,...,,,,,,,,,,
3,,,,,,2.0,,,,,...,,,,,,,,,,
4,,,2.0,,2.0,2.0,2.0,,2.0,2.0,...,,,,,,,,,,


In [64]:
email_leangth = full_train_features.sum(axis=1)
email_leangth.shape

(4781,)

In [65]:
email_wc = email_leangth.sum()
email_wc

580640.0

In [72]:
total_wc = email_leangth.sum()
total_wc


580640.0

## Number of Tokens in spam or Ham

In [66]:
spam_length = email_leangth[full_train_data.CATEGORY == 1]
spam_length.shape

(1246,)

In [67]:
spam_wc = spam_length.sum()
spam_wc

192926.0

In [68]:
ham_lengths = email_leangth[full_train_data.CATEGORY ==0 ] 
ham_lengths.shape

(3535,)

In [69]:
email_leangth.shape[0] - spam_length.shape[0] - ham_lengths.shape[0]



580640.0

In [70]:
nonspam_wc  = ham_lengths.sum()
nonspam_wc

387714.0

In [75]:
spam_wc + nonspam_wc - total_wc

0.0

In [77]:
print("The average number of words in spam e-mails", round(spam_wc/spam_length.shape[0]))

The average number of words in spam e-mails 155


In [83]:
print("The average number of words in ham e-mails", round(nonspam_wc/ham_lengths.shape[0]))

The average number of words in ham e-mails 110


### Summing the tokens Occuring In Spam

In [86]:
train_spam_token = full_train_features.loc[full_train_data.CATEGORY == 1]
train_spam_token.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,2.0,2.0,2.0,2.0,2.0,2.0,,,...,,,,,,,,,,
1,2.0,,2.0,,2.0,,2.0,,,,...,,,,,,,,,,
2,,,,2.0,,,,2.0,,2.0,...,,,,,,,,,,
3,,,,,,2.0,,,,,...,,,,,,,,,,
4,,,2.0,,2.0,2.0,2.0,,2.0,2.0,...,,,,,,,,,,


In [87]:
train_spam_token.shape

(1246, 2500)

In [89]:
summed_spam_tokens = train_spam_token.sum(axis=0)+1

### Summing the tokens occuring in Ham 

In [93]:
train_ham_tokens = full_train_features.loc[full_train_data.CATEGORY ==0]
summed_ham_tokens = train_ham_tokens.sum(axis=0)+1

### P(Token|spam) - Probability that a token occurs the email is spam

In [95]:
prob_token_spam = summed_spam_tokens / (spam_wc + VOCAB_SIZE)

### P(Token|Ham) - Probability that a token occurs the email is Ham

In [96]:
prob_token_nonspam = summed_ham_tokens / (nonspam_wc + VOCAB_SIZE)

### P(Token) - Probability that a token occurs

In [97]:
 prob_token_all = full_train_features.sum(axis=0) / total_wc

In [98]:
prob_token_all.sum()

1.0

### Saving the trained Model

In [108]:
np.savetxt(TOKEN_SPAM_PROB_FILE, prob_token_spam)
np.savetxt(TOKEN_HAM_PROB_FILE, prob_token_nonspam)
np.savetxt(TOKEN_ALL_PROB_FILE, prob_token_all)

## Prepare Test Data

In [110]:
%%time
full_test_data = make_full_matrix(sparse_test_data, nr_words=VOCAB_SIZE)

CPU times: user 5.31 s, sys: 96.4 ms, total: 5.41 s
Wall time: 5.37 s


In [111]:
x_test = full_test_data.loc[:, full_test_data.columns != 'CATEGORY']
y_test = full_test_data.CATEGORY

In [113]:
np.savetxt(TEST_TARGET_FILE, y_test)
np.savetxt(TEST_FEATURES_MATRIX, x_test)