In [117]:
# Import modules
# For data manipulation
import pandas as pd
# For matrix operations
import numpy as np
# For numerical division
from __future__ import division
# For regular expression (text cleaning)
import re



In [118]:
# Load data set and display a few observations
data = pd.read_csv('C:\\code\\ML-practicE\\shit-easy projects for practice smth\\algorithms without libs\\datasets\\data_spam.csv')
# Create column labels
data.columns = ["content","label"]
data.head()


Unnamed: 0,content,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


In [119]:
# Add another column with corresponding comment length
data['length'] = data['content'].map(lambda text: len(str(text).split()))

# Summary statistics (mean, stdev, min, max)
data[["label","length"]].describe()

Unnamed: 0,label,length
count,3000.0,3000.0
mean,0.166667,235.882667
std,0.37274,562.174964
min,0.0,0.0
25%,0.0,67.0
50%,0.0,134.0
75%,0.0,235.0
max,1.0,13303.0


In [120]:
# Set seed so we get same random allocation on each run of code
np.random.seed(2017)

# Add column vector of randomly generated numbers form U[0,1]
data["uniform"] = np.random.uniform(0,1,len(data.index)) 

# About 75% of these numbers should be less than 0.75
data_train = data[data["uniform"] < 0.75]

# About 25% of these numbers should be more than 0.75
data_test = data[data["uniform"] > 0.75]

# Check that both training and test data have both spam and ham comments
data_train["label"].describe()

count    2254.000000
mean        0.161491
std         0.368065
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: label, dtype: float64

In [121]:
# Test data summary statistics
data_test["label"].describe()

count    746.000000
mean       0.182306
std        0.386355
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
Name: label, dtype: float64

In [122]:
# Join all the comments into a big list
training_list_words = "".join(data_train.iloc[:,0].values)

# Split the list of comments into a list of unique words
train_unique_words = set(training_list_words.split(' '))

# Number of unique words in training 
vocab_size_train = len(train_unique_words)

# Description of summarized comments in training data
print('Unique words in training data: %s' % vocab_size_train)
print('First 5 words in our unique set of words: \n % s' % list(train_unique_words)[1:6])

Unique words in training data: 26393
First 5 words in our unique set of words: 
 ['sachs', 'personnal', 'being', 'plug', 'coords']


In [123]:
# Only keep letters and numbers
train_unique_words = [re.sub(r'[^a-zA-Z0-9]','', words) for words in train_unique_words]

# Convert to lower case and get unique set of words
train_unique_words = set([words.lower() for words in train_unique_words])

# Number of unique words in training 
vocab_size_train = len(train_unique_words)

# Description of summarized comments in training data
print('Unique words in processed training data: %s' % vocab_size_train)
print('First 5 words in our processed unique set of words: \n % s' % list(train_unique_words)[1:6])

Unique words in processed training data: 25678
First 5 words in our processed unique set of words: 
 ['sachs', 'personnal', 'being', 'plug', 'coords']


In [124]:
# Dictionary with comment words as "keys", and their label as "value"
trainPositive = dict()
trainNegative = dict()

# Intiailize classes
positiveTotal = 0
negativeTotal = 0

# Initialize Prob. of
pSpam = 0.0
pNotSpam = 0.0

# Laplace smoothing
alpha = 1

In [125]:
#def initialize_dicts():

# Initialize dictionary of words and their labels   
for word in train_unique_words:
    
    # Classify all words for now as ham (legitimate)
    trainPositive[word] = 0
    trainNegative[word] = 0

In [140]:
# Count number of times word in comment appear in spam and ham comments
def processEmail(email,label):
    global positiveTotal
    global negativeTotal
    
    # Split comments into words
    email = email.split(' ')
    
    # Go over each word in email
    for word in email:
        
        # ham commments
        if(label == 0 and word != ' '):
            
            # Increment number of times word appears in ham emails
            trainNegative[word] = trainNegative.get(word,0)+1
            negativeTotal += 1
            
        # spam comments
        elif(label == 1 and word != ' '):
            
            # Increment number of times word appears in spam comments
            trainPositive[word] = trainPositive.get(word,0)+1
            positiveTotal += 1

1


In [127]:
# Define Prob(word|spam) and Prob(word|ham)
def conditionalWord(word,label):
    
    # Laplace smoothing parameter
    global alpha
    
    # word in ham email
    if(label == 0):
        # Compute Prob(word|ham)
        return (trainNegative.get(word,0)+alpha)/(float)(negativeTotal+alpha*vocab_size_train)
    
    # word in spam email
    else:
        
        # Compute Prob(word|ham)
        return (trainPositive.get(word,0)+alpha)/(float)(positiveTotal+alpha*vocab_size_train)

In [128]:
# Define Prob(spam|comment) or Prob(ham|comment)
def conditionalEmail(email,label):
    
    # Initialize conditional probability
    prob_label_email = 1.0
    
    # Split comments into list of words
    email = str(email).split(' ')
    
    # Go through all words in emails
    for word in email:
        
        # Compute value proportional to Prob(label|email)
        # Conditional indepdence is assumed here
        prob_label_email *= conditionalWord(word,label)
    
    return prob_label_email

In [129]:
# Train naive bayes by computing several conditional probabilities in training data
def train():
    
    print('Starting training')
    global pSpam
    global pNotSpam

    # Initiailize 
    total = 0
    numNegative = 0
    
    # Go over each email in training data
    for idx, email in data_train.iterrows():
        
        # Comment is ham 
        if email.label == 0:
            
            # Increment ham email counter
            numNegative += 1
        
        # Increment comment number
        total += 1
        
        # Update dictionary of ham and spam comments
        processEmail(email.content,email.label)
    
    # Compute prior probabilities, P(spam), P(ham)
    pSpam = numNegative/float(total)
    pNotSpam = (total - numNegative)/float(total)
    
    print('Training is now finished')

In [130]:
train()

Starting training


NameError: name 'ttrainNegative' is not defined

In [131]:
# Classify comment are spam or ham
def classify(email):
    
    global pSpam
    global pNotSpam
    
    # Compute value proportional to Pr(comment|ham)
    isNegative = pSpam * conditionalEmail(email,0)
    
    # Compute value proportional to Pr(comment|spam)
    isPositive = pNotSpam * conditionalEmail(email,1)
    
    # Output True = spam, False = ham
    return (isNegative < isPositive)



In [132]:
# Initialize spam prediction in test data
prediction_test = []

# Get prediction accuracy on test data
for email in data_test["content"]:

    # Classify comment 
    prediction_test.append(classify(email))

# Check accuracy
test_accuracy = np.mean(np.equal(prediction_test, data_test["label"]))

#print prediction_test
print("Proportion of comments classified correctly on test set: %s" % test_accuracy)



Proportion of comments classified correctly on test set: 0.8176943699731903


In [133]:
classify('FREE gift special for you right now!!!')

False

In [134]:
# Compute tfidf(word, comment, data)
def TFIDF(email, train):
    
    # Split comment into list of words
    email = email.split(' ')
    
    # Initiailize tfidf for given comment
    tfidf_email = np.zeros(len(email))
    
    # Initiailize number of comments containing a word
    num_email_word = 0
    
    # Intialize index for words in email
    word_index = 0
    
    # Go over all words in comment
    for word in email:
        
        # Compute term frequence (tf)
        # Count frequency of word in email
        tf = email.count(word)
        
        # Find number of emails containing word
        for text in train["content"]:
            
            # Increment word counter if word found in comment
            if text.split(' ').count(word) > 0:
                num_email_word += 1
        
        # Compute inverse document frequency (idf)
        # log(Total number of emails/number of comments with word)
        if num_email_word == 0:
            idf = np.log(len(train.index)/(num_email_word+1))
        else:
            idf = np.log(len(train.index)/num_email_word)
        
        # Update tf-idf weight for word
        tfidf_email[word_index] = tf*idf
        
        # Reset comment containing word counter
        num_email_word = 0
        
        # Move onto next word in email
        word_index += 1
        
    return tfidf_email

In [135]:
TFIDF("Check out my new music video plz",data_train)

array([7.72046169, 1.27792153, 1.21020335, 1.49195069, 3.98279208,
       4.35316586, 7.72046169])

In [136]:
classify("")

False