# This workbook uses a known spam email's text to identify similar emails


In [299]:
# import pandas to handle dataframes
import pandas as pd

# import emails.csv as a data frame
df = pd.read_csv('emails.csv')
print (df.sample(5))

# note that the meta_category is something I used to classify these messages.
# These are fake emails and this is a classification that I manually created to measure results
# Use meta_category to check results!

     email_id date_sent           recipient_address  \
65      15423   12/7/17    greta_davis@yourmail.com   
21      15379   12/3/17    elle_fuller@yourmail.com   
76      15434   12/8/17  felicia_evans@yourmail.com   
215     15573  12/29/17      ken_black@yourmail.com   
112     15470  12/12/17     carol_hull@yourmail.com   

                                               message  spam_boolean  \
65   Hello,  I'm selling vi.agara 99 month;y supply...             1   
21   Greetings,  Selling v-i-a-g-a-r-a 99 monthly s...             1   
76   Greetings,  Want to buy v I a g a r a 99 per m...             1   
215  Hey, Let's talk about that work thing. XOXOXOX...             0   
112    Hello Carol, Are you available Friday? -Michael             0   

    meta_category  
65   spam_pattern  
21   spam_pattern  
76   spam_pattern  
215      personal  
112      personal  


In [300]:
# create a new dataframe including only spam comments
spam_df = df[df['spam_boolean'] == 1]
print(spam_df.sample(5))

     email_id date_sent         recipient_address  \
217     15575  12/30/17     ann_king@yourmail.com   
191     15549  12/25/17   carol_hull@yourmail.com   
39      15397   12/5/17  elle_fuller@yourmail.com   
178     15536  12/23/17   carol_hull@yourmail.com   
31      15389   12/4/17   john_clark@yourmail.com   

                                               message  spam_boolean  \
217  Hello, I am contacting you to present you an o...             1   
191  I am diplomat agent Mr. John Lee in charge of ...             1   
39   Dear sir,  Selling v-i-a-g-a-r-a 99$ monthly s...             1   
178  I am diplomat agent Mr. John Lee in charge of ...             1   
31   Dear sir,  Want to buy v I a g a r a 99$ month...             1   

    meta_category  
217    spam_other  
191    spam_other  
39   spam_pattern  
178    spam_other  
31   spam_pattern  


# Use sequence matcher to compute a match score between two strings

In [301]:
# This is the type of spam comment we want to identify:
training_email_id = 15416
training_email_message = str(
    df[
        df['email_id'] == training_email_id
    ].iloc[0]['message']
)
print(training_email_message)

Hello-  I sell viagara 99 monthly supply goto --> http://www.get_v_now.com/sfdsretg


In [302]:
# Use difflib's SequenceMatcher to return a match score for any two strings
import difflib
from difflib import SequenceMatcher
from functools import partial
# documentation: https://docs.python.org/2/library/difflib.html

#this function will return a match score for the text of any two emails
def apply_sequence_match(email_id_1, email_id_2, column_name): 
    email_text_1 = str(df[df['email_id'] == email_id_1].iloc[0][column_name])
    email_text_2 = str(df[df['email_id'] == email_id_2].iloc[0][column_name])
    match_score = difflib.SequenceMatcher(None, email_text_1, email_text_2).ratio()
        # None = Optional argument isjunk must be None (the default) or a one-argument function that takes a sequence element and returns true if and only if the element is “junk” and should be ignored. 
        # Passing None for isjunk is equivalent to passing lambda x: 0; in other words, no elements are ignored. 
    print("email 1 | id = "+str(email_id_1)+ " | message = ")
    print(email_text_1+"\n")
    print("email 2 | id = "+str(email_id_2)+ " | message = ")
    print(email_text_2+"\n")
    print("Match score = "+str(match_score))
        


In [303]:
# compare two KNOWN spam comments of the same pattern to return a match score
apply_sequence_match(15398, 15416, 'message')

email 1 | id = 15398 | message = 
Dear sir,  We are selling v I a g a r a 97$ month;y supply GO TO http://bit.ly/1S4JKID

email 2 | id = 15416 | message = 
Hello-  I sell viagara 99 monthly supply goto --> http://www.get_v_now.com/sfdsretg

Match score = 0.520710059172


# Use sequence matcher to compute a match score between a training email message and an entire dataframe

In [304]:
#this function uses a single training comment's text to compute a match score for all rows in the dataframe

def apply_sequence_match_dataframe(df_row, training_message, column_name, rounding_decimals, dataframe): 
    match_score = difflib.SequenceMatcher(None, training_message, str(df_row[column_name])).ratio()
    match_score_rounded = round(match_score,rounding_decimals)
    return match_score_rounded

In [305]:
# create a new row in dataframe for each row's score 
df['match_score'] = df.apply(
    partial(
        apply_sequence_match_dataframe, 
        training_message = training_email_message, 
        column_name = 'message',
        rounding_decimals = 4,
        dataframe = df
    )
    , axis=1)

# rounded scores create buckets for visualizations
df['match_score_rounded'] = df.apply(
    partial(
        apply_sequence_match_dataframe, 
        training_message = training_email_message, 
        column_name = 'message',
        rounding_decimals = 1,
        dataframe = df
    )
    , axis=1)

In [306]:
#Create a pivot table, looking at the distribution between true spam comments and true non-spam comments
df.pivot_table(
    values ='email_id',
    index='match_score_rounded',
    columns ='spam_boolean',
    aggfunc ='count',
    fill_value=0
)

spam_boolean,0,1
match_score_rounded,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,3,0
0.1,60,0
0.2,53,25
0.3,33,0
0.4,0,1
0.5,0,11
0.6,0,16
0.7,0,14
0.8,0,6
0.9,0,1


Per the chart above, note that non-spam comments never have a score above 0.35 (the score is rounded to the nearest tenth).  Spam comments are strangely distributed: A large chunk at 0.15-0.25 and another set clustering around 0.6.  

As I mentioned above, there are emails in the spam set that aren't of the same pattern we're testing, like this one: "Hello, I am contacting you to present you an opportunity to receive $11.5M US Dollars without contravening the law. Reply if you are interested. Regards, Andre."

Since I've already classified the types of emails in the 'meta_category' column, let's look at that distribution below.

In [307]:
df.pivot_table(
    values ='email_id',
    index='match_score_rounded',
    columns ='meta_category',
    aggfunc ='count',
    fill_value=0
)

meta_category,marketing,personal,spam_other,spam_pattern
match_score_rounded,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,0,3,0,0
0.1,12,48,0,0
0.2,6,47,25,0
0.3,0,33,0,0
0.4,0,0,0,1
0.5,0,0,0,11
0.6,0,0,0,16
0.7,0,0,0,14
0.8,0,0,0,6
0.9,0,0,0,1


In [308]:
# Given a minimum score, how many "spam pattern" comments fall above and below it?
min_score = 0.5
summary_df = df[df['meta_category'] == 'spam_pattern']

print("Spam pattern emails with score of >= "+str(min_score)+": ")
print((summary_df[summary_df['match_score'] >= min_score])['email_id'].count())
print("Spam pattern emails with score of < "+str(min_score)+": ")
print((summary_df[summary_df['match_score'] < min_score])['email_id'].count())

Spam pattern emails with score of >= 0.5: 
46
Spam pattern emails with score of < 0.5: 
4


Now, you can see that all of those 0.15-0.25 scored comments are the "other" spam pattern.

For purposes of this exercise, using a spam score of >= 0.5 would give us:
- 46 comments correctly identified as part of this spam pattern
- 0 false positives (non-spam comments identified as part of this pattern)
- 4 false negative (is spam pattern, but has match score lower than the threshold)

We could lower the threshold to 0.4, which would produce no false negatives, OR we could clean up the message strings to see if that helps (see below)


# We can also layer in functions which clean up strings and recalculate the match score

In [309]:
# make message text lowercase
def lower_string(message):
    string = str(message)
    string_lower = string.lower()
    if len(string) <= 0:
        return ""
    else: 
        return string_lower
    
# Here is an example to illustrate what this function does: 
example_string = 'Lauren wAs HERE'

print(example_string
     + "  -->  "
     + lower_string(example_string))

Lauren wAs HERE  -->  lauren was here


In [310]:
# make sure string is CLEANED of adjacent duplicates before looking for dictionary words
def remove_adjacent_duplicate_text(message):
    string = str(message)
    # remove adjacent duplicates
    cleaned_msg = ''.join(ch for ch, _ in itertools.groupby(string))
    if len(string) <= 0:
        return ""
    else: 
        return cleaned_msg
    
    
# Here is an example to illustrate what this function does: 
example_string = 'laurrrennnn wassss    heerrree'

print(example_string
     + "  -->  "
     + remove_adjacent_duplicate_text(example_string))

laurrrennnn wassss    heerrree  -->  lauren was here


In [311]:
import re #for 

def strip_illegal_characters(message):
    string = str(message)
    cleaned_stripped_msg = re.sub('[>!@#$->_]', '', message)
    if len(string) <= 0:
        return ""
    else: 
        return cleaned_stripped_msg
    
    
# Here is an example to illustrate what this function does: 
example_string = 'lauren>>> was_ !here!'

print(example_string
     + "  -->  "
     + strip_illegal_characters(example_string))

lauren>>> was_ !here!  -->  lauren was here


In [312]:
# Apply all functions to dataframe in any order:
df['cleaned_message'] = df['message']
df['cleaned_message'] = df['cleaned_message'].apply(lower_string) 
df['cleaned_message'] = df['cleaned_message'].apply(remove_adjacent_duplicate_text) 
df['cleaned_message'] = df['cleaned_message'].apply(strip_illegal_characters) 

# Now that we have nifty new ways of cleaning text, does it make a difference in the match score computation?

In [313]:
# Make message lowercase and recalculate the match score
_email_id = 15416 #always use this for training email

new_col_name = 'lower_string'
df[target_column] = df['message'].apply(lower_string) 
_email_message = lower_string(str(df[df['email_id'] == _email_id].iloc[0]['message']))


df[new_col_name] = df.apply(
    partial(
        apply_sequence_match_dataframe, 
        training_message = training_email_message, 
        column_name = target_column,
        rounding_decimals = 1,
        dataframe = df
    )
    , axis=1)

df.pivot_table(
    values ='email_id',
    index= new_col_name,
    columns ='meta_category',
    aggfunc ='count',
    fill_value=0
)

meta_category,marketing,personal,spam_other,spam_pattern
lower_string,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,0,5,0,0
0.1,5,51,0,0
0.2,13,42,25,0
0.3,0,33,0,0
0.5,0,0,0,9
0.6,0,0,0,16
0.7,0,0,0,17
0.8,0,0,0,6
0.9,0,0,0,1
1.0,0,0,0,1


Converting the text to lowercase DOES help improve the accuracy of the match function, without increasing the incidence of false positives!  If we were training a model around the sample comment, the lower_message function should be included in the final model.

In [314]:
# Remove adjacent duplicate text
_email_id = 15416 #always use this for training email

new_col_name = 'remove_adjacent_duplicate_text'
df[target_column] = df['message'].apply(remove_adjacent_duplicate_text) 
_email_message = remove_adjacent_duplicate_text(str(df[df['email_id'] == _email_id].iloc[0]['message']))


df[new_col_name] = df.apply(
    partial(
        apply_sequence_match_dataframe, 
        training_message = training_email_message, 
        column_name = target_column,
        rounding_decimals = 1,
        dataframe = df
    )
    , axis=1)

df.pivot_table(
    values ='email_id',
    index= new_col_name,
    columns ='meta_category',
    aggfunc ='count',
    fill_value=0
)


meta_category,marketing,personal,spam_other,spam_pattern
remove_adjacent_duplicate_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,0,4,0,0
0.1,12,47,8,0
0.2,6,44,17,0
0.3,0,36,0,8
0.4,0,0,0,7
0.5,0,0,0,8
0.6,0,0,0,18
0.7,0,0,0,3
0.8,0,0,0,5
0.9,0,0,0,1


Removing adjacent duplicate characters actually weakens the spam score to the point where some "real" emails' scores are higher than some spam email scores.  This function would not be included in the final model for this particular case, but if spammers were doing something like trying to confuse filters by changing a word like "google" into "gooooogggle" or "goooggleee", it might be a good function to add.

In [315]:
# Make message lowercase and recalculate the match score
_email_id = 15416 #always use this for training email

new_col_name = 'strip_illegal_characters'
df[target_column] = df['message'].apply(strip_illegal_characters) 
_email_message = strip_illegal_characters(str(df[df['email_id'] == _email_id].iloc[0]['message']))


df[new_col_name] = df.apply(
    partial(
        apply_sequence_match_dataframe, 
        training_message = training_email_message, 
        column_name = target_column,
        rounding_decimals = 1,
        dataframe = df
    )
    , axis=1)

df.pivot_table(
    values ='email_id',
    index= new_col_name,
    columns ='meta_category',
    aggfunc ='count',
    fill_value=0
)

meta_category,marketing,personal,spam_other,spam_pattern
strip_illegal_characters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,0,5,0,0
0.1,7,41,0,0
0.2,11,41,25,0
0.3,0,44,0,3
0.4,0,0,0,6
0.5,0,0,0,12
0.6,0,0,0,19
0.7,0,0,0,4
0.8,0,0,0,5
0.9,0,0,0,1


Stripping "illegal" characters actually weakens the spam score to the point where some "real" emails' scores are higher than some spam email scores.  This function would not be included in the final model for this particular case, but it might work for other spam patterns.