In [582]:
import os, sys
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import re
import numpy as np
from collections import Counter
import nltk
from nltk.stem.porter import PorterStemmer

In [583]:
datasets_folder = "spam_datasets"

In [584]:
def spam_ham_string(x, spam_or_ham):
    if (spam_or_ham != "spam" and spam_or_ham!="ham"):
        print("Error in input string")
        return False
    if spam_or_ham in x:
        return True
    return False

In [585]:
def split_spam_ham_name(all_folders):
    spam_folders = list(filter((lambda x: spam_ham_string(x, "spam")), all_folders))
    ham_folders = list(filter((lambda x: spam_ham_string(x, "ham")), all_folders))
    return ([spam_folders, ham_folders])

In [586]:
def extract_spam_ham_name(datasets_folder):
    current_folder_path = os.path.abspath(os.getcwd()) + "/"
    dataset_path = current_folder_path + datasets_folder + "/"
    all_folders = os.listdir(dataset_path)
    
    spam_folders, ham_folders = split_spam_ham_name(all_folders)
    
    spam_folders_path = [dataset_path + folder + "/" for folder in spam_folders]
    ham_folders_path = [dataset_path + folder + "/" for folder in ham_folders]
    
    return [spam_folders_path, ham_folders_path]

In [587]:
spam_folders_path, ham_folders_path = extract_spam_ham_name(datasets_folder)

In [588]:
spam_folders_path

['/home/hugo/Machine_learning/Machine_learning_course/chap_3/spam_datasets/20030228_spam_2/',
 '/home/hugo/Machine_learning/Machine_learning_course/chap_3/spam_datasets/20021010_spam/',
 '/home/hugo/Machine_learning/Machine_learning_course/chap_3/spam_datasets/20030228_spam/']

In [589]:
ham_folders_path

['/home/hugo/Machine_learning/Machine_learning_course/chap_3/spam_datasets/20030228_easy_ham_2/',
 '/home/hugo/Machine_learning/Machine_learning_course/chap_3/spam_datasets/20030228_hard_ham/',
 '/home/hugo/Machine_learning/Machine_learning_course/chap_3/spam_datasets/20030228_easy_ham/',
 '/home/hugo/Machine_learning/Machine_learning_course/chap_3/spam_datasets/20021010_easy_ham/',
 '/home/hugo/Machine_learning/Machine_learning_course/chap_3/spam_datasets/20021010_hard_ham/']

In [590]:
def separate_header_content(raw_text, file):
    separator_pattern = re.compile(r"\n\n")
    mail = separator_pattern.split(raw_text, maxsplit=1)
    if len(mail)==1:
        heading = mail[0]
        content=""
        print(file)
    else:
        heading = mail[0]
        content = mail[1]
    header_in_content = False
    while(content != "" and header_in_content):
        header_in_content = False
        for pattern in heading_pattern:
            if pattern.match(content) != None:
                mail = separator_pattern.split(content, maxsplit=1)
                header_in_content = True
                content = mail[1]
                heading = heading + mail[0]
    
    return (heading, content)
            

In [591]:
def extract_txt(filepath):
    try:
        f = open(filepath, "r", encoding='utf8', errors='replace')#, 
        text = f.read()
    except:
        return (False, filepath)
    return (True, text)

Patterns in the heading
---

In [592]:

received_pattern = re.compile("^Received:.*\n\t.*\n\t.*\n|^Received:.*\n\t.*\n|^Received:.*\n",
                              flags=re.M|re.X)
messageId_pattern = re.compile("^Message-ID:.*\n",
                              flags=re.M|re.X)
returnpath_pattern = re.compile("^Return-Path:.*\n",
                              flags=re.M|re.X)
from_pattern = re.compile("^From[:\s].*\n",
                         flags=re.M|re.X)
deliver_pattern = re.compile("^Delivered-To:.*\n", 
                            flags=re.M|re.X)
subject_pattern = re.compile("^Subject:.*\n", 
                            flags=re.M|re.X)
date_pattern = re.compile("^Date:.*\n", 
                            flags=re.M|re.X)
content_type_pattern = re.compile("^Content-Type:.*\n", 
                            flags=re.M|re.X)
content_transfer_pattern = re.compile("^Content-Transfer-Encoding:.*\n", 
                            flags=re.M|re.X)
content_pattern = re.compile("^Content-.*:.*\n",
                            flags=re.M|re.X)
mime_pattern = re.compile("^MIME-Version:.*\n", 
                            flags=re.M|re.X)
bcc_pattern = re.compile("^Bcc:.*\n", 
                            flags=re.M|re.X)
importance_pattern = re.compile("^Importance:.*\n", 
                            flags=re.M|re.X)
to_pattern = re.compile("^To:.*\n", 
                            flags=re.M|re.X)
xmailer_pattern = re.compile("^X-Mailer:.*\n", 
                            flags=re.M|re.X)
xpriority_pattern = re.compile("^X-Priority:.*\n", 
                            flags=re.M|re.X)
x_spam_pattern = re.compile("^X-Spam:.*\n",
                           flags=re.M|re.X)
x_pattern = re.compile("^X-.*:.*\n", 
                            flags=re.M|re.X)
replyto_pattern = re.compile("^Reply-To:.*\n", 
                            flags=re.M|re.X)
list_pattern = re.compile("^List-.*:.*\n", 
                            flags=re.M|re.X)
errors_pattern = re.compile("^Errors:.*\n", 
                            flags=re.M|re.X)
hyphen_to_pattern = re.compile("^[\w\-]+-To:.*\n", 
                            flags=re.M|re.X)

heading_pattern = [
    received_pattern, 
    messageId_pattern,
    returnpath_pattern,
    from_pattern,
    deliver_pattern,
    subject_pattern,
    date_pattern,
    content_type_pattern,
    content_transfer_pattern,
    mime_pattern,
    bcc_pattern,
    importance_pattern,
    to_pattern,
    xmailer_pattern,
    xpriority_pattern,
    x_spam_pattern,
    x_pattern,
    replyto_pattern,
    list_pattern,
    errors_pattern,
    hyphen_to_pattern,
    ]

Patterns in the content
----

In [593]:
HTML_pattern = re.compile("\<.*?\>", re.S)
URL_pattern = re.compile("http.*\s", re.I | re.X)
mail_pattern = re.compile("\w+@[\w\.]+\W", re.I)
arrow_pattern = re.compile("^\>", re.M)
price_pattern = re.compile("[\$£]\s{0,2}\d+[,\.]\d+ | [\$£]\s{0,2}\d+ | \d+\s{0,2}[$£] | \d+[,\.]\d+\s{0,2}[$£]", re.X)
one_two_letter_word_pattern = re.compile(r"\b[a-zA-Z]{1,2}\b", re.X | re.M)

punctuation_pattern = re.compile(r"[\?\!]", re.X)
hour_date_pattern = re.compile("\d{4}\s*[a-zA-Z]{2,8}\s*\d{1,2} | \d{1,2}\s*[a-zA-Z]{2,8}\s*\d{4} | \d{1,2}:\d{2}:\d{2}", re.X)

maj_pattern = re.compile("[A-Z]", re.X)
number_pattern = re.compile("\d+", re.X)

Most Used word from previous analysis
-----

In [594]:
most_used_word_subject = [
'perl',
 'free',
 'user',
 'from',
 'and',
 'for',
 'get',
 'sadev',
 'new',
 'use',
 'razor',
 'spam',
 'you',
 'spambay',
 'ilug',
 'best',
 'satalk',
 'with',
 'adv',
 'the',
 'mortgag',
 'apt',
 'your',
 'spamassassin',
 'wa',
 'onlin']

In [595]:
most_used_word_content = [
'phone',
 'had',
 'for',
 'there',
 'type',
 'get',
 'should',
 'not',
 'busi',
 'new',
 'them',
 'think',
 'web',
 'about',
 'world',
 'govern',
 'just',
 'date',
 'with',
 'the',
 'other',
 'url',
 'see',
 'thi',
 'receiv',
 'size',
 'wa',
 'pleas',
 'becaus',
 'need',
 'are',
 'click',
 'compani',
 'linux',
 'even',
 'who',
 'could',
 'from',
 'text',
 'have',
 'they',
 'some',
 'one',
 'also',
 'grant',
 'would',
 'home',
 'peopl',
 'use',
 'email',
 'you',
 'ani',
 'make',
 'over',
 'cfont',
 'out',
 'much',
 'inform',
 'their',
 'your',
 'spamassassin',
 'into',
 'list',
 'more',
 'order',
 'nbsp',
 'wrote',
 'don',
 'work',
 'call',
 'way',
 'free',
 'mail',
 'and',
 'can',
 'then',
 'ha',
 'ffont',
 'send',
 'face',
 'now',
 'what',
 'time',
 'remov',
 'here',
 'been',
 'our',
 'internet',
 'than',
 'onli',
 'form',
 'how',
 'it',
 'will',
 'user',
 'which',
 'want',
 'know',
 'were',
 'but',
 'most',
 'that',
 'name',
 'content',
 'money',
 'well',
 'all',
 'address',
 'may',
 'messag',
 'color',
 'when',
 'help',
 'like']

General class to process heading
---

In [596]:
class HeadPattern():
    def __init__(self, pattern):
        self.pattern = pattern
        self.pattern_list = []
    
    @property
    def pattern_cat(self):
        return 0    
    
    def find(self, header):
        self.pattern_list = self.pattern.findall(header)
        
    def isNone(self):
        return (self.pattern_list == [])
    
    def display_pattern(self):
        print(self.pattern_list)
    
    def len_list_pattern(self):
        return (len(self.pattern_list))
    
    def processing(self, header):
        pass

General class to process content
----

In [597]:
class ContentPattern():
    def __init__(self, pattern):
        self.pattern = pattern
        self.pattern_list = []
    
    @property
    def pattern_cat(self):
        return 0    
    
    def find(self, content):
        self.pattern_list = self.pattern.findall(content)
        
    def isNone(self):
        return (self.pattern_list == [])
    
    def display_pattern(self):
        print(self.pattern_list)
    
    def len_list_pattern(self):
        return (len(self.pattern_list))
    
    def count(self, content):
        self.find(content)
        return (self.len_list_pattern())
    
    def replace(self, content):
        new_content = self.pattern.sub("", content)
        return (new_content)
    
    def count_replace(self, content):
        pattern_nb = 0
        new_content, pattern_nb = self.pattern.subn("", content)
        return (new_content, pattern_nb)
    
    def processing(self, content):
        pass

Subclasses for processing
----

In [598]:
class FromPattern(HeadPattern):
    def __init__(self):
        HeadPattern.__init__(self, from_pattern)
    
    def processing(self, header):
        self.find(header)
        name = ''
        domain = ''
        extension = ''
        if (self.isNone() == False):
            length = self.len_list_pattern()
            for i in range(length):
                mail_pattern1 = re.compile("(From:\s)<?(?P<name>.*)@(?P<domain>.*)\.(?P<extension>[\w\.]*)>?", re.I)
                #mail_pattern2 = re.compile("(From:\s)(?P<name>.*)@(?P<domain>.*)\.(?P<extension>.*)", re.I)
                mail_address = mail_pattern1.search(self.pattern_list[i])
                #if (mail_address == None):
                #    mail_address = mail_pattern2.search(self.pattern_list[i])
                if (mail_address != None):
                    name = mail_address.group('name')
                    domain = mail_address.group('domain')
                    extension = mail_address.group('extension')
        len_name = len(name)
        number_in_name = re.findall("\d", name)
        if (number_in_name != None):
            len_number_in_name = len(re.findall("\d", name))
        else:
            len_number_in_name = 0
        return ([len_name, len_number_in_name, domain, extension])
    
    @property
    def pattern_cat(self):
        return ("from")

In [599]:
class DatePattern(HeadPattern):
    def __init__(self):
        HeadPattern.__init__(self, date_pattern)
    
    def processing(self, header):
        self.find(header)
        hour = np.nan
        if (self.isNone() == False):
            hour_patt = re.compile("(?P<hour>\d{1,2}):(?P<minute>\d{2}):(?P<second>\d{2})")
            hour_search = hour_patt.search(self.pattern_list[0])
            hour = int(hour_search.group('hour')) if (hour_search!=None) else np.nan
        return (hour)
    @property
    def pattern_cat(self):
        return("hour")

In [600]:
class ContentTypePattern(HeadPattern):
    def __init__(self):
        HeadPattern.__init__(self, content_type_pattern)
    
    def processing(self, header):
        self.find(header)
        content_type = np.nan
        if (self.isNone() == False):
            content_type_patt = re.compile("""(Content-Type:)\s(?P<type>.*)[;\n]""", re.X)
            content_type_match = content_type_patt.search(self.pattern_list[0])
            content_type = content_type_match.group('type') if (content_type_match!=None) else np.nan
        return (content_type)
    
    @property
    def pattern_cat(self):
        return("content-type")

In [601]:
class ImportancePattern(HeadPattern):
    def __init__(self):
        HeadPattern.__init__(self, importance_pattern)
    
    def processing(self, header):
        self.find(header)
        importance = np.nan
        if (self.isNone() == False):
            importance = self.pattern_list[0][12:]
        return (importance)
    @property
    def pattern_cat(self):
        return("importance")

In [602]:
class XSpamPattern(HeadPattern):
    def __init__(self):
        HeadPattern.__init__(self, x_spam_pattern)
    
    def processing(self, header):
        self.find(header)
        x_spam_word = np.nan
        if (self.isNone() == False):
            x_spam_word = self.pattern_list[0][8:]
        return (x_spam_word)
    
    @property
    def pattern_cat(self):
        return("x-spam")

In [603]:
class X_Pattern(HeadPattern):
    def __init__(self):
        HeadPattern.__init__(self, x_pattern)
    
    def processing(self, header):
        self.find(header)
        return (self.len_list_pattern())
    
    @property
    def pattern_cat(self):
        return("x-*")

In [604]:
class HTMLPattern(ContentPattern):
    def __init__(self):
        ContentPattern.__init__(self, HTML_pattern)
    
    def processing(self, content):
        return self.count_replace(content)
    @property
    def pattern_cat(self):
        return("html")

In [605]:
class URLPattern(ContentPattern):
    def __init__(self):
        ContentPattern.__init__(self, URL_pattern)
    
    def processing(self, content):
        return self.count_replace(content)
    
    @property
    def pattern_cat(self):
        return("url")

In [606]:
class MailPattern(ContentPattern):
    def __init__(self):
        ContentPattern.__init__(self, mail_pattern)
    
    def processing(self, content):
        return self.count_replace(content)
    
    @property
    def pattern_cat(self):
        return("mail")

In [607]:
class ArrowPattern(ContentPattern):
    def __init__(self):
        ContentPattern.__init__(self, arrow_pattern)
    
    def processing(self, content):
        return self.count_replace(content)
    
    @property
    def pattern_cat(self):
        return("> count")

In [608]:
class PricePattern(ContentPattern):
    def __init__(self):
        ContentPattern.__init__(self, price_pattern)
    
    def processing(self, content):
        return self.count_replace(content)
    
    @property
    def pattern_cat(self):
        return("price")

In [609]:
class PunctuationPattern(ContentPattern):
    def __init__(self):
        ContentPattern.__init__(self, punctuation_pattern)
    
    def processing(self, content):
        return self.count_replace(content)
    
    @property
    def pattern_cat(self):
        return("? or !")

In [610]:
class NumberPattern(ContentPattern):
    def __init__(self):
        ContentPattern.__init__(self, number_pattern)
    
    def processing(self, content):
        return self.count_replace(content)
    
    @property
    def pattern_cat(self):
        return("Number count")

In [611]:
class HourDatePattern(ContentPattern):
    def __init__(self):
        ContentPattern.__init__(self, hour_date_pattern)
    
    def processing(self, content):
        return self.replace(content)

In [612]:
class OneTwoLetterWordPattern(ContentPattern):
    def __init__(self):
        ContentPattern.__init__(self, one_two_letter_word_pattern)
    
    def processing(self, content):
        return self.replace(content)

In [613]:
class MajPattern(ContentPattern):
    def __init__(self):
        ContentPattern.__init__(self, maj_pattern)
    
    def processing(self, content):
        return self.count(content)
    
    @property
    def pattern_cat(self):
        return("Majuscule count")

Count presence of words
---
Functions to isolate remaining words and then count the common word with the predifined lists most common word

In [614]:
def isolate_all_word(content):
    pattern = re.compile("\w+", re.I | re.X | re.M)
    new_content = pattern.findall(content)
    return new_content

In [615]:
def count_common(content, list2compare):
    word_list = isolate_all_word(content)
    word_list_length = len(word_list)
    if len(word_list) != 0:
        counting_list = [round(word_list.count(x)/len(word_list), 4) for x in list2compare]
    else:
        counting_list = [0 for x in list2compare]
    return counting_list

Content word features
----
This class will extract features about the word in the content

In [616]:
word_pattern = re.compile("\\b[\w\+\-\=\&\*]+\\b", re.I | re.X)

class WordPattern(ContentPattern):
    def __init__(self):
        ContentPattern.__init__(self, word_pattern)
    
    def processing(self, content):
        self.find(content)
        number_of_word = self.len_list_pattern()
        len_word_list = list(map(len, self.pattern_list))
        if len(len_word_list) != 0:
            mean_len_word = round(sum(len_word_list)/len(len_word_list), 4)
            longuest_word = max(len_word_list)
        else:
            mean_len_word = 0
            longuest_word = 0
        
        count_common_word = count_common(content, most_used_word_content)
        return ([number_of_word, mean_len_word, longuest_word, count_common_word])
    @property
    def pattern_cat(self):
        return("content_attr")
        

Subject is treated differently from other heading pattern as precious info could be extract from these
---

In [617]:
class SubjectPattern(HeadPattern):
    def __init__(self):
        HeadPattern.__init__(self, subject_pattern)
    
    def processing(self, header):
        self.find(header)
        number_of_maj = 0
        number_of_number = 0
        number_of_char = 0
        number_of_price = 0
        number_of_special_char = 0
        common_word_list = [0 for x in most_used_word_subject]
        if (self.isNone() == False):
            # Extract info on number of char in function of categories (maj, punctuation)
            number_of_maj = len(re.findall("[A-Z]", self.pattern_list[0])) - 1 #Subtract "S" of "Subject"
            number_of_char = len(self.pattern_list[0]) - 8 #Subtract len("Subject:"))
            number_of_special_char = len(re.findall("[\!\?\$]", self.pattern_list[0]))
            
            # Extract info on number of char in function of categories (price,number)
            # And replace by "" those pattern
            new_subject, number_of_price = PricePattern().processing(self.pattern_list[0])
            new_subject, number_of_number = NumberPattern().processing(new_subject)
            # Replace all one or two letters word by ""
            new_subject = OneTwoLetterWordPattern().processing(new_subject)
            
            # Extact all remaining words except Subject of course
            new_subject = new_subject[8:]
            common_word_list = count_common(new_subject, most_used_word_subject)
        
        return ([number_of_maj, 
                 number_of_char, 
                 number_of_special_char, 
                 number_of_price, 
                 number_of_number, 
                 common_word_list])
    
    @property
    def pattern_cat(self):
        return("subject")

In [618]:
important_heading_pattern = [
    FromPattern(),
    SubjectPattern(),
    DatePattern(),
    ContentTypePattern(),
    ImportancePattern(),
    XSpamPattern(),
    X_Pattern(),
]

In [619]:
def process_heading(heading):
    attributes = []
    for important_pattern in important_heading_pattern:
        attributes.append(important_pattern.processing(heading))
    return attributes  #heading_dataframe.loc[len(heading_dataframe.index)] = 

In [620]:
count_replace_content_pattern_list = [
    HTMLPattern(),
    URLPattern(),
    MailPattern(),
    PricePattern(),
    ArrowPattern(),
    PunctuationPattern(),
    NumberPattern(),    
]

replace_content_list = [
    OneTwoLetterWordPattern(),
    HourDatePattern(),
]

count_content_list = [
    MajPattern(),
    WordPattern(),
]

In [621]:
def process_content(content):
    attributes = []
    new_content = content
    for count_repl_patt in count_replace_content_pattern_list:
        new_content, count = count_repl_patt.processing(new_content)
        attributes.append(count)
    for repl_patt in replace_content_list:
        new_content = repl_patt.processing(new_content)
    for count_patt in count_content_list:
        count = count_patt.processing(new_content)
        attributes.append(count)
    return attributes

In [622]:
def create_dataframe_cat():
    # Create Dataframe
    mail_dataframe = pd.DataFrame()
    
    # Add a spam or not cat
    mail_dataframe['spam']=[]
    
    # Add Heading Attributes
    for important_pattern in important_heading_pattern:
        mail_dataframe[important_pattern.pattern_cat]=[]
    
    # Add Content Attributes
    for count_repl_patt in count_replace_content_pattern_list:
        mail_dataframe[count_repl_patt.pattern_cat]=[]
    for count_patt in count_content_list:
        mail_dataframe[count_patt.pattern_cat]=[]
    
    return mail_dataframe
    

In [623]:
def process_mail(raw_text, mail_dataframe, column_name_list, file, spam=True):
    heading, content = separate_header_content(raw_text, file)
    heading_attributes = process_heading(heading)
    content_attributes = process_content(content)
    mail_attributes =  [int(spam)] + heading_attributes + content_attributes
    #len(mail_dataframe.index)
    dict2add = dict(zip(column_name_list, mail_attributes))
    #print(dict2add)
    return mail_dataframe.append(dict2add, ignore_index=True)
    #mail_dataframe.loc[] = mail_attributes

In [626]:
def extract_data_from_file(spam_folders_path, ham_folders_path, mail_dataframe=None):
    if mail_dataframe == None:
        # if no mail has been processed and saved before create dataframe
        mail_dataframe = create_dataframe_cat()
        
    column_name_list = list(mail_dataframe.columns)
    for folder_path in spam_folders_path:
        under_path = folder_path + os.listdir(folder_path)[0] + "/"
        file_list = os.listdir(under_path)
        
        for file in file_list:
            filepath = under_path + file
            if (file!="cmds"):
                extraction, text = extract_txt(filepath)
                if (extraction == True):
                    mail_dataframe = process_mail(text, mail_dataframe,column_name_list, file, spam=True)
                else:
                    print(file)
                #return 0
            
    for folder_path in ham_folders_path:
        under_path = folder_path + os.listdir(folder_path)[0] + "/"
        file_list = os.listdir(under_path)
        for file in file_list:
            filepath = under_path + file
            if (file!="cmds"):
                extraction, text = extract_txt(filepath)
                if (extraction == True):
                    mail_dataframe = process_mail(text, mail_dataframe,column_name_list,file,  spam=False)
                else:
                    print(file)
                #return 0
    return mail_dataframe

In [627]:
mail_dataframe = extract_data_from_file(spam_folders_path, ham_folders_path, mail_dataframe=None)

0000.7b1b73cf36cf9dbc3d64e3f2ee2b91f1


In [628]:
mail_dataframe

Unnamed: 0,spam,from,subject,hour,content-type,importance,x-spam,x-*,html,url,mail,price,> count,? or !,Number count,Majuscule count,content_attr
0,1.0,"[17, 13, yahoo, com]","[4, 38, 1, 0, 0, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0...",3.0,text/html,,,4.0,16.0,0.0,0.0,1.0,0.0,5.0,4.0,47.0,"[81, 4.4815, 10, [0.0, 0.0, 0.0247, 0.0, 0.0, ..."
1,1.0,"[5, 0, mail, ru]","[5, 28, 1, 0, 0, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0...",15.0,,,,3.0,0.0,0.0,1.0,27.0,0.0,23.0,88.0,1107.0,"[1290, 6.4512, 23, [0.0, 0.0, 0.013, 0.0008, 0..."
2,1.0,"[16, 0, netscape, net]","[2, 16, 0, 0, 0, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0...",8.0,text/plain,,,1.0,2.0,3.0,0.0,0.0,0.0,3.0,14.0,122.0,"[165, 6.5697, 121, [0.0, 0.0, 0.012, 0.0, 0.0,..."
3,1.0,"[9, 0, 2x12.2xthemoney, com]","[9, 41, 0, 0, 0, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0...",19.0,,,,1.0,0.0,3.0,0.0,0.0,0.0,5.0,0.0,52.0,"[88, 5.375, 13, [0.0, 0.0, 0.0227, 0.0, 0.0, 0..."
4,1.0,"[10, 3, pac9.westernbarge, com]","[6, 37, 0, 0, 0, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0...",17.0,,,,1.0,0.0,3.0,0.0,2.0,0.0,7.0,3.0,42.0,"[75, 5.0667, 12, [0.0, 0.0, 0.013, 0.0, 0.0, 0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9344,0.0,"[29, 0, tribute, ca]","[4, 31, 0, 0, 2, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0...",0.0,multipart/alternative,,,0.0,2025.0,2.0,0.0,0.0,0.0,15.0,1432.0,498.0,"[1389, 5.3362, 25, [0.0, 0.0014, 0.0092, 0.001..."
9345,0.0,"[35, 0, DivX.at.krieger.mailshell, com]","[11, 64, 0, 0, 0, [0.0, 0.0, 0.0, 0.0, 0.1, 0....",19.0,text/plain,,,10.0,14.0,3.0,0.0,1.0,0.0,4.0,18.0,226.0,"[602, 5.4585, 14, [0.0, 0.0, 0.0099, 0.0, 0.0,..."
9346,0.0,"[61, 9, newsletter.online, com]","[6, 42, 0, 0, 0, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0...",6.0,text/html,,,2.0,1009.0,1.0,2.0,0.0,0.0,6.0,33.0,372.0,"[703, 5.8208, 19, [0.0, 0.0014, 0.011, 0.0, 0...."
9347,0.0,"[10, 0, mountainviewdata, com]","[0, 19, 0, 0, 0, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0...",17.0,text/plain; charset=us-ascii,,,10.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,105.0,"[99, 5.8687, 15, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."


Save dataframe to csv
---

In [629]:
mail_dataframe.to_csv(os.path.abspath(os.getcwd()) + "/mail_extract.csv", index = False)

Split categories in function of spam or not 
----

In [None]:
def split_ham_spam_dataframe(mail_dataframe, cat2split):
    spam = mail_dataframe.loc[mail_dataframe["spam"] == 1.0][cat2split]
    ham = mail_dataframe.loc[mail_dataframe["spam"] == 0.0][cat2split] 
    ham.reset_index(drop=True, inplace=True)
    
    return (spam, ham)