In [8]:
import os
import re
import email
import pandas as pd
import numpy as np
from tqdm import tqdm
from bs4 import BeautifulSoup

In [9]:
def read_email_from_file(path_file):
    """Parse the email stored at ``path_file`` into a message object.

    The file is first read as UTF-8.  If (and only if) its bytes are not
    valid UTF-8, it is re-read as ISO-8859-1, which maps every byte to a
    character and therefore cannot fail to decode.

    Parameters
    ----------
    path_file : str or path-like
        Location of the raw email file.

    Returns
    -------
    email.message.Message
        The parsed message.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. it does not exist).  Such errors
        are no longer masked by the encoding fallback.
    """
    try:
        # Explicit encoding: the default depends on the machine's locale.
        with open(path_file, 'r', encoding='utf-8') as file:
            return email.message_from_file(file)
    except UnicodeDecodeError:
        # Only a decoding failure justifies the permissive retry; anything
        # else (missing file, permissions, Ctrl-C) should surface as-is.
        with open(path_file, 'r', encoding='ISO-8859-1') as file:
            return email.message_from_file(file)


def read_email_from_string(s):
    """Parse the raw email text ``s`` and return the resulting message object."""
    return email.message_from_string(s)


def extract_email_body(message):
    """Return the textual body of an email message as a string.

    For a multipart message, the first part (in ``walk()`` order) whose main
    content type is ``text`` is used.  If no such part exists the sentinel
    string ``'escapenonetext'`` is returned.

    Bodies whose Content-Transfer-Encoding is base64 or quoted-printable are
    decoded to bytes and then to text, trying the charset declared on the
    part, then UTF-8, and finally ISO-8859-1 (which accepts any byte
    sequence).  Any other transfer encoding returns the payload undecoded.

    Parameters
    ----------
    message : email.message.Message
        Parsed email message.

    Returns
    -------
    str
        The body text, or ``'escapenonetext'`` for a multipart message
        without any text part.
    """
    if message.is_multipart():
        for part in message.walk():
            if part.get_content_maintype() == 'text':
                message = part
                break
        else:
            # for/else: the loop finished without finding a text part.
            return 'escapenonetext'

    # Header values are case-insensitive and may carry stray whitespace, so
    # normalise before comparing ("Base64", "BASE64 ", ...).
    transfer_encoding = str(message.get('Content-Transfer-Encoding', '')).strip().lower()
    if transfer_encoding not in ('base64', 'quoted-printable'):
        return message.get_payload(decode=False)

    raw = message.get_payload(decode=True)
    for charset in (message.get_content_charset(), 'utf-8'):
        if not charset:
            continue
        try:
            return raw.decode(charset)
        except (LookupError, ValueError):
            # LookupError: unknown/unsupported charset name in the header.
            # ValueError covers UnicodeDecodeError: bytes do not match it.
            continue
    # ISO-8859-1 maps every byte to a character, so this cannot fail.
    return raw.decode('ISO-8859-1')


def remove_html(s):
    """Strip markup from ``s`` and return its text joined by single spaces.

    The input is parsed with BeautifulSoup's ``lxml`` parser.  Elements of
    type script, style, head, meta and noscript are removed together with
    their contents before the remaining text fragments are collected.
    """
    tree = BeautifulSoup(s, 'lxml')
    unwanted = tree.find_all(['script', 'style', 'head', 'meta', 'noscript'])
    for node in unwanted:
        node.decompose()
    # stripped_strings yields each text fragment with surrounding
    # whitespace removed, skipping fragments that are whitespace only.
    return ' '.join(tree.stripped_strings)


def email_body_to_text(body):
    """Turn a raw email body into plain text.

    At present the only active step is HTML removal via ``remove_html``.
    """
    # Further normalisation steps, currently disabled and kept for reference:
    # punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
    # body = re.sub('[{}]'.format(punctuation), ' ', body)
    # body = re.sub('\n+', ' ', body)
    # body = re.sub('\\s+', ' ', body)
    # body = re.sub(r'[0-9]+', 'escapenumber', body)
    # body = body.lower()
    # body = re.sub(r'[a-z0-9]{20,}', 'escapelong', body)
    return remove_html(body)

In [10]:
# Raw emails with their labels; this cell reads the `origin` column downstream.
PATH_ORIGINAL_EMAILS = 'email_origin.csv'
df_email_origin = pd.read_csv(filepath_or_buffer=PATH_ORIGINAL_EMAILS)
df_email_origin

Unnamed: 0,label,origin
0,1,Return-Path: <RickyAmes@aol.com>\nReceived: fr...
1,0,Return-Path: <bounce-debian-mirrors=ktwarwic=s...
2,1,Return-Path: <7stocknews@tractionmarketing.com...
3,1,Return-Path: <vqucsmdfgvsg@ruraltek.com>\nRece...
4,1,Return-Path: <dcube@totalink.net>\nReceived: f...
...,...,...
75414,1,Return-Path: <Merrill8783@168city.com>\nReceiv...
75415,1,Return-Path: <Merrill8783@168city.com>\nReceiv...
75416,0,Return-Path: <r-help-bounces@stat.math.ethz.ch...
75417,0,Return-Path: <r-help-bounces@stat.math.ethz.ch...


In [11]:
# Pipeline per row: raw email string -> parsed message -> body -> plain text.
df_email_origin['text'] = (
    df_email_origin['origin']
    .map(read_email_from_string)
    .map(extract_email_body)
    .map(email_body_to_text)
)
df_email_origin


If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  soup = BeautifulSoup(s, 'lxml')


Unnamed: 0,label,origin,text
0,1,Return-Path: <RickyAmes@aol.com>\nReceived: fr...,Do you feel the pressure to perform and not ri...
1,0,Return-Path: <bounce-debian-mirrors=ktwarwic=s...,"Hi, i've just updated from the gulus and I che..."
2,1,Return-Path: <7stocknews@tractionmarketing.com...,Mega authenticV I A G R A $ DISCOUNT priceC...
3,1,Return-Path: <vqucsmdfgvsg@ruraltek.com>\nRece...,"Hey Billy, \n\nit was really fun going out the..."
4,1,Return-Path: <dcube@totalink.net>\nReceived: f...,"system"" of the home. It will have the capabil..."
...,...,...,...
75414,1,Return-Path: <Merrill8783@168city.com>\nReceiv...,While we may have high expe...
75415,1,Return-Path: <Merrill8783@168city.com>\nReceiv...,While we may have high expe...
75416,0,Return-Path: <r-help-bounces@stat.math.ethz.ch...,"For those who are interested, I just cook a li..."
75417,0,Return-Path: <r-help-bounces@stat.math.ethz.ch...,"Hello,\n\nas I wrote I call\n\n sqlFetch(chan..."


In [12]:
# Keep only the first occurrence of each distinct extracted text.
df_email_dp_dup = df_email_origin.drop_duplicates(subset=['text'], keep='first')
df_email_dp_dup

Unnamed: 0,label,origin,text
0,1,Return-Path: <RickyAmes@aol.com>\nReceived: fr...,Do you feel the pressure to perform and not ri...
1,0,Return-Path: <bounce-debian-mirrors=ktwarwic=s...,"Hi, i've just updated from the gulus and I che..."
2,1,Return-Path: <7stocknews@tractionmarketing.com...,Mega authenticV I A G R A $ DISCOUNT priceC...
3,1,Return-Path: <vqucsmdfgvsg@ruraltek.com>\nRece...,"Hey Billy, \n\nit was really fun going out the..."
4,1,Return-Path: <dcube@totalink.net>\nReceived: f...,"system"" of the home. It will have the capabil..."
...,...,...,...
75413,1,Return-Path: <hearst@cdrcorp.com>\nReceived: f...,Versuchen Sie unser Produkt und Sie werden fuh...
75414,1,Return-Path: <Merrill8783@168city.com>\nReceiv...,While we may have high expe...
75416,0,Return-Path: <r-help-bounces@stat.math.ethz.ch...,"For those who are interested, I just cook a li..."
75417,0,Return-Path: <r-help-bounces@stat.math.ethz.ch...,"Hello,\n\nas I wrote I call\n\n sqlFetch(chan..."


In [13]:
# Keep only rows whose text contains at least one word-like token.
df_email_dp_na = df_email_dp_dup.loc[
    lambda frame: frame['text'].str.contains(r"\b\w+\b")
]
df_email_dp_na

Unnamed: 0,label,origin,text
0,1,Return-Path: <RickyAmes@aol.com>\nReceived: fr...,Do you feel the pressure to perform and not ri...
1,0,Return-Path: <bounce-debian-mirrors=ktwarwic=s...,"Hi, i've just updated from the gulus and I che..."
2,1,Return-Path: <7stocknews@tractionmarketing.com...,Mega authenticV I A G R A $ DISCOUNT priceC...
3,1,Return-Path: <vqucsmdfgvsg@ruraltek.com>\nRece...,"Hey Billy, \n\nit was really fun going out the..."
4,1,Return-Path: <dcube@totalink.net>\nReceived: f...,"system"" of the home. It will have the capabil..."
...,...,...,...
75413,1,Return-Path: <hearst@cdrcorp.com>\nReceived: f...,Versuchen Sie unser Produkt und Sie werden fuh...
75414,1,Return-Path: <Merrill8783@168city.com>\nReceiv...,While we may have high expe...
75416,0,Return-Path: <r-help-bounces@stat.math.ethz.ch...,"For those who are interested, I just cook a li..."
75417,0,Return-Path: <r-help-bounces@stat.math.ethz.ch...,"Hello,\n\nas I wrote I call\n\n sqlFetch(chan..."


In [None]:
# Persist only the label and cleaned text.
# `escapechar` must be exactly one character; the previous three-backslash
# string made this call raise a TypeError, so the file was never written.
# Pass the same escapechar to pd.read_csv when loading emails.csv back.
df_email_dp_na[['label', 'text']].to_csv("emails.csv", index=False, escapechar='\\')