In [15]:
import pandas as pd
import email
import os
from bs4 import BeautifulSoup
import re

In [11]:
import warnings
from bs4 import MarkupResemblesLocatorWarning
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

In [3]:
data_path = "Data/Raw_data"
chinese_path = "trec06c"
english_path = "trec06p"
index_path = "full/index"

In [6]:
labels_ch = pd.read_csv(os.path.join(data_path,chinese_path,index_path), sep=" ", header=None)
labels_en = pd.read_csv(os.path.join(data_path,english_path,index_path), sep=" ", header=None)
labels_ch.columns = ["class", "subpath"]
labels_en.columns = ["class", "subpath"]

In [7]:
def html_remover(txt):
        """
        Removes html tags in a given text
        Input: doc
        Output: String
        """
        soup=BeautifulSoup(txt,'html.parser')
        a=soup.get_text()
        return a

In [50]:
def extract_message(row, lang):
    subpath = row["subpath"][3:]
    if lang=="en":
        full_path = os.path.join(data_path, english_path, subpath)
        encode_type = 'iso-8859-1'
    elif lang=="ch":
        full_path = os.path.join(data_path, chinese_path, subpath)
        encode_type = 'gb2312'
    try:
        with open(full_path, "r", encoding=encode_type) as f:
            email_content = f.read()
            parsed_email_content = email.message_from_string(email_content)
    except:
        return row
    new_encode_type = parsed_email_content.get_content_charset()
    if new_encode_type:
        new_encode_type = re.sub(r"charset|\"|=|'", "", new_encode_type)
        if new_encode_type != encode_type:
            try:
                with open(full_path, "r", encoding=new_encode_type) as f:
                    new_email_content = f.read()
                    parsed_email_content = email.message_from_string(new_email_content)
            except:
                parsed_email_content = email.message_from_string(email_content)
    try: 
        row["subject"] = str(email.header.make_header(email.header.decode_header(parsed_email_content["subject"])))
        row["email_to"] = str(email.header.make_header(email.header.decode_header(parsed_email_content["to"])))
        row["email_from"] = str(email.header.make_header(email.header.decode_header(parsed_email_content["from"])))
    except:
        return row
    row["contains_img"] = False
    if parsed_email_content.is_multipart():
        message = []
        for part in parsed_email_content.get_payload():
            part_payload = part.get_payload()
            payload_type = part.get_content_type()
            if payload_type in ("text/plain", "text/html"):
                if payload_type == "text/html":
                    part_payload = html_remover(part_payload)
                message.append(part_payload)
            elif payload_type == "multipart/alternative":
                for sub_part in part_payload:
                    sub_part_payload = sub_part.get_payload()
                    sub_part_type = sub_part.get_content_type()
                    if sub_part_type in ("text/plain", "text/html"):
                        if payload_type == "text/html":
                            sub_part_payload = html_remover(sub_part_payload)
                        message.append(sub_part_payload)
            elif payload_type == "image/png": 
                row["contains_img"] = True
        row["message"] = " ".join(message)
    else:
        message = parsed_email_content.get_payload()
        payload_type = parsed_email_content.get_content_type()
        if payload_type == "text/html":
            message = html_remover(message)
        row["message"] = message
    return row

extract_en = lambda row: extract_message(row, lang="en")
extract_ch = lambda row: extract_message(row, lang="ch")

In [51]:
ch_emails = labels_ch.apply(extract_ch, axis=1)

In [53]:
ch_emails.dropna(inplace=True)

In [54]:
ch_emails.head()

Unnamed: 0,class,contains_img,email_from,email_to,message,subject,subpath
1,ham,False,"""pan"" <pan@jdl.ac.cn>",shi@ccert.edu.cn,讲的是孔子后人的故事。一个老领导回到家乡，跟儿子感情不和，跟贪财的孙子孔为本和睦。\n老领导...,● 问一部魏宗万的电影名称,../data/000/001
2,spam,False,张海南 <jian@163.con>,xing@ccert.edu.cn,尊敬的贵公司(财务/经理)负责人您好！ \n 我是深圳金海实业有限公司（广州...,公司业务.代开发票！,../data/000/002
3,spam,False,代开发票 <pan@12.com>,ling@ccert.edu.cn,贵公司负责人(经理/财务）您好： \n 深圳市华龙公司受多家公司委托向外低点代开部分增...,低点代开发票!,../data/000/003
4,spam,False,"""mei"" <mei@dghhkjk.com>",tang@ccert.edu.cn,这是一封HTML格式信件！\n\n-----------------------------...,一边上网冲浪，一边赚钱，何乐而不为？,../data/000/004
5,spam,False,"ke@163.com"" <chunyang-sz@163.com>",yuan@ccert.edu.cn,\nTO：贵公司经理、财务\n\n \n 您好！ \n 深圳市春洋贸易有...,优惠代开各种发票!!,../data/000/005


In [56]:
ch_emails.to_pickle("Data/ch_emails_raw.pkl")

In [55]:
len(ch_emails)

50445

In [57]:
en_emails = labels_en.apply(extract_en, axis=1)

In [58]:
en_emails.head()

Unnamed: 0,class,contains_img,email_from,email_to,message,subject,subpath
0,ham,,,,,new Catholic mailing list now up and running,../data/000/000
1,spam,False,"""Stella Lowry"" <rookcuduq@yahoo.com>","""Brian"" <bernice@groucho.cs.psu.edu>",\n ...,re[12]:,../data/000/001
2,spam,False,"""Walter"" <trwmpca@downtowncumberland.com>",<arline@groucho.cs.psu.edu>,Academic Qualifications available from prestig...,Take a moment to explore this.,../data/000/002
3,ham,False,Scott Schwartz <schwartz@groucho.cs.psu.edu>,9fans <plan9-fans@cs.psu.edu>,Greetings all. This is to verify your subscri...,Greetings,../data/000/003
4,spam,False,"""Mr Jailyn Koepke"" <kiflsbizc@attheworld.com>",melvin@groucho.cs.psu.edu,try chauncey may conferred the luscious not co...,LOANS @ 3.17% (27 term),../data/000/004


In [59]:
en_emails.dropna(inplace=True)

In [61]:
len(en_emails)

36837

In [62]:
en_emails.to_pickle("Data/en_emails_raw.pkl")

In [63]:
pd.read_pickle("Data/ch_emails_raw.pkl")

Unnamed: 0,class,contains_img,email_from,email_to,message,subject,subpath
1,ham,False,"""pan"" <pan@jdl.ac.cn>",shi@ccert.edu.cn,讲的是孔子后人的故事。一个老领导回到家乡，跟儿子感情不和，跟贪财的孙子孔为本和睦。\n老领导...,● 问一部魏宗万的电影名称,../data/000/001
2,spam,False,张海南 <jian@163.con>,xing@ccert.edu.cn,尊敬的贵公司(财务/经理)负责人您好！ \n 我是深圳金海实业有限公司（广州...,公司业务.代开发票！,../data/000/002
3,spam,False,代开发票 <pan@12.com>,ling@ccert.edu.cn,贵公司负责人(经理/财务）您好： \n 深圳市华龙公司受多家公司委托向外低点代开部分增...,低点代开发票!,../data/000/003
4,spam,False,"""mei"" <mei@dghhkjk.com>",tang@ccert.edu.cn,这是一封HTML格式信件！\n\n-----------------------------...,一边上网冲浪，一边赚钱，何乐而不为？,../data/000/004
5,spam,False,"ke@163.com"" <chunyang-sz@163.com>",yuan@ccert.edu.cn,\nTO：贵公司经理、财务\n\n \n 您好！ \n 深圳市春洋贸易有...,优惠代开各种发票!!,../data/000/005
...,...,...,...,...,...,...,...
64615,spam,False,刘伟源 <ning@163.com>,guo@ccert.edu.cn,贵公司负责人(经理/财务)您好：\n 我公司是深圳市华源实业有限公司.本公司实力雄厚(...,优惠代开发票~!!!,../data/215/115
64616,spam,False,优惠代开发票 <zhong@12.com>,mo@ccert.edu.cn,尊敬的商家朋友您好： \n 我是深圳市裕华实业有限公司的。我司实力雄厚，有着良\n好的...,代开发票验证后付款！,../data/215/116
64617,spam,False,杨先生 <che@163.com>,zeng@ccert.edu.cn,贵公司负责人(经理/财务）您好! \n 我是...,优惠代开各类发票！,../data/215/117
64618,spam,False,zeng@126.com,long@ccert.edu.cn,\n 这是一个HTML格式的邮件\n FRAME: easymain\n\n\n\n,千色坊―时尚女士箱包特卖场,../data/215/118


In [14]:
full_path = os.path.join(data_path, english_path, "data/003/095")
with open(full_path, "r", encoding='charset="gb2312"') as f:
    email_content = f.read()
    parsed_email_content = email.message_from_string(email_content)
    print(parsed_email_content.get_payload())
    print(parsed_email_content.get_content_charset())

LookupError: unknown encoding: charset="gb2312"