In [1]:
import pandas as pd
import email
import os
from bs4 import BeautifulSoup
import re
import base64

In [2]:
import warnings
from bs4 import MarkupResemblesLocatorWarning
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

In [32]:
data_path = "Data/Raw_data"
chinese_path = "trec06c"
english_path = "trec06p"
index_path = "full/index"

In [33]:
labels_ch = pd.read_csv(os.path.join(data_path,chinese_path,index_path), sep=" ", header=None)
labels_en = pd.read_csv(os.path.join(data_path,english_path,index_path), sep=" ", header=None)
labels_ch.columns = ["class", "subpath"]
labels_en.columns = ["class", "subpath"]

In [34]:
def html_remover(txt):
        """
        Removes html tags in a given text
        Input: doc
        Output: String
        """
        soup=BeautifulSoup(txt,'html.parser')
        a=soup.get_text()
        return a

In [35]:
def extract_message(row, lang):
    subpath = row["subpath"][3:]
    if lang=="en":
        full_path = os.path.join(data_path, english_path, subpath)
        encode_type = 'iso-8859-1'
    elif lang=="ch":
        full_path = os.path.join(data_path, chinese_path, subpath)
        encode_type = 'gb2312'
    try:
        with open(full_path, "r", encoding=encode_type) as f:
            email_content = f.read()
            parsed_email_content = email.message_from_string(email_content)
    except:
        return row
    new_encode_type = parsed_email_content.get_content_charset()
    if new_encode_type:
        new_encode_type = re.sub(r"charset|\"|=|'", "", new_encode_type)
        if new_encode_type != encode_type:
            try:
                with open(full_path, "r", encoding=new_encode_type) as f:
                    new_email_content = f.read()
                    parsed_email_content = email.message_from_string(new_email_content)
                    encode_type = new_encode_type
            except:
                parsed_email_content = email.message_from_string(email_content)
    try: 
        row["subject"] = str(email.header.make_header(email.header.decode_header(parsed_email_content["subject"])))
        row["email_to"] = str(email.header.make_header(email.header.decode_header(parsed_email_content["to"])))
        row["email_from"] = str(email.header.make_header(email.header.decode_header(parsed_email_content["from"])))
    except:
        return row
    row["contains_img"] = False
    row["base64"] = False
    if parsed_email_content.is_multipart():
        message = []
        for part in parsed_email_content.get_payload():
            part_payload = part.get_payload()
            payload_type = part.get_content_type()
            if payload_type in ("text/plain", "text/html"):
                if part["Content-Transfer-Encoding"] == "base64":
                    try:
                        part_payload = base64.b64decode(part_payload).decode(encode_type)
                        #print(subpath)
                        row["base64"] = True
                    except:
                        pass
                if payload_type == "text/html":
                    part_payload = html_remover(part_payload)
                message.append(part_payload)
            elif payload_type == "multipart/alternative":
                for sub_part in part_payload:
                    sub_part_payload = sub_part.get_payload()
                    sub_part_type = sub_part.get_content_type()
                    if sub_part_type in ("text/plain", "text/html"):
                        if part["Content-Transfer-Encoding"] == "base64":
                            try:
                                part_payload = base64.b64decode(part_payload).decode(encode_type)
                                #print(subpath)
                                row["base64"] = True
                            except:
                                pass
                        if payload_type == "text/html":
                            sub_part_payload = html_remover(sub_part_payload)
                        message.append(sub_part_payload)
            elif payload_type == "image/png": 
                row["contains_img"] = True
        row["message"] = " ".join(message)
    else:
        message = parsed_email_content.get_payload()
        if parsed_email_content["Content-Transfer-Encoding"] == "base64":
            try:
                message = base64.b64decode(message).decode(encode_type)
                #print(subpath)
                row["base64"] = True
            except:
                pass
        payload_type = parsed_email_content.get_content_type()
        if payload_type == "text/html":
            message = html_remover(message)
        row["message"] = message
    return row

extract_en = lambda row: extract_message(row, lang="en")
extract_ch = lambda row: extract_message(row, lang="ch")

In [25]:
with open("Data/Raw_data/trec06p/data/015/188", "r", encoding="gb2312") as f:
    new_email_content = f.read()
    parsed_email_content = email.message_from_string(new_email_content)

In [26]:
message = parsed_email_content.get_payload()
print(message)

DQogICAgICAgICDO0rDvxOPXrMeuuN/Qy8Lwo7+x8Ljf0Mu1xMyr1OejrM7S1ruw78Tj1/a1vTEw
MDDUqtLUuvPSqr+/xOPX1Ly6o6zS8s6qztK7udKqsO/G5Mv7yMujrA0KICDE48u1ttSyu6O/srvE
3MDP0sC/v87So6ENCiAgICAgICAgICBodHRwOi8vem1ibi5jb20vP2lkPWN5cA0KDQoNCj09PT09
PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09DQrW0MCyo6EtLc34yc+5
usLyssrGsaOs1tC9sc2o1qqjrNfUtq+20r2xoaMNCsPit9HXorLho6zK17TOs+TWtcvNMdeissrG
saOhzfjWt6O6aHR0cDovL3d3dy56aG9uZ2xhLmNu



In [149]:
message = message[1].get_payload()

In [28]:
import base64
base64.b64decode(message).decode("gb2312")



In [187]:
ch_emails = labels_ch.apply(extract_ch, axis=1)

In [36]:
en_emails = labels_en.apply(extract_en, axis=1)

In [38]:
ch_emails.dropna(inplace=True)
en_emails.dropna(inplace=True)
ch_emails["full_subpath"] = ch_emails.subpath.apply(lambda x: f"trec06c{x[2:]}")
en_emails["full_subpath"] = en_emails.subpath.apply(lambda x: f"trec06p{x[2:]}")
ch_emails.set_index("full_subpath", inplace=True)
en_emails.set_index("full_subpath", inplace=True)

In [190]:
ch_emails.to_pickle("Data/ch_emails_raw.pkl")
en_emails.to_pickle("Data/en_emails_raw.pkl")

In [40]:
full = pd.concat([ch_emails, en_emails])

In [41]:
temp = full[full.base64].copy()
temp.drop_duplicates(subset="message", inplace=True)

In [42]:
temp

Unnamed: 0_level_0,base64,class,contains_img,email_from,email_to,message,subject,subpath
full_subpath,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
trec06c/data/000/145,True,spam,False,"""zeng"" <zeng@mail.com>",sun@ccert.edu.cn,,（17010.com）英超、意甲足球、篮球经理游戏,../data/000/145
trec06p/data/000/103,True,spam,False,"""Olive Alvarado"" <olive_alvaradooc@youronline....","audrey@groucho.cse.psu.edu, duane@groucho.cse....",American Energy Review\r\n\r\nIn the current o...,New Penny Stock Report,../data/000/103
trec06p/data/000/159,True,spam,False,eintgmxabfbmafb@msa.hinet.net,"phyllis@groucho.cse.psu.edu,tyler@groucho.cse....",\n\n\n\nÔËÈÙÉÌ³Ç-Õæ°®ÇéÈ¤\n\n\n\n\n\n\n\n\n°®...,运荣商城-爱侣专卖 货到付款 保密配送,../data/000/159
trec06p/data/000/259,True,spam,False,カワシマ <da2d1s2f1sd@sohu.com>,<juliana@groucho.cse.psu.edu>,無料お試し体験、大人の時間をお楽しみ下さい。\r\n\r\nhttp://www.yello...,夜は空いていますか,../data/000/259
trec06p/data/001/019,True,spam,False,"""{©sªÛ}¢b¢c¢d¢e¶R¥»¯¸ VCD/DVD ³Ó¹L§A¶R¼Æ¤Q®a¤ù...",edgar@groucho.cse.psu.edu,\n\n\n·s¼Wºô­¶1\n\n\n\n\n\n\r\n{ÂE°í}¢b¢c¢d¢e¶...,Daniel~ ¦Ê¦ì¨k¤k³¥¥~¶}³]Âø¥æ¬£¹ï ¥ú¤Ñ¤Æ¤é´N·F¤...,../data/001/019
...,...,...,...,...,...,...,...,...
trec06p/data/121/128,True,spam,False,"""MICROLOTTERYNL"" <Microlottinter@netscape.net>",<DMDX@psy1.psych.arizona.edu>,MICRO LOTTERY INTERNATIONAL\r\nVIJZELSTRAAT 56...,MICRO NEWS,../data/121/128
trec06p/data/121/232,True,spam,False,"""Marla Johnson"" <TSmith@netscape.net>","""Tammie Barber"" <devfinance@ag.ohio-state.edu>...","Don't miss this weeks pick, Our last pick show...",Harvest Gains From This stock?,../data/121/232
trec06p/data/122/024,True,ham,False,Lori Ungurait &lt;LUngurait@tennessee.edu&gt;,"""Mike Herrmann"" &lt;Mike.Herrmann@state.tn.us&...","Kellie,\r\nThis announcement just went out reg...",Re: Fwd: Question,../data/122/024
trec06p/data/122/040,True,ham,False,BICDance@aol.com,economicsbb@columbia.edu,"FOUNTAINHEADÂ® TANZ THEATRE\ne â LETTER, Ber...",Fountainhead; XXI Black International Cinema B...,../data/122/040


In [44]:
temp.to_pickle("Data/b64.pkl")

In [170]:
ch_emails_messages = ch_emails_raw[["message"]].copy()
en_emails_messages = en_emails_raw[["message"]].copy()

In [171]:
all_messages = pd.concat([ch_emails_messages, en_emails_messages])

In [172]:
all_messages.to_pickle("Data/all_messages.pkl")

In [173]:
all_messages = pd.read_pickle("Data/all_messages.pkl")

In [174]:
all_messages.drop_duplicates(subset="message", inplace=True)

In [175]:
all_messages.to_pickle("Data/unique_messages.pkl")

In [183]:
unique_messages = pd.read_pickle("Data/unique_messages.pkl")
import random
random.seed(42)
# Shuffling for more even spread when processing batches
unique_messages = unique_messages.sample(frac=1)

In [181]:
len(unique_messages)

42882

In [184]:
nrows = len(all_messages)
first = all_messages[:nrows//4]
second = all_messages[nrows//4:2*nrows//4]
third = all_messages[2*nrows//4:3*nrows//4]
fourth = all_messages[3*nrows//4:]

In [185]:
first.to_pickle("Data/first.pkl")
second.to_pickle("Data/second.pkl")
third.to_pickle("Data/third.pkl")
fourth.to_pickle("Data/fourth.pkl")

# After processing

In [53]:
first = pd.read_json("first.json", lines=True)
first.set_index("doc_id", inplace=True)
second = pd.read_json("second.json", lines=True)
second.set_index("doc_id", inplace=True)
third = pd.read_json("third.json", lines=True)
third.set_index("doc_id", inplace=True)
fourth = pd.read_json("fourth.json", lines=True)
fourth.set_index("doc_id", inplace=True)
last = pd.read_json("last.json", lines=True)
last.set_index("doc_id", inplace=True)

In [54]:
full_processed = pd.concat([first, second, third, fourth, last])

In [55]:
ch_emails = pd.read_pickle("Data/ch_emails_raw.pkl")
en_emails = pd.read_pickle("Data/en_emails_raw.pkl")
full = pd.concat([ch_emails, en_emails])

In [56]:
full.dropna(inplace=True)
b64 = full[full.base64]

In [57]:
full_wo_b64 = full_processed[~full_processed.index.isin(b64.index)]

In [58]:
b64_messages = pd.read_json("b64.json", lines=True)
b64_messages.set_index("doc_id", inplace=True)
full_messages = pd.concat([full_wo_b64, b64_messages])

In [59]:
full_messages

Unnamed: 0_level_0,pretranslation,translated,processed
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
trec06c/data/000/023,IFRAME: http://www.dacong.com/bx.htm,IFRAME: http://www.dacong.com/bx.htm,iframe link
trec06c/data/000/007,尊敬的负责人（经理／财务）：您好！ 我是深圳伟仕嘉贸易有公司：兴办贸易、物资供销，实力雄厚；...,Dear person in charge (manager/finance): Hello...,dear person in charge managerfinance hello i a...
trec06c/data/000/014,"您好！很高兴认识您。我司有意与您们合作:可长久给您们带来 巨大的效益,另因合作项目的高度自动...",Hello! Nice to meet you. Our company is intere...,hello nice to meet you our company is interest...
trec06c/data/000/005,TO：贵公司经理、财务 您好！ 深圳市春洋贸易有限公司（东莞分公司）...,TO: Your company’s manager and finance manager...,to your company s manager and finance manager ...
trec06c/data/000/013,新型遥控飞机销路广 百元可办厂 玩具飞机品种繁多，但用微型小开关控制的新型能在空中自控飞翔的...,New remote-controlled airplanes are widely sol...,new remotecontrolled airplanes are widely sold...
...,...,...,...
trec06p/data/030/141,SÌ}~Å·B\r \µYê½ª èÜµÄ...,D\r - B\r ...\r A ზაა\r ¸R¸l¸l¸\r http://www.j...,d b a r l l link u b p u g num t c y num n...
trec06p/data/058/203,RIFFx~�WAVEfmt �����+��+����dataS~�...,RIFFx~ WAVEfmt + + dataS~ ~y{yw{ zuvx}}ytmjhed...,riffx wavefmt datas yyw zuvxytmjhedflnmknw p...
trec06p/data/014/167,dangerous cork blob thirteenth alliterate argu...,dangerous cork blob thirteenth alliterate argu...,dangerous cork blob thirteenth alliterate argu...
trec06p/data/025/231,\r :*.☆。o:☆';*。:*.☆。o:☆';*。:*.☆。o:☆';*。:*.☆...,:*.☆. o:☆';*. :*.☆. o:☆';*. :*.☆. o:☆';*. :*.☆...,o o o o the largest online commun...


In [60]:
full_df = pd.merge(full_messages, full, left_index=True, right_index=True, how="left")

In [62]:
full_df.to_pickle("Data/full_df.pkl")

In [65]:
full_df[full_df.index=="trec06p/data/025/231"].iloc[0].pretranslation

"\r \u3000\u3000\u3000:*.☆。o:☆';*。:*.☆。o:☆';*。:*.☆。o:☆';*。:*.☆。o:☆';*。:*.☆\r \r \u3000\u3000\u3000\u3000\u3000\u3000\u3000\u3000\u3000史上最大のONLINEコミュニティー誕生！\r \r \u3000\u3000\u3000:*.☆。o:☆';*。:*.☆。o:☆';*。:*.☆。o:☆';*。:*.☆。o:☆';*。:*.☆\r \r \u3000\u3000\u3000\u3000\u3000\u3000\r \r \u3000\u3000\u3000\u3000\u3000\u3000\u3000\u3000どうしてもオススメしたいサイトが誕生しました。\r \r \r \u3000\u3000\u3000やっとあったかくなってきたのに心はあまり暖かくならない。\r \u3000\u3000\u3000地球は温暖化しているのに、財布も心も暖まらない…\r \r \u3000\u3000\u3000\u3000\u3000\u3000\u3000そんな春になってしまいそうな方にプレゼントします！\r \r \r \u3000\u3000\u3000＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊＊\r \r \u3000\u3000\u3000\u3000オススメしたい理由があります！\r \r \u3000\u3000\u3000\u3000登録料・利用料\u3000・・・・・・・・・【無料】\r \u3000\u3000\u3000\u3000メールの送受信\u3000・・・・・・・・・【無料】\r \u3000\u3000\u3000\u3000ユーザーの検索\u3000・・・・・・・・・【無料】\r \u3000\u3000\u3000\u3000掲示板の閲覧・書込み\u3000・・・・・・【無料】\r \u3000\u3000\u3000\u3000画像交換・アップロード\u3000・・・・・【無料】\r \u3000\u3000\u3000\u3000アドレス交換・電話番号交換\u3000・・・【無料】\r \r \u3000\u3000\u3000\u3000全てのサービスが無料で楽しめるのは当た