In [1]:
import email
import email.header
import os
import re

import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Ignore every MarkupResemblesLocatorWarning for the rest of the session.
# NOTE(review): presumably bs4 emits it when the text handed to BeautifulSoup
# looks like a file name or URL rather than markup — confirm.
import warnings
from bs4 import MarkupResemblesLocatorWarning
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

In [3]:
# Location of the raw data; the pieces are combined with os.path.join when files are opened.
data_path = "Data/Raw_data"  # root folder that holds both corpora
chinese_path = "trec06c"  # corpus folder used for lang == "ch"
english_path = "trec06p"  # corpus folder used for lang == "en"
index_path = "full/index"  # label index file, relative to a corpus folder

In [6]:
# The index files are space-separated and have no header row, so the two
# columns are named right after loading.
labels_ch = pd.read_csv(
    os.path.join(data_path, chinese_path, index_path), sep=" ", header=None
).set_axis(["class", "subpath"], axis=1)
labels_en = pd.read_csv(
    os.path.join(data_path, english_path, index_path), sep=" ", header=None
).set_axis(["class", "subpath"], axis=1)

In [7]:
def html_remover(txt):
    """
    Strip the HTML markup from a text and keep only its text content.
    Input: text that may contain HTML
    Output: String
    """
    return BeautifulSoup(txt, 'html.parser').get_text()

In [50]:
def _decode_header_text(raw_header):
    """Decode a raw header value (as returned by ``message[name]``) into a plain ``str``.

    Propagates whatever ``email.header`` raises for a missing (``None``) or
    malformed header; the caller decides how to handle that.
    """
    return str(email.header.make_header(email.header.decode_header(raw_header)))


def _text_of_part(part):
    """Return the text of a ``text/plain`` or ``text/html`` part, ``None`` for any other type.

    HTML parts go through ``html_remover`` so only their text content is kept.
    """
    part_type = part.get_content_type()
    if part_type not in ("text/plain", "text/html"):
        return None
    # NOTE(review): get_payload() is called without decode=True, so bodies with a
    # base64 / quoted-printable transfer encoding stay encoded — confirm this is intended.
    text = part.get_payload()
    if part_type == "text/html":
        text = html_remover(text)
    return text


def extract_message(row, lang):
    """Parse the email file referenced by ``row`` and add its contents to the row.

    Parameters
    ----------
    row : mapping / pandas Series
        Must contain ``"subpath"``; its first three characters are dropped
        (presumably the leading ``"../"`` of the index entries — confirm) and the
        rest is joined onto the corpus folder.
    lang : str
        ``"en"`` reads from ``english_path`` with ``iso-8859-1``; ``"ch"`` reads
        from ``chinese_path`` with ``gb2312``.

    Returns
    -------
    The same ``row`` object. On success it gains ``subject``, ``email_to``,
    ``email_from``, ``contains_img`` and ``message``. If the file cannot be read
    or a header cannot be decoded, the row is returned early without the
    remaining fields (best-effort: such rows can be filtered out afterwards).

    Raises
    ------
    ValueError
        If ``lang`` is neither ``"en"`` nor ``"ch"``.
    """
    if lang == "en":
        corpus_path, encode_type = english_path, 'iso-8859-1'
    elif lang == "ch":
        corpus_path, encode_type = chinese_path, 'gb2312'
    else:
        # Previously an unknown language was swallowed by a bare except and the
        # row came back unchanged, which hid caller typos.
        raise ValueError(f"unsupported lang {lang!r}; expected 'en' or 'ch'")

    subpath = row["subpath"][3:]
    full_path = os.path.join(data_path, corpus_path, subpath)

    # First pass: read with the corpus default encoding.
    # `except Exception` keeps the best-effort behaviour but, unlike a bare
    # `except`, still lets KeyboardInterrupt stop a long-running apply.
    try:
        with open(full_path, "r", encoding=encode_type) as f:
            email_content = f.read()
        parsed_email_content = email.message_from_string(email_content)
    except Exception:
        return row

    # Second pass: if the message declares a different charset, re-read the file
    # with it; on any failure keep the parse obtained with the default encoding.
    new_encode_type = parsed_email_content.get_content_charset()
    if new_encode_type:
        new_encode_type = re.sub(r"charset|\"|=|'", "", new_encode_type)
        if new_encode_type != encode_type:
            try:
                with open(full_path, "r", encoding=new_encode_type) as f:
                    new_email_content = f.read()
                parsed_email_content = email.message_from_string(new_email_content)
            except Exception:
                # Unknown or unsuitable charset: the first parse is still bound
                # to parsed_email_content, so simply continue with it.
                pass

    # A missing or undecodable header aborts the extraction for this row.
    try:
        row["subject"] = _decode_header_text(parsed_email_content["subject"])
        row["email_to"] = _decode_header_text(parsed_email_content["to"])
        row["email_from"] = _decode_header_text(parsed_email_content["from"])
    except Exception:
        return row

    row["contains_img"] = False
    if parsed_email_content.is_multipart():
        message = []
        for part in parsed_email_content.get_payload():
            payload_type = part.get_content_type()
            if payload_type == "multipart/alternative" and part.is_multipart():
                # Look one level down; the is_multipart() guard skips parts that
                # claim to be multipart but carry a plain string payload.
                candidates = part.get_payload()
            elif payload_type == "image/png":
                # NOTE(review): only PNG attachments set the flag — confirm other
                # image types are meant to be ignored.
                row["contains_img"] = True
                continue
            else:
                candidates = [part]
            for candidate in candidates:
                # The type of each candidate itself decides whether HTML is
                # stripped (the old code tested the parent part's type here, so
                # HTML alternatives were appended with their tags intact).
                text = _text_of_part(candidate)
                if text is not None:
                    message.append(text)
        row["message"] = " ".join(message)
    else:
        message = parsed_email_content.get_payload()
        if parsed_email_content.get_content_type() == "text/html":
            message = html_remover(message)
        row["message"] = message
    return row

def extract_en(row):
    """Run ``extract_message`` on ``row`` with ``lang="en"``."""
    return extract_message(row, lang="en")


def extract_ch(row):
    """Run ``extract_message`` on ``row`` with ``lang="ch"``."""
    return extract_message(row, lang="ch")

In [51]:
# Parse every email listed in the Chinese index, one row at a time (axis=1).
# Rows for which extraction stopped early come back without all the new columns.
ch_emails = labels_ch.apply(extract_ch, axis=1)

In [53]:
# Keep only the rows in which every column has a value.
ch_emails = ch_emails.dropna()

In [56]:
# Persist the parsed Chinese emails so the extraction does not have to be repeated.
ch_emails.to_pickle("Data/ch_emails_raw.pkl")

In [57]:
# Parse every email listed in the English index, one row at a time (axis=1).
# Rows for which extraction stopped early come back without all the new columns.
en_emails = labels_en.apply(extract_en, axis=1)

In [59]:
# Keep only the rows in which every column has a value.
en_emails = en_emails.dropna()

In [62]:
# Persist the parsed English emails so the extraction does not have to be repeated.
en_emails.to_pickle("Data/en_emails_raw.pkl")

In [4]:
# Reload the parsed emails from the pickles written by this notebook.
ch_emails_raw = pd.read_pickle("Data/ch_emails_raw.pkl")
en_emails_raw = pd.read_pickle("Data/en_emails_raw.pkl")

In [24]:
# Build a corpus-qualified id for every email (corpus folder name followed by the
# subpath without its first two characters) and use it as the index.
ch_emails_raw["full_subpath"] = [f"trec06c{subpath[2:]}" for subpath in ch_emails_raw["subpath"]]
en_emails_raw["full_subpath"] = [f"trec06p{subpath[2:]}" for subpath in en_emails_raw["subpath"]]
ch_emails_raw = ch_emails_raw.set_index("full_subpath")
en_emails_raw = en_emails_raw.set_index("full_subpath")

In [26]:
# Overwrite the raw pickles with the re-indexed frames (same paths they were loaded from).
ch_emails_raw.to_pickle("Data/ch_emails_raw.pkl")
en_emails_raw.to_pickle("Data/en_emails_raw.pkl")

In [28]:
# Independent single-column frames that hold only the message text.
ch_emails_messages = ch_emails_raw.loc[:, ["message"]].copy()
en_emails_messages = en_emails_raw.loc[:, ["message"]].copy()

In [30]:
# Stack the Chinese and the English message frames on top of each other.
all_messages = pd.concat([ch_emails_messages, en_emails_messages])

In [32]:
# Persist the combined message frame.
all_messages.to_pickle("Data/all_messages.pkl")

In [3]:
# Reload the combined message frame from disk.
all_messages = pd.read_pickle("Data/all_messages.pkl")

In [46]:
# Keep a single row for every distinct message text.
all_messages = all_messages.drop_duplicates(subset="message")

In [48]:
# Persist the de-duplicated messages.
all_messages.to_pickle("Data/unique_messages.pkl")

In [3]:
# Reload the de-duplicated messages from disk.
all_messages = pd.read_pickle("Data/unique_messages.pkl")

In [4]:
# Number of rows in the de-duplicated message frame.
len(all_messages)

42885

In [5]:
# Cut the frame into five consecutive, roughly equal chunks; the final chunk
# absorbs whatever remains after the integer divisions.
nrows = len(all_messages)
cut_1, cut_2, cut_3, cut_4 = (k * nrows // 5 for k in range(1, 5))
first = all_messages[:cut_1]
second = all_messages[cut_1:cut_2]
third = all_messages[cut_2:cut_3]
fourth = all_messages[cut_3:cut_4]
last = all_messages[cut_4:]

In [None]:
# Write each of the five chunks to its own pickle file.
for chunk, target in (
    (first, "Data/first.pkl"),
    (second, "Data/second.pkl"),
    (third, "Data/third.pkl"),
    (fourth, "Data/fourth.pkl"),
    (last, "Data/last.pkl"),
):
    chunk.to_pickle(target)

In [5]:
# Draw 2000 random messages. The fixed random_state makes the draw repeatable,
# so a re-run of the notebook works on the same sample.
sample = all_messages.sample(n=2000, random_state=42)

In [6]:
# Persist the sampled messages.
sample.to_pickle("Data/sample.pkl")

In [57]:
# Display the sampled messages.
sample

Unnamed: 0_level_0,message
full_subpath,Unnamed: 1_level_1
trec06c/data/205/284,rerererererererererererererererererererererere...
trec06c/data/116/017,"己阅\n 标 题: Re: 哇,女生版怎么混的只剩女生了?\n \n ..."
trec06p/data/053/084,@\nFDB.dJ.FNBæ.K...
trec06p/data/084/129,"\nHello Taro Sato,\n\n I am a student in o..."
trec06p/data/057/268,I'm inclined to agree. These things are not e...
...,...
trec06p/data/025/000,\nWorld Top10 Branded Watches at 90% off \nthe...
trec06c/data/106/061,广州市中化贸易有限公司 \n尊...
trec06p/data/116/076,"To complete the inventory process, we have to ..."
trec06c/data/011/162,yei8xsg=?=\nX-Priority: 3\nX-Originating-IP: [...


In [66]:
# (id, message) pairs for every sampled row.
[(message_id, text) for message_id, text in zip(sample.index, sample["message"])]

[('trec06c/data/205/284',
  'rererererererererererererererererererererererererererererererererererererere\nrererererererererererererererererererererererererererererererererererererere\nrererererererererererererererererererererererererererererererererererererere\nrererererererererererererererererererererererererererererererererererererere\nrerererererererererererererererererererererererererererererererererere！\nbless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bles\ns~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~\nbless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bles\ns~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~\nbless~~~bless~~~\n         对一个女生，还有比求婚和生日更重要的日子吗？今天是她的生日了，决定和你分享\n     我们的故事。\n         大二，刚洗去新生的稚气，我们相遇在机房。在等机位玩星际的我，发现有个很可爱\n'),
 ('trec06c/data/116/017',
  '己阅\n     标  题: Re: 哇,女生版怎么混的只剩女生了?\n     \n     我们都不错\n     你也不错\n     \n     : 你水灌的挺多啊..呵呵..4w\n     \n     \n   

In [68]:
# Translator configured with source='auto' and English as the target language.
# NOTE(review): presumably this calls Google's online translation service —
# confirm that network access and sending the message text out are acceptable.
from deep_translator import GoogleTranslator
auto_translator = GoogleTranslator(source='auto', target='en')

In [75]:
# Message bodies and ids of the first ten sampled rows.
first_ten = sample.iloc[:10]
texts = list(first_ten["message"])
ids = list(first_ten.index)

In [72]:
# Inspect the message bodies of the first ten sampled rows.
texts

['rererererererererererererererererererererererererererererererererererererere\nrererererererererererererererererererererererererererererererererererererere\nrererererererererererererererererererererererererererererererererererererere\nrererererererererererererererererererererererererererererererererererererere\nrerererererererererererererererererererererererererererererererererere！\nbless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bles\ns~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~\nbless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bles\ns~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~bless~~~\nbless~~~bless~~~\n         对一个女生，还有比求婚和生日更重要的日子吗？今天是她的生日了，决定和你分享\n     我们的故事。\n         大二，刚洗去新生的稚气，我们相遇在机房。在等机位玩星际的我，发现有个很可爱\n',
 '己阅\n     标  题: Re: 哇,女生版怎么混的只剩女生了?\n     \n     我们都不错\n     你也不错\n     \n     : 你水灌的挺多啊..呵呵..4w\n     \n     \n     --\n     发信人: UK (独立小熊), 信区: Single                  