<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Package-import" data-toc-modified-id="Package-import-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Package import</a></span></li><li><span><a href="#Data-Loading" data-toc-modified-id="Data-Loading-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data Loading</a></span></li><li><span><a href="#Preprocessing" data-toc-modified-id="Preprocessing-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Preprocessing</a></span><ul class="toc-item"><li><span><a href="#parsing" data-toc-modified-id="parsing-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>parsing</a></span><ul class="toc-item"><li><span><a href="#typo_parser" data-toc-modified-id="typo_parser-3.1.1"><span class="toc-item-num">3.1.1&nbsp;&nbsp;</span>typo_parser</a></span></li><li><span><a href="#email_address_parser" data-toc-modified-id="email_address_parser-3.1.2"><span class="toc-item-num">3.1.2&nbsp;&nbsp;</span>email_address_parser</a></span></li><li><span><a href="#bytedata_parser" data-toc-modified-id="bytedata_parser-3.1.3"><span class="toc-item-num">3.1.3&nbsp;&nbsp;</span>bytedata_parser</a></span></li><li><span><a href="#structure_parser" data-toc-modified-id="structure_parser-3.1.4"><span class="toc-item-num">3.1.4&nbsp;&nbsp;</span>structure_parser</a></span></li><li><span><a href="#reference_parser" data-toc-modified-id="reference_parser-3.1.5"><span class="toc-item-num">3.1.5&nbsp;&nbsp;</span>reference_parser</a></span></li></ul></li><li><span><a href="#main-structural_email" data-toc-modified-id="main-structural_email-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>main structural_email</a></span><ul class="toc-item"><li><span><a href="#merged-features" data-toc-modified-id="merged-features-3.2.1"><span class="toc-item-num">3.2.1&nbsp;&nbsp;</span>merged features</a></span></li></ul></li><li><span><a href="#attribute-selection" data-toc-modified-id="attribute-selection-3.3"><span 
class="toc-item-num">3.3&nbsp;&nbsp;</span>attribute selection</a></span><ul class="toc-item"><li><span><a href="#module-test" data-toc-modified-id="module-test-3.3.1"><span class="toc-item-num">3.3.1&nbsp;&nbsp;</span>module test</a></span></li></ul></li></ul></li><li><span><a href="#Feature-engineering" data-toc-modified-id="Feature-engineering-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Feature engineering</a></span></li><li><span><a href="#EDA" data-toc-modified-id="EDA-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>EDA</a></span></li></ul></div>

## Package import 

In [1]:
import pandas as pd 
import numpy as np
import os
from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
import mailparser
import re


## Data Loading
- from file into DataFrame

In [2]:
def load_data_folder(path):
    """
    Read every document stored as <path>/<label>/<document> into memory.

    Path handling goes through os.path so the loader works with both "/" and
    "\\" separators instead of only with Windows-style paths.

    @param path: the train or test directory; each entry directly under it is
                 treated as one class folder
    @return: document list with [doc_path, doc, label, original_idx], where
             label is the class folder name and original_idx the file name
    """
    # explore all the folders under the directory
    folders = glob(os.path.join(path, "**"))

    docs = []
    for class_dir in folders:
        # the class label is the last path component of the class folder
        label = os.path.basename(class_dir)

        for doc_path in glob(os.path.join(class_dir, "**")):
            # the file name is the document id inside its class folder
            original_idx = os.path.basename(doc_path)

            with open(doc_path, encoding="UTF-8") as f:
                text = f.read()
            docs.append([doc_path, text, label, original_idx])

    print(f"\nLoaded folder under {path}: \n")
    for folder in folders:
        print(folder)

    return docs


# Load both splits into lists of [doc_path, text, label, original_idx];
# each call also prints the class folders it found.
corpus_train_docs = load_data_folder(path="../data/train")
corpus_test_docs = load_data_folder(path="../data/test")


Loaded folder under ../data/train: 

../data/train\alt.atheism
../data/train\comp.graphics
../data/train\comp.os.ms-windows.misc
../data/train\comp.sys.ibm.pc.hardware
../data/train\comp.sys.mac.hardware
../data/train\comp.windows.x
../data/train\misc.forsale
../data/train\rec.autos
../data/train\rec.motorcycles
../data/train\rec.sport.baseball
../data/train\rec.sport.hockey
../data/train\sci.crypt
../data/train\sci.electronics
../data/train\sci.med
../data/train\sci.space
../data/train\soc.religion.christian
../data/train\talk.politics.guns
../data/train\talk.politics.mideast
../data/train\talk.politics.misc
../data/train\talk.religion.misc

Loaded folder under ../data/test: 

../data/test\alt.atheism
../data/test\comp.graphics
../data/test\comp.os.ms-windows.misc
../data/test\comp.sys.ibm.pc.hardware
../data/test\comp.sys.mac.hardware
../data/test\comp.windows.x
../data/test\misc.forsale
../data/test\rec.autos
../data/test\rec.motorcycles
../data/test\rec.sport.baseball
../data/test

## Preprocessing

### parsing

In [3]:
# Wrap the loaded rows in DataFrames and expose the running row number as an
# explicit "global_index" column.
corpus_train = (
    pd.DataFrame(corpus_train_docs, columns=["doc_path", "text", "label", "original_idx"])
    .reset_index()
    .rename(columns={"index": "global_index"})
)

corpus_test = (
    pd.DataFrame(corpus_test_docs, columns=["doc_path", "text", "label", "original_idx"])
    .reset_index()
    .rename(columns={"index": "global_index"})
)

# original_idx (the file name) is not unique across the corpus, so report how
# many rows repeat an already-seen value: first train, then test.
print("original_idx duplicate count:", len(corpus_train) - len(corpus_train["original_idx"].drop_duplicates()), " on ", len(corpus_train))
print("original_idx duplicate count:", len(corpus_test) - len(corpus_test["original_idx"].drop_duplicates()), " on ", len(corpus_test))

original_idx duplicate count: 1060  on  11083
original_idx duplicate count: 770  on  7761


#### typo_parser

In [476]:
def typo_parser(x):
    r"""
    Normalise the raw email body before the other parsers run.

    Steps, in this order:
    1. surround every "\n" with a space on each side
    2. remove the irrelevant symbols "*", "|" and "^"
    3. compress any run of ">" and spaces that starts and ends with ">"
       (e.g. "> > >" or ">>>") into ">>" for further analysis
    4. remove text enclosed in "[...]" and in "(...)", also across lines
    5. replace a run of blank lines ("\n", spaces, "\n") with a single "\n"

    Runs of spaces are not compressed by this function.

    @param x: email body string
    @return: cleaned email body string

    # test_string = 'www.\n com\n\n or ?\n>\n    >>\n    \n > > >|> (note) \n> \n I\nam not good enough with regex>'
    # typo_parser(test_string)

    """
    # x = re.sub('([,:;?!\.”\)])\n', '\g<1> ', x)  # add space for symbol like .\n or ?\n
    # x = re.sub('(\w)\n(\w)', '\g<1> \g<2>', x)  # add space for symbol like word\nword
    x = re.sub("\n", " \n ", x)  # add a space on both sides of \n
    # Patterns are raw strings: "\[", "\(" etc. in a plain literal are invalid
    # escape sequences and raise a SyntaxWarning on recent Python versions.
    x = re.sub(r"[*|^]", "", x)  # remove irrelevant symbols "*", "|", "^"

    x = re.sub(r">[ >]*>", ">>", x)  # compress > [?] >
    x = re.sub(r"\[.*?\]", "", x, flags=re.S)  # separate for typo like [a)
    x = re.sub(r"\(.*?\)", "", x, flags=re.S)

    x = re.sub("\n[ \n]*\n", "\n", x)  # compress \n
    return x



#### email_address_parser

In [463]:
def email_address_parser(string):
    """
    Extract and remove email addresses from the body.

    An address is any run of non-whitespace characters containing "@". An
    adjacent space on either side is part of the match, so extracted entries
    can carry a leading/trailing space; every match is replaced by one space.

    @param string: email body string
    @return: (cleaned email body string, list of extracted emails)
    """
    # Raw string: "\S" in a plain literal is an invalid escape sequence.
    # The pattern is defined once so extraction and removal cannot drift apart.
    email_pattern = r" ?[\S]+@[\S]+ ?"
    emails = re.findall(email_pattern, string)
    string = re.sub(email_pattern, " ", string)
    return string, emails

#### bytedata_parser

In [359]:
def bytedata_parser(string, threshold=50):
    """
    Separate ordinary words from over-long spans such as encoded binary data.

    The text is split on single spaces only. A span longer than `threshold`
    characters is not treated as a word: it is dropped from the text and
    collected separately. The default of 50 follows the author's rationale
    that English words are far shorter, with headroom for special symbols.

    @param string: text to filter
    @param threshold: maximum length (inclusive) of a span kept as a word
    @return: (text re-joined with single spaces without the long spans,
              list of the long spans in their original order)
    """
    kept_words = []
    long_spans = []
    for span in re.split(" ", string):
        if len(span) > threshold:
            long_spans.append(span)
        else:
            kept_words.append(span)
    return " ".join(kept_words), long_spans

#### structure_parser

In [360]:
def structure_parser(string):
    """
    Split a raw email string into header, body and a few extra attributes.

    Parsing is best-effort: a failure does not raise. The exception is
    returned in the last slot of `others`, which always has four entries so
    that callers building fixed-width rows stay aligned.

    @param string: email string
    @return: (header, body, others) where
             header is the parsed header mapping ({} if parsing failed),
             body is the parsed body ("" if parsing failed),
             others is [date, delivered_to, to_domains, error_message];
             error_message is None on success, otherwise the caught exception
             (the first three entries are then None)
    """
    header = {}
    body = ""
    others = [None, None, None, None]
    try:
        mail = mailparser.parse_from_string(string)
        if mail.has_defects:  # [first line error]
            # retry once without the first line when the parser reports defects
            remove_first_line_string = "\n".join(string.split("\n")[1:])
            mail = mailparser.parse_from_string(remove_first_line_string)
        header, body = mail.headers, mail.body
        others = [mail.date, mail.delivered_to, mail.to_domains, None]

    except Exception as error:
        # Previously the error was stored in a local that was never returned and
        # `others` stayed empty, which produced ragged rows downstream.
        others = [None, None, None, error]
    return header, body, others

#### reference_parser

In [485]:
def extra_parser(x):
    """
    Drop the attribution flag ("In article ... writes:") and compress spaces.

    NOTE(review): the attribution pattern is greedy and matches across lines,
    so everything from the start of the text up to and including the LAST
    "writes:" is removed - confirm this is intended for texts with several
    attributions.

    @param x: text to clean
    @return: text without the attribution part, runs of spaces reduced to one
    """
    without_flag = re.sub("(?:In article)?.*writes:", "", x, flags=re.S)
    return re.sub(" {2,}", " ", without_flag)

def reference_parser(string, match_type=2):
    r"""
    Consider reply with referencing previous email, we need to separate them to make prediction separately.

    - reply: every line that contains no ">" at all, joined with spaces
    - previous_one: text following exactly one ">" (first-level quote)
    - previous_two: text following exactly two ">" (second-level quote)
    A quoted line is only captured when it is terminated by "\n". Each part is
    post-processed with extra_parser.

    @param
        string: email body string
        match_type: 0 with return only main body, 1 with return main body + previous one reference, 2 with more reference
    @return:
        reply, previous_one, previous_two in the email ('' for parts that are not requested)


    @ test with the following code
    string = " \n\n\n\n    >>>zero email \n\n >>first email\n >second email\n reply email \n"
    reply, previous_one, previous_two = reference_parser(string, match_type=2)
    print("## reply\n", repr(reply))
    print("## previous_one\n", repr(previous_one))
    print("## previous_two\n", repr(previous_two))
    """

    previous_one, previous_two = '', ''

    # extract reply without any line containing ">"
    reply = " ".join([s for s in string.split("\n") if ">" not in s])
    reply = extra_parser(reply)

    # add "\n" before string so a quote marker at the very start still has a
    # preceding non-">" character to match [^>]{1}
    searchable = "\n" + string

    # Patterns are raw strings: "\S" in a plain literal is an invalid escape sequence.
    if match_type > 0:
        previous_one = " ".join(re.findall(r"[^>]{1}>{1}([^>]{1}[\S ]*)\n", searchable))  # matching >
        previous_one = extra_parser(previous_one)

    if match_type > 1:  # flag reference_two
        previous_two = " ".join(re.findall(r"[^>]{1}>{2}([^>]{1}[\S ]*)\n", searchable))  # matching >>
        previous_two = extra_parser(previous_two)
    # previous_two_more_pt = r"[^>]{1}>{2,}([^>]{1}[\S ]*)\n" # matching >> or >>> more
    return reply, previous_one, previous_two

### main structural_email 

In [487]:

def structural_email(data, bytedata_parser_threshold=50, reference_parser_match_type=2):
    """
    Parser pipeline that turns raw email strings into one structured table.

    The order of the stages matters, each one consumes the previous output:
    1. structure_parser: raw string => header, body, others
    2. typo_parser: body => body without typos / irrelevant symbols
    3. email_address_parser: => body without email addresses + extracted emails
    4. bytedata_parser: => body without over-long spans + extracted spans
    5. reference_parser: => reply, previous_one, previous_two

    @param data: iterable of raw email strings (e.g. the "text" series of the
                 training set or test set)
    @param bytedata_parser_threshold: threshold forwarded to bytedata_parser
    @param reference_parser_match_type: match_type forwarded to reference_parser
    @return: DataFrame with the header fields, then "reply", "reference_one",
             "reference_two", then "date", "delivered_to", "to_domains",
             "error_message", "contained_emails", "long_string"
    """
    print("Preprocessing for unstructure email...")

    header_rows, body_rows, other_rows = [], [], []
    for raw_text in tqdm(data):
        header, body, others = structure_parser(raw_text)
        cleaned_body = typo_parser(body)
        body_without_email, emails = email_address_parser(cleaned_body)
        body_without_binary, bytedata = bytedata_parser(
            body_without_email, threshold=bytedata_parser_threshold
        )
        reply, previous_one, previous_two = reference_parser(
            body_without_binary, match_type=reference_parser_match_type
        )

        header_rows.append(header)
        body_rows.append([reply, previous_one, previous_two])
        other_rows.append(others + [emails] + [bytedata])

    # one frame per group of columns, glued together side by side
    frames = [
        pd.DataFrame.from_dict(header_rows),
        pd.DataFrame(body_rows, columns=["reply", "reference_one", "reference_two"]),
        pd.DataFrame(
            other_rows,
            columns=["date", "delivered_to", "to_domains", "error_message", "contained_emails", "long_string"],
        ),
    ]
    return pd.concat(frames, axis=1)

#### merged features 

In [None]:
# Run the parsing pipeline on the raw text of each split, using the default
# bytedata threshold and reference match type.
structural_train = structural_email(corpus_train["text"])
structural_test = structural_email(corpus_test["text"])

  0%|                                                                                | 1/11083 [00:00<25:29,  7.25it/s]

Preprocessing for unstructure email...


 54%|████████████████████████████████████████▊                                   | 5957/11083 [00:55<00:44, 114.32it/s]Email content 'x-usenet-faq' not handled
Email content 'x-usenet-faq' not handled
Email content 'x-usenet-faq' not handled
100%|████████████████████████████████████████████████████████████████████████████| 11083/11083 [02:17<00:00, 80.31it/s]
  0%|▎                                                                              | 27/7761 [00:00<00:28, 267.29it/s]

Preprocessing for unstructure email...


 81%|███████████████████████████████████████████████████████████████▌              | 6320/7761 [01:06<00:26, 55.18it/s]

In [None]:
# Put the parsed columns next to the original corpus columns (column-wise concat).
train = pd.concat([corpus_train, structural_train], axis=1)
test = pd.concat([corpus_test, structural_test], axis=1)
# List every available column name of the merged training frame.
all_cols = train.columns.tolist()
print(all_cols)

### attribute selection

In [None]:
# Number of missing values per column, sorted ascending.
t = train.isnull().sum().sort_values()
# Columns missing in more than 10% of the training rows are reported as not used.
not_used_cols = t[t > train.shape[0]*0.1].index.tolist()
print("not_used_cols: \n", not_used_cols)

# Hand-picked list of columns to keep; it is written out explicitly and is not
# derived from not_used_cols.
select_cols = ["global_index", "doc_path", "label", "reply", "reference_one", "reference_two",
               "Subject", "From", "Lines", "Organization", "contained_emails", "long_string", "text"]
print("may use cols: \n", select_cols)
# Display the per-column missing-value counts as a table.
t.to_frame().style

In [None]:
# Keep only the selected attributes. `.copy()` gives each frame its own data, so
# later column assignments such as train["char_length"] = ... operate on an
# independent frame instead of a selection derived from another frame
# (which pandas flags with SettingWithCopyWarning).
train = train[select_cols].copy()
test = test[select_cols].copy()

In [None]:
# Show three parsed examples; the fixed random_state keeps the sample reproducible.
train[["global_index", "reply", "reference_one", "reference_two",]].sample(3, random_state=1160).style

#### module test

In [None]:
def checking_text(idx, write_in_local=True, out_path="E:/wyang_github/Text-Classification/text.txt", data=None):
    """
    Look up one document by its global_index and optionally dump it to a file.

    @param idx: value of the "global_index" column identifying the document
    @param write_in_local: when True, write label, path and raw text to out_path
    @param out_path: file that receives the dump; the default is the original
                     hard-coded location, pass another path on other machines
    @param data: DataFrame with the columns "global_index", "text", "reply",
                 "doc_path" and "label"; defaults to the notebook-level `train`
    @return: (raw text, parsed reply, document path, label) of the first
             matching row
    @raise IndexError: if no row has the requested global_index
    """
    # fall back to the notebook-level frame so existing calls keep working
    frame = train if data is None else data

    x = frame[frame["global_index"] == idx]
    string = x["text"].iloc[0]
    body = x["reply"].iloc[0]
    x_path = x["doc_path"].iloc[0]
    x_label = x["label"].iloc[0]

    if write_in_local:
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(x_label+"\n\n")
            f.write(x_path+"\n\n")
            f.write(string)
    return string, body, x_path, x_label


# Switch for the step-by-step walkthrough of the parser pipeline on one document.
module_test = True

if module_test:
    # This could be split out into a separate .py file, with the process here
    # saved and then written up in the report.
    # idx = 22
    idx = 9187

    # With the default write_in_local=True this call also writes the document
    # to the text file path hard-coded in checking_text.
    string, reply, x_path, x_label = checking_text(idx)

    # Stage 1: raw string => header, body, others
    header, body, others = structure_parser(string)
    print("\nrepr(header):   \n", repr(header))
    print("\nrepr(body):   \n", repr(body))
    print("\nrepr(others):   \n", repr(others))

    # Stage 2: clean symbols, bracketed text and blank lines
    body = typo_parser(body)
    print("\nrepr(body):   \n", repr(body))

    # Stage 3: extract email addresses (body is printed again for comparison
    # with body_no_email)
    body_no_email, emails = email_address_parser(body)
    print("\nrepr(body):   \n", repr(body))
    print("\nrepr(emails):   \n", repr(emails))
    print("\nrepr(body_no_email):   \n", repr(body_no_email))

    # Stage 4: remove over-long spans.
    # NOTE(review): threshold=25 here differs from the pipeline default of 50.
    body_no_binary_no_email, bytedata = bytedata_parser(body_no_email, threshold=25)
    print("\nrepr(bytedata):   \n", repr(bytedata))
    print("\nrepr(body_no_binary_no_email):   \n", repr(body_no_binary_no_email))

    # Stage 5: separate the reply from the quoted references
    reply, previous_one, previous_two = reference_parser(body_no_binary_no_email, match_type=2)

    print("\nrepr(reply):   \n", repr(reply))
    print("\nrepr(previous_one):   \n", repr(previous_one))
    print("\nrepr(previous_two):   \n", repr(previous_two))

In [368]:
# Number of over-long spans collected for each document.
l = train['long_string'].dropna().apply(len)
# NOTE(review): this filter is neither assigned nor the last expression of the
# cell, so its result is not displayed.
l[l>0]
# Show the collected spans of 10 randomly sampled documents.
train['long_string'].sample(10).tolist()

[[],
 [],
 [],
 [],
 [],
 ['\n--------------------------------------------------------------------\n'],
 [],
 [],
 [],
 ['\nTony\n-----------------------------------------------------------------------\n--',
  '//\n-------------------------------------------------------------------\n--']]

In [369]:
# Replies that still contain the literal "In article" after parsing, counted per label.
t = train[train["reply"].apply(lambda x: "In article" in x)]
t.groupby("label").size()

label
alt.atheism                 102
comp.graphics                20
comp.os.ms-windows.misc      35
comp.sys.ibm.pc.hardware     30
comp.sys.mac.hardware        38
comp.windows.x               34
misc.forsale                  7
rec.autos                    40
rec.motorcycles              79
rec.sport.baseball           65
rec.sport.hockey             11
sci.crypt                    87
sci.electronics              35
sci.med                     119
sci.space                    69
soc.religion.christian       81
talk.politics.guns           89
talk.politics.mideast       142
talk.politics.misc           75
talk.religion.misc           69
dtype: int64

## Feature engineering

In [348]:
# Show five random parsed replies; no random_state is set, so the sample changes on each run.
train[["reply", "doc_path", "label"]].sample(5).style

Unnamed: 0,reply,doc_path,label
7693,"For those of you interested in the above Procedure, I am able to add the following facts: 1) This Procedure is not done in Philadelphia. 2) It is performed in Maryland at Johns Hopkins for corrections between 0 and -5 and from -10 to -20 3) It is performed in New York City at Manhattan Eye and Ear for corrections between 0 and -6. The magic words to use when requesting information on this is not PRK (they think you mean RK) but the excimer laser study This will get you to the proper people. -- Barry D. Benowitz EMail: Phone: +1 609 866 1000 x354 Snail: Telesciences CO Systems, 351 New Albany Rd, Moorestown, NJ, 08057-1177",../data/train\sci.med\59190,sci.med
5702,"The tater that Jack Morris served to Griffey the Younger in his first at-bat this year went 394 feet, if I remember right (I'll have to check my scorecard at home) I think that's the longest so far in the Kingdome through the first stand there. A weak showing, despite some promising taterball candidates . . Ben McDonald, Rich DeLucia, and the rest of the Mariner bullpen . . making appearances. Anyone have the tape-measure value for Omar Vizquel's grand slam in the Skydome? --- Jeff Brown Big Enchilada of the Brown Bag Lunches Astronomy Dept. U. of Washington",../data/train\rec.sport.baseball\104798,rec.sport.baseball
3041,"Is there aything available for X similar to QuicKeys for the Macintosh -- something that will allow me to store and playback sequences of keystrokes, menu selections, and mouse actions - directing them towards another application? If so, could someone send me information on its availability -- and if not, how hard do we think it might be to send input to other X applications and, hopefully, deal with their responses appropriately? (If an application is going to take a few seconds to process I probably have to wait for it to complete before sending another command. thanks, david,",../data/train\comp.windows.x\67015,comp.windows.x
4004,The subject says it all. My 1984 Chev S10 Pickup's left turn signal does not stop after turning. What cause this to stop automaticaly? Is this a mechanical problem by the steering wheel? NOTE: This truck has an after market steering wheel installed.,../data/train\rec.autos\101554,rec.autos
3889,"FOR SALE - Steyr GB 9mm is an excellent handgun for the first time buyer or an experienced handgunner. It is in excellent condition. I never had a misfire with it. Make: Steyr Model GB 9mm Parabellum Magazine: 18 rounds Barrel: Hard-chrome-plated inside and outside for long term durability and wear resistance. Fixed mount. Price: $375, obo. Comes with 2 magazines, original owner's manual. Contact: T. Ahmad, ihlpm!tarq, 979-0838",../data/train\misc.forsale\76292,misc.forsale


## EDA

In [332]:
# Character count of the raw text of every training document (adds a column to `train`).
train["char_length"] = train["text"].apply(lambda x:len(x))

# Disabled draft of a length-distribution plot, kept for reference.
# c = (train["char_length"].sort_values())
# sent_cdf = c.cumsum()
# sent_pdf = c # / c.sum()
# sent_pdf.plot(kind="hist", bins=100)
# plt.xlabel("char_length")  # must be sorted first before this can be called the index of chars.
# plt.ylabel("char_cum_counts_perc")
# plt.title("MAX_DOC_LEN CDF")
# plt.show()

In [333]:
# Display the documents ordered from shortest to longest raw text; the result is not assigned.
train.sort_values(by="char_length")

Unnamed: 0,global_index,doc_path,label,reply,reference_one,reference_two,Subject,From,Lines,Organization,contained_emails,long_string,text,char_length
494,494,../data/train\comp.graphics\37928,comp.graphics,,,,Re: Graphics Library Package,hl7204@eehp22 (H L),2,University of Illinois at Urbana,[],[],From: hl7204@eehp22 (H L)\nSubject: Re: Graphi...,125
2402,2402,../data/train\comp.sys.mac.hardware\51522,comp.sys.mac.hardware,art,,,modem question,kwgst+@pitt.edu (Mr. Someone),2,pre-EE,[],[],From: kwgst+@pitt.edu (Mr. Someone)\nSubject: ...,125
4039,4039,../data/train\rec.autos\101589,rec.autos,hello testing,,,just testing,swdwan@napier.uwaterloo.ca (Donald Wan),3,University of Waterloo,[],[],From: swdwan@napier.uwaterloo.ca (Donald Wan)\...,131
2421,2421,../data/train\comp.sys.mac.hardware\51541,comp.sys.mac.hardware,,,,re: Dead mouse ?,news@news.claremont.edu (The News System),1,"Harvey Mudd College, Claremont CA 91711",[],[],From: news@news.claremont.edu (The News System...,139
3595,3595,../data/train\misc.forsale\75911,misc.forsale,,,,Terminal forsale,ibeshir@nyx.cs.du.edu (Ibrahim),1,"Nyx, Public Access Unix @ U. of Denver Math/CS...",[],[],From: ibeshir@nyx.cs.du.edu (Ibrahim)\nSubject...,142
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10176,10176,../data/train\talk.politics.mideast\76392,talk.politics.mideast,Accounts of Anti-Armenian Human Right Violatio...,,,Accounts of Anti-Armenian Human Right Violatio...,dbd@urartu.sdpa.org (David Davidian),912,S.D.P.A. Center for Regional Studies,[ dbd@urartu. ],[+--------------------------------------------...,From: dbd@urartu.sdpa.org (David Davidian)\nSu...,62187
1362,1362,../data/train\comp.os.ms-windows.misc\9704,comp.os.ms-windows.misc,"Hi, everybody: I guess my subject has said i...","gmu. edu writes: #! _F #CD , 56# #D. ! =>_D ...","G P---7+'U0/, PURM8MMVG<G(4U0KQRM4RM8QWG-5145....",Re: More Cool BMP files??,james@dlss2 (James Cummings),1021,RedRock Development,"[ 17301@gmuvax2. , rwang@gmuvax2. , %@"" , @...","[6VF""3YMU-34\nMU-34U-2EI=34I=2HU-34U74U-34U7U/...",From: james@dlss2 (James Cummings)\nSubject: R...,63095
2826,2826,../data/train\comp.windows.x\66322,comp.windows.x,"Archive-name: Xt-FAQ Version: $Id: FAQ-Xt, v 1...",#include <X11/StringDefs. h> \ntypedef struct ...,,comp.windows.x.intrinsics Frequently Asked Que...,ware@cis.ohio-state.edu (Peter Ware),1609,The Ohio State University Dept. of Computer an...,"[ ware@cis. , cathyr@ora. , adrian@ora. , g...",[\n-------------------------------------------...,From: ware@cis.ohio-state.edu (Peter Ware)\nSu...,66459
9625,9625,../data/train\talk.politics.guns\54684,talk.politics.guns,Try the firearms archive. Larry Cipriani's ins...,Date: 8 Apr 93 22: 50: 09 GMT prompt asking...,,Re: Need Senate Bill numbers and House Resolut...,mjp@watson.ibm.com (Michael Phelps),1835,IBM Kingston NY,"[ lvc@cbvox1. , 7274@cbnews. , magnum@mimsy....",[directory\n\t/usr0/anon/pub/firearms/politics...,From: mjp@watson.ibm.com (Michael Phelps)\nSub...,71400
