<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Package-import" data-toc-modified-id="Package-import-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Package import</a></span></li><li><span><a href="#Data-Loading" data-toc-modified-id="Data-Loading-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data Loading</a></span><ul class="toc-item"><li><span><a href="#parsing" data-toc-modified-id="parsing-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>parsing</a></span></li><li><span><a href="#attribute-selection" data-toc-modified-id="attribute-selection-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>attribute selection</a></span></li></ul></li><li><span><a href="#Feature-engineering" data-toc-modified-id="Feature-engineering-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Feature engineering</a></span></li><li><span><a href="#EDA" data-toc-modified-id="EDA-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>EDA</a></span><ul class="toc-item"><li><span><a href="#Regex" data-toc-modified-id="Regex-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Regex</a></span></li></ul></li><li><span><a href="#Build-global-index" data-toc-modified-id="Build-global-index-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Build global index</a></span></li></ul></div>

## Package import 

In [1]:
import pandas as pd 
import numpy as np
import os
from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
import mailparser



## Data Loading
- from file into DataFrame

In [135]:
def load_data_folder(path):
    '''
    @param folders: the train or test directory
    @return: document list with [doc_path, doc, label, original_idx]
    '''
    folders = glob(path+'/**')  # explore all the folder under the directory

    docs = []
    for classes in folders:
        label = classes.split('\\')[-1]
        doc_paths = glob(classes+'\\**')
        
        for doc_path in doc_paths:
            original_idx = doc_path.split('\\')[-1]
            
            with open(doc_path, encoding='UTF-8') as f:
                text = f.read()
            docs.append([doc_path, text, label, original_idx])

    print(f'\nLoaded folder under {path}: \n')
    for folder in folders:
        print(folder)
        
    return docs


corpus_train_docs = load_data_folder(path='../data/train')
corpus_test_docs = load_data_folder(path='../data/test')


Loaded folder under ../data/train: 

../data/train\alt.atheism
../data/train\comp.graphics
../data/train\comp.os.ms-windows.misc
../data/train\comp.sys.ibm.pc.hardware
../data/train\comp.sys.mac.hardware
../data/train\comp.windows.x
../data/train\misc.forsale
../data/train\rec.autos
../data/train\rec.motorcycles
../data/train\rec.sport.baseball
../data/train\rec.sport.hockey
../data/train\sci.crypt
../data/train\sci.electronics
../data/train\sci.med
../data/train\sci.space
../data/train\soc.religion.christian
../data/train\talk.politics.guns
../data/train\talk.politics.mideast
../data/train\talk.politics.misc
../data/train\talk.religion.misc

Loaded folder under ../data/test: 

../data/test\alt.atheism
../data/test\comp.graphics
../data/test\comp.os.ms-windows.misc
../data/test\comp.sys.ibm.pc.hardware
../data/test\comp.sys.mac.hardware
../data/test\comp.windows.x
../data/test\misc.forsale
../data/test\rec.autos
../data/test\rec.motorcycles
../data/test\rec.sport.baseball
../data/test

### parsing

In [136]:
corpus_train = pd.DataFrame(corpus_train_docs, columns=['doc_path', 'text', 'label', 'original_idx'])
corpus_train = corpus_train.reset_index().rename(columns={'index':'global_index'})

corpus_test = pd.DataFrame(corpus_test_docs, columns=['doc_path', 'text', 'label', 'original_idx'])
corpus_test = corpus_test.reset_index().rename(columns={'index':'global_index'})

print("original_idx duplicate count:", corpus_train.shape[0] - corpus_train.original_idx.drop_duplicates().shape[0], ' on ', corpus_train.shape[0])
print("original_idx duplicate count:", corpus_test.shape[0] - corpus_test.original_idx.drop_duplicates().shape[0], ' on ', corpus_test.shape[0])

original_idx duplicate count: 1060  on  11083
original_idx duplicate count: 770  on  7761


In [137]:
def parser(string):
    '''
    @param parser: email string
    @return: structural information for email header, body, others
    '''
    error_message = None
    header = {}
    body = None
    others = []
    try:
        mail = mailparser.parse_from_string(string)
        if mail.has_defects:  # [first line error]
            remove_first_line_string = "\n".join(string.split('\n')[1:])
            mail = mailparser.parse_from_string(remove_first_line_string)
            # print('remove_first_line_string update for ')
        header, body = mail.headers, mail.body
        others = [mail.date, mail.delivered_to, mail.to_domains, error_message]

    except Exception as error:
        error_message = error
    return header, body, others


def structural_email(data):
    '''
    @param data: data text dataframe series including all the training set or test set
    @return: structural information
    '''
    header_info = []
    body_info = []
    others_info = []
    for string in data:
        header, body, others = parser(string)
        header_info.append(header)
        body_info.append(body)
        others_info.append(others)

    a1 = pd.DataFrame.from_dict(header_info)
    a2 = pd.DataFrame(body_info, columns=['body'])
    a3 = pd.DataFrame(others_info, columns=[
                      'date', 'delivered_to', 'to_domains', 'error_message'])
    structure_email = pd.concat([a1, a2, a3], axis=1)
    return structure_email

In [None]:
structural_train = structural_email(corpus_train['text'])
structural_test = structural_email(corpus_test['text'])

In [138]:
train = pd.concat([corpus_train, structural_train], axis=1)
test = pd.concat([corpus_test, structural_test], axis=1)

Email content 'x-usenet-faq' not handled
Email content 'x-usenet-faq' not handled
Email content 'x-usenet-faq' not handled


### attribute selection

In [178]:
t = train.isnull().sum().sort_values()
not_used_cols = t[t > train.shape[0]*0.1].index.tolist()
print("not_used_cols: \n", not_used_cols)

used_cols = t[t <= train.shape[0]*0.1].index.tolist()
print('may use cols: \n', used_cols)
train[used_cols].head()

not_used_cols: 
may use cols: 
 ['global_index', 'doc_path', 'text', 'label', 'original_idx', 'to_domains', 'body', 'delivered_to', 'Subject', 'From', 'Lines', 'Organization']


Unnamed: 0,global_index,doc_path,text,label,original_idx,to_domains,body,delivered_to,Subject,From,Lines,Organization
0,0,../data/train\alt.atheism\49960,From: mathew <mathew@mantis.co.uk>\nSubject: A...,alt.atheism,49960,[],Archive-name: atheism/resources\nAlt-atheism-a...,[],Alt.Atheism FAQ: Atheist Resources,mathew <mathew@mantis.co.uk>,290,"Mantis Consultants, Cambridge. UK."
1,1,../data/train\alt.atheism\51060,From: mathew <mathew@mantis.co.uk>\nSubject: A...,alt.atheism,51060,[],Archive-name: atheism/introduction\nAlt-atheis...,[],Alt.Atheism FAQ: Introduction to Atheism,mathew <mathew@mantis.co.uk>,646,"Mantis Consultants, Cambridge. UK."
2,2,../data/train\alt.atheism\51119,From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro...,alt.atheism,51119,[],In article <65974@mimsy.umd.edu>\nmangoe@cs.um...,[],Re: Gospel Dating,I3150101@dbstu1.rz.tu-bs.de (Benedikt Rosenau),93,"Technical University Braunschweig, Germany"
3,3,../data/train\alt.atheism\51120,From: mathew <mathew@mantis.co.uk>\nSubject: R...,alt.atheism,51120,[],dmn@kepler.unh.edu (...until kings become phil...,[],Re: university violating separation of church/...,mathew <mathew@mantis.co.uk>,29,"Mantis Consultants, Cambridge. UK."
4,4,../data/train\alt.atheism\51121,From: strom@Watson.Ibm.Com (Rob Strom)\nSubjec...,alt.atheism,51121,[],In article <N4HY.93Apr5120934@harder.ccr-p.ida...,[],"Re: [soc.motss, et al.] ""Princeton axes matchi...",strom@Watson.Ibm.Com (Rob Strom),15,IBM Research


In [None]:
select_cols = ['label', 'body', 'global_index',
               'Subject', 'From', 'Lines', 'Organization']

In [179]:
train = train[select_cols]
test = test[select_cols]

Unnamed: 0,label,body,global_index,Subject,From,Lines,Organization
0,alt.atheism,In article <healta.153.735242337@saturn.wwc.ed...,0,Re: about the bible quiz answers,decay@cbnewsj.cb.att.com (dean.kaflowitz),18,AT&T
1,alt.atheism,"In article <timmbake.735265296@mcl>, timmbake@...",1,Re: Amusing atheists and agnostics,cfaehl@vesta.unm.edu (Chris Faehl),88,"University of New Mexico, Albuquerque"
2,alt.atheism,jaeger@buphy.bu.edu (Gregg Jaeger) writes:\n>I...,2,Re: Yet more Rushdie [Re: ISLAMIC LAW],mathew <mathew@mantis.co.uk>,50,"Mantis Consultants, Cambridge. UK."
3,alt.atheism,"In article 11853@vice.ICO.TEK.COM, bobbe@vice....",3,Re: Christian Morality is,"dps@nasa.kodak.com (Dan Schaertel,,,)",21,Eastman Kodak Company
4,alt.atheism,In article <930419.104739.2t8.rusnews.w165w@ma...,4,"Re: After 2000 years, can we say that Christia...",halat@panther.bears (Jim Halat),129,
...,...,...,...,...,...,...,...
7756,talk.religion.misc,pboxrud@magnus.acs.ohio-state.edu (Paul D Boxr...,7756,Re: Religion and marriage,sbuckley@fraser.sfu.ca (Stephen Buckley),37,"Simon Fraser University, Burnaby, B.C., Canada"
7757,talk.religion.misc,"In article <1993Apr23.111105.7703@ifi.uio.no>,...",7757,Re: A Message for you Mr. President: How do yo...,bakerj@gtephx.UUCP (Jon Baker),37,"AG Communication Systems, Phoenix, Arizona"
7758,talk.religion.misc,In article <1rc1f3INN7rl@emx.cc.utexas.edu> \n...,7758,Re: Why did they behave as they did (Waco--rea...,pharvey@quack.kfu.com (Paul Harvey),18,"The Duck Pond public unix: +1 408 249 9630, lo..."
7759,talk.religion.misc,\nIn article <1993Apr26.231845.13843@digi.lone...,7759,Re: Info about New Age!\n <1dx802lO40Rq01@JUTS...,<KEVXU@CUNYVM.BITNET>,46,City University of New York


## Feature engineering

## EDA

In [10]:
train['char_length'] = train['text'].apply(lambda x:len(x))

# c = (train['char_length'].sort_values())
# sent_cdf = c.cumsum()
# sent_pdf = c # / c.sum()
# sent_pdf.plot(kind='hist', bins=100)
# plt.xlabel("char_length")  # 需要先sort, 才能说是index of chars.
# plt.ylabel("char_cum_counts_perc")
# plt.title("MAX_DOC_LEN CDF")
# plt.show()

In [11]:
train.sort_values(by='char_length')

Unnamed: 0,global_index,doc_path,text,label,original_idx,char_length
494,494,../data/train\comp.graphics\37928,From: hl7204@eehp22 (H L)\nSubject: Re: Graphi...,comp.graphics,37928,125
2402,2402,../data/train\comp.sys.mac.hardware\51522,From: kwgst+@pitt.edu (Mr. Someone)\nSubject: ...,comp.sys.mac.hardware,51522,125
4039,4039,../data/train\rec.autos\101589,From: swdwan@napier.uwaterloo.ca (Donald Wan)\...,rec.autos,101589,131
2421,2421,../data/train\comp.sys.mac.hardware\51541,From: news@news.claremont.edu (The News System...,comp.sys.mac.hardware,51541,139
3595,3595,../data/train\misc.forsale\75911,From: ibeshir@nyx.cs.du.edu (Ibrahim)\nSubject...,misc.forsale,75911,142
...,...,...,...,...,...,...
10176,10176,../data/train\talk.politics.mideast\76392,From: dbd@urartu.sdpa.org (David Davidian)\nSu...,talk.politics.mideast,76392,62187
1362,1362,../data/train\comp.os.ms-windows.misc\9704,From: james@dlss2 (James Cummings)\nSubject: R...,comp.os.ms-windows.misc,9704,63095
2826,2826,../data/train\comp.windows.x\66322,From: ware@cis.ohio-state.edu (Peter Ware)\nSu...,comp.windows.x,66322,66459
9625,9625,../data/train\talk.politics.guns\54684,From: mjp@watson.ibm.com (Michael Phelps)\nSub...,talk.politics.guns,54684,71400


Unnamed: 0,From,Subject,Summary,Keywords,Expires,Distribution,Organization,Supersedes,Lines,X-Newsreader,...,Followups-to,X-Newsposter,X-Header,X-Cc,Oanization,body,date,delivered_to,to_domains,error_message
0,mathew <mathew@mantis.co.uk>,Alt.Atheism FAQ: Atheist Resources,"Books, addresses, music -- anything related to...","FAQ, atheism, books, music, fiction, addresses...","Thu, 29 Apr 1993 11:57:19 GMT",world,"Mantis Consultants, Cambridge. UK.",<19930301143317@mantis.co.uk>,290,,...,,,,,,Archive-name: atheism/resources\nAlt-atheism-a...,,[],[],
1,mathew <mathew@mantis.co.uk>,Alt.Atheism FAQ: Introduction to Atheism,Please read this file before posting to alt.at...,"FAQ, atheism","Thu, 6 May 1993 12:22:45 GMT",world,"Mantis Consultants, Cambridge. UK.",<19930308134439@mantis.co.uk>,646,,...,,,,,,Archive-name: atheism/introduction\nAlt-atheis...,,[],[],
2,I3150101@dbstu1.rz.tu-bs.de (Benedikt Rosenau),Re: Gospel Dating,,,,,"Technical University Braunschweig, Germany",,93,,...,,,,,,In article <65974@mimsy.umd.edu>\nmangoe@cs.um...,,[],[],
3,mathew <mathew@mantis.co.uk>,Re: university violating separation of church/...,,,,,"Mantis Consultants, Cambridge. UK.",,29,rusnews v1.01,...,,,,,,dmn@kepler.unh.edu (...until kings become phil...,,[],[],
4,strom@Watson.Ibm.Com (Rob Strom),"Re: [soc.motss, et al.] ""Princeton axes matchi...",,,,usa,IBM Research,,15,,...,,,,,,In article <N4HY.93Apr5120934@harder.ccr-p.ida...,,[],[],
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11078,psyrobtw@ubvmsd.cc.buffalo.edu (Robert Weiss),18 Apr 93 God's Promise in Philippians 4:9,,,,,University at Buffalo,,8,,...,,,,,,"\n\tThose things,\n\twhich ye have both learne...",,[],[],
11079,sandvik@newton.apple.com (Kent Sandvik),Re: 14 Apr 93 God's Promise in 1 John 1: 7,,,,,Cookamunga Tourist Bureau,,14,,...,,,,,,"In article <RN652B5w165w@tweekco.uucp>, alizar...",,[],[],
11080,sandvik@newton.apple.com (Kent Sandvik),Re: Disillusioned Protestant Finds Christ,,,,,Cookamunga Tourist Bureau,,23,,...,,,,,,"In article <C5KxDD.K4J@boi.hp.com>, jburrill@b...",,[],[],
11081,cutter@gloster.via.mind.org (cutter),Re: Biblical Backing of Koresh's 3-02 Tape (Ci...,,,,world,"Gordian Knot, Gloster,GA",,22,,...,,,,,,netd@susie.sbc.com () writes:\n\n> In article ...,,[],[],


### Regex
- From
- Subject
    - 如果 bmp 在这里面的话, 直接跳过 先不处理
- Summary
- Keywords
- Expires
- Distribution
- Organization
- Supersedes
- Lines

- Archive-name
- Alt-atheism-archive-name
- Last-modified
- Version

内容里面最长的词有多长, 如果太长了 超过100, 则认为是有乱码. 直接跳过?

In [None]:
写正则表达式
- 

In [126]:
string = t['text'][9]
string.split('\n')

['From: keith@cco.caltech.edu (Keith Allan Schneider)',
 'Subject: Re: >>>>>>Pompous ass',
 'Organization: California Institute of Technology, Pasadena',
 'Lines: 9',
 'NNTP-Posting-Host: punisher.caltech.edu',
 '',
 'kmr4@po.CWRU.edu (Keith M. Ryan) writes:',
 '',
 '>>Then why do people keep asking the same questions over and over?',
 '>Because you rarely ever answer them.',
 '',
 "Nope, I've answered each question posed, and most were answered multiple",
 'times.',
 '',
 'keith',
 '']

0        From: mathew <mathew@mantis.co.uk>\nSubject: A...
1        From: mathew <mathew@mantis.co.uk>\nSubject: A...
2        From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro...
3        From: mathew <mathew@mantis.co.uk>\nSubject: R...
4        From: strom@Watson.Ibm.Com (Rob Strom)\nSubjec...
                               ...                        
11078    From: psyrobtw@ubvmsd.cc.buffalo.edu (Robert W...
11079    From: sandvik@newton.apple.com (Kent Sandvik)\...
11080    From: sandvik@newton.apple.com (Kent Sandvik)\...
11081    From: cutter@gloster.via.mind.org (cutter)\nSu...
11082    Subject: Re: Albert Sabin\nFrom: rfox@charlie....
Name: text, Length: 11083, dtype: object

In [None]:
def info_extractor():

## Build global index
- for future information retrieve

In [19]:
# def parser():
data

['../data/train\\talk.religion.misc\\82757',
 '../data/train\\talk.religion.misc\\82758',
 '../data/train\\talk.religion.misc\\82759',
 '../data/train\\talk.religion.misc\\82760',
 '../data/train\\talk.religion.misc\\82763',
 '../data/train\\talk.religion.misc\\82766',
 '../data/train\\talk.religion.misc\\82767',
 '../data/train\\talk.religion.misc\\82771',
 '../data/train\\talk.religion.misc\\82772',
 '../data/train\\talk.religion.misc\\82774',
 '../data/train\\talk.religion.misc\\82775',
 '../data/train\\talk.religion.misc\\82776',
 '../data/train\\talk.religion.misc\\82777',
 '../data/train\\talk.religion.misc\\82778',
 '../data/train\\talk.religion.misc\\82779',
 '../data/train\\talk.religion.misc\\82781',
 '../data/train\\talk.religion.misc\\82782',
 '../data/train\\talk.religion.misc\\82783',
 '../data/train\\talk.religion.misc\\82784',
 '../data/train\\talk.religion.misc\\82785',
 '../data/train\\talk.religion.misc\\82786',
 '../data/train\\talk.religion.misc\\82787',
 '../data/

In [18]:
pd.data

['../data/train\\talk.religion.misc\\82757',
 '../data/train\\talk.religion.misc\\82758',
 '../data/train\\talk.religion.misc\\82759',
 '../data/train\\talk.religion.misc\\82760',
 '../data/train\\talk.religion.misc\\82763',
 '../data/train\\talk.religion.misc\\82766',
 '../data/train\\talk.religion.misc\\82767',
 '../data/train\\talk.religion.misc\\82771',
 '../data/train\\talk.religion.misc\\82772',
 '../data/train\\talk.religion.misc\\82774',
 '../data/train\\talk.religion.misc\\82775',
 '../data/train\\talk.religion.misc\\82776',
 '../data/train\\talk.religion.misc\\82777',
 '../data/train\\talk.religion.misc\\82778',
 '../data/train\\talk.religion.misc\\82779',
 '../data/train\\talk.religion.misc\\82781',
 '../data/train\\talk.religion.misc\\82782',
 '../data/train\\talk.religion.misc\\82783',
 '../data/train\\talk.religion.misc\\82784',
 '../data/train\\talk.religion.misc\\82785',
 '../data/train\\talk.religion.misc\\82786',
 '../data/train\\talk.religion.misc\\82787',
 '../data/