<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Package-import" data-toc-modified-id="Package-import-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Package import</a></span></li><li><span><a href="#Data-Loading" data-toc-modified-id="Data-Loading-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data Loading</a></span><ul class="toc-item"><li><span><a href="#parsing" data-toc-modified-id="parsing-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>parsing</a></span></li><li><span><a href="#combined" data-toc-modified-id="combined-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>combined</a></span><ul class="toc-item"><li><span><a href="#module-test" data-toc-modified-id="module-test-2.2.1"><span class="toc-item-num">2.2.1&nbsp;&nbsp;</span>module test</a></span></li></ul></li><li><span><a href="#attribute-selection" data-toc-modified-id="attribute-selection-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>attribute selection</a></span></li></ul></li><li><span><a href="#Feature-engineering" data-toc-modified-id="Feature-engineering-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Feature engineering</a></span></li><li><span><a href="#EDA" data-toc-modified-id="EDA-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>EDA</a></span><ul class="toc-item"><li><span><a href="#Regex" data-toc-modified-id="Regex-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Regex</a></span></li></ul></li><li><span><a href="#Build-global-index" data-toc-modified-id="Build-global-index-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Build global index</a></span></li></ul></div>

## Package import 

In [1]:
import pandas as pd 
import numpy as np
import os
from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
import mailparser
import re


## Data Loading
- from file into DataFrame

In [135]:
def load_data_folder(path):
    '''
    @param folders: the train or test directory
    @return: document list with [doc_path, doc, label, original_idx]
    '''
    folders = glob(path+'/**')  # explore all the folder under the directory

    docs = []
    for classes in folders:
        label = classes.split('\\')[-1]
        doc_paths = glob(classes+'\\**')
        
        for doc_path in doc_paths:
            original_idx = doc_path.split('\\')[-1]
            
            with open(doc_path, encoding='UTF-8') as f:
                text = f.read()
            docs.append([doc_path, text, label, original_idx])

    print(f'\nLoaded folder under {path}: \n')
    for folder in folders:
        print(folder)
        
    return docs


corpus_train_docs = load_data_folder(path='../data/train')
corpus_test_docs = load_data_folder(path='../data/test')


Loaded folder under ../data/train: 

../data/train\alt.atheism
../data/train\comp.graphics
../data/train\comp.os.ms-windows.misc
../data/train\comp.sys.ibm.pc.hardware
../data/train\comp.sys.mac.hardware
../data/train\comp.windows.x
../data/train\misc.forsale
../data/train\rec.autos
../data/train\rec.motorcycles
../data/train\rec.sport.baseball
../data/train\rec.sport.hockey
../data/train\sci.crypt
../data/train\sci.electronics
../data/train\sci.med
../data/train\sci.space
../data/train\soc.religion.christian
../data/train\talk.politics.guns
../data/train\talk.politics.mideast
../data/train\talk.politics.misc
../data/train\talk.religion.misc

Loaded folder under ../data/test: 

../data/test\alt.atheism
../data/test\comp.graphics
../data/test\comp.os.ms-windows.misc
../data/test\comp.sys.ibm.pc.hardware
../data/test\comp.sys.mac.hardware
../data/test\comp.windows.x
../data/test\misc.forsale
../data/test\rec.autos
../data/test\rec.motorcycles
../data/test\rec.sport.baseball
../data/test

### parsing

In [136]:
corpus_train = pd.DataFrame(corpus_train_docs, columns=['doc_path', 'text', 'label', 'original_idx'])
corpus_train = corpus_train.reset_index().rename(columns={'index':'global_index'})

corpus_test = pd.DataFrame(corpus_test_docs, columns=['doc_path', 'text', 'label', 'original_idx'])
corpus_test = corpus_test.reset_index().rename(columns={'index':'global_index'})

print("original_idx duplicate count:", corpus_train.shape[0] - corpus_train.original_idx.drop_duplicates().shape[0], ' on ', corpus_train.shape[0])
print("original_idx duplicate count:", corpus_test.shape[0] - corpus_test.original_idx.drop_duplicates().shape[0], ' on ', corpus_test.shape[0])

original_idx duplicate count: 1060  on  11083
original_idx duplicate count: 770  on  7761


In [946]:
def typo_parser(x):
    '''
    1. replace irrelevant symbol "|"
    2. remove extra space "  "
    3. replace extra \n "\n\n" into "\n"
    4. replace '> *>' into '>>' for further analysis
    
    @param string: email body string
    @return: cleaned email body string, extracted emails
    '''
    
    x = re.sub('\|', "", x)
    x = re.sub(' {2,}', " ", x)
    x = re.sub(' ?\n{1,} ?', '\n', x)
    x = re.sub('> *>', '>>', x)
    return x

In [840]:
def email_address_parser(string):
    '''
    extract and remove email from the body
    @param string: email body string
    @return: cleaned email body string, extracted emails
    '''
    emails = None
    emails = re.findall(" ?[\S]+@[\S]+ ?", string)
    string = re.sub(" ?[\S]+@[\S]+ ?", " ", string)
    return string, emails

In [883]:
def bytedata_parser(string, threshold=25):
    '''
    Since 99% of english words length ranged from [1,20], we set the threshold with 25.
    If length of span larger than threshold, then we will not treat it as a word. 
    sep can only use space
    '''
    bytedata = None
    clean_string = " ".join([word for word in re.split(" ", string) if len(word)<=threshold])
    bytedata = [word for word in re.split(" ", string) if len(word)>threshold]
    return clean_string, bytedata


In [933]:
def structure_parser(string):
    '''
    @param parser: email string
    @return: structural information for email header, body, others
    '''
    error_message = None
    header = {}
    body = ''
    others = []
    try:
        mail = mailparser.parse_from_string(string)
        if mail.has_defects:  # [first line error]
            remove_first_line_string = "\n".join(string.split('\n')[1:])
            mail = mailparser.parse_from_string(remove_first_line_string)
            # print('remove_first_line_string update for ')
        header, body = mail.headers, mail.body
        others = [mail.date, mail.delivered_to, mail.to_domains, error_message]

    except Exception as error:
        error_message = error
    return header, body, others



def reference_parser(string, match_type=2):
    '''
    Consider reply with referencing previous email, we need to separate them to make prediction separately.
    @param 
        string: email body string
        match_type: 0 with return only main body, 1 with return main body + previous one reference, 2 with more reference
    @return: 
        reply, previous_one, previous_two in the email
    
    
    @ test with the following code
    string = ' \n\n\n\n    >>>zero email \n\n >>first email\n >second email\n reply email \n'
    reply, previous_one, previous_two = reference_parser(string, match_type=2)
    print("## reply\n", repr(reply))
    print("## previous_one\n", repr(previous_one))
    print("## previous_two\n", repr(previous_two))
    '''
    
    previous_one, previous_two, reply = None, None, None

    # extract reply with out containing >
    reply = " ".join([s for s in string.split('\n') if '>' not in s]).strip()
    
    # add '\n' before string to matchign [^>]{1}
    if match_type>0:
        previous_one = " ".join(re.findall('[^>]{1}>{1}([^>]{1}[\S ]*)\n', '\n' + string)).strip() # matching >
    if match_type>1:
        previous_two = " ".join(re.findall('[^>]{1}>{2}([^>]{1}[\S ]*)\n' , '\n' + string)).strip() # matching >>
    
    # previous_two_more_pt = '[^>]{1}>{2,}([^>]{1}[\S ]*)\n' # matching >> or >>> more
    return reply, previous_one, previous_two


def structural_email(data):
    '''
    this is a parser pipeline, parser order matters.
    1. string => structure email to separate => header, body, others
    2. body => remove typo and some irrelevant words => body
    3. body => parse and remove email from body => body_no_email
    4. body_no_email => parse and remove binary data like BMP or picture from body => body_no_binary_no_email
    5. body_no_binary_no_email => separate email reference and reply => reply, previous_one, previous_two
    
    @param data: data text dataframe series including all the training set or test set
    @return: structural information
    '''
    print("Preprocessing for unstructure email...")
    header_info = []
    body_info = []
    others_info = []
    for string in tqdm(data):
        header, body, others = structure_parser(string)
        body = typo_parser(body)
        body_no_email, emails = email_address_parser(body)
        body_no_binary_no_email, bytedata = bytedata_parser(body_no_email, threshold=25)
        reply, previous_one, previous_two = reference_parser(body_no_binary_no_email, match_type=2)

        header_info.append(header)
        body_info.append([reply, previous_one, previous_two])
        others_info.append(others+[emails]+[bytedata])

    a1 = pd.DataFrame.from_dict(header_info)
    a2 = pd.DataFrame(body_info, columns=['reply', 'reference_one', 'reference_two'])
    a3 = pd.DataFrame(others_info, columns=['date', 'delivered_to', 'to_domains', 'error_message', 'contained_emails', 'long_string'])
    structure_email = pd.concat([a1, a2, a3], axis=1)
    return structure_email

### combined 

In [886]:
structural_train = structural_email(corpus_train['text'])
structural_test = structural_email(corpus_test['text'])

  1%|▍                                                                             | 58/11083 [00:00<00:19, 579.75it/s]

Preprocessing for unstructure email...


 53%|████████████████████████████████████████▎                                   | 5886/11083 [00:07<00:06, 816.76it/s]Email content 'x-usenet-faq' not handled
Email content 'x-usenet-faq' not handled
Email content 'x-usenet-faq' not handled
100%|███████████████████████████████████████████████████████████████████████████| 11083/11083 [00:14<00:00, 746.15it/s]
  2%|█▍                                                                            | 145/7761 [00:00<00:10, 717.62it/s]

Preprocessing for unstructure email...


100%|█████████████████████████████████████████████████████████████████████████████| 7761/7761 [00:10<00:00, 738.43it/s]


In [937]:
structural_train['reference_one'].sample(5).tolist()

["... \n>> >Are all truths also absolutes? Is all of scripture truths (and therefore absolutes)? \n>> The answer to both questions is yes. Perhaps we have different definitions of absolute then.  To me, situations, etc.  True in every instance possible.  Do you agree \n>> Similarly, all truth is absolute.  Indeed, a non-absolute truth is a Evangelicals are clearly not taking this particular part of scripture Can you reconcile this? \n>I don't claim that there are *no* absolutes.  I think there are very \n>> >There is hardly consensus, even in evangelical Christianity (not to mention the rest of Christianity) regarding Biblical interpretation.",
 'I do not have finger!!! So is there any other way of accessing this service',
 "Hi.I'm a Turkish guy who had tried atheism,satenism and buddism at some instant  write here.From my point of view,you atheists are people who has dropped to a  if you had looked a little bit upward you would see the blue skies.You'dsee t  ager.now,let's generate so

In [897]:
structural_train['reference_two']

0                                                         
1                                                         
2        Well, John has a quite different, not necessar...
3                                                         
4                                                         
                               ...                        
11078                                                     
11079                                                     
11080                                                     
11081                                                     
11082    |> Perhaps you should read it and stop advanci...
Name: reference_two, Length: 11083, dtype: object

In [889]:
structural_train

Unnamed: 0,From,Subject,Summary,Keywords,Expires,Distribution,Organization,Supersedes,Lines,X-Newsreader,...,Oanization,reply,reference_one,reference_two,date,delivered_to,to_domains,error_message,contained_emails,long_string
0,mathew <mathew@mantis.co.uk>,Alt.Atheism FAQ: Atheist Resources,"Books, addresses, music -- anything related to...","FAQ, atheism, books, music, fiction, addresses...","Thu, 29 Apr 1993 11:57:19 GMT",world,"Mantis Consultants, Cambridge. UK.",<19930301143317@mantis.co.uk>,290,,...,,Archive-name: resources Last-modified: 11 Dece...,,,,[],[],,"[ <figmo@netcom.com>. , archive-server@mantis...","[atheism/resources\nAlt-atheism-archive-name:,..."
1,mathew <mathew@mantis.co.uk>,Alt.Atheism FAQ: Introduction to Atheism,Please read this file before posting to alt.at...,"FAQ, atheism","Thu, 6 May 1993 12:22:45 GMT",world,"Mantis Consultants, Cambridge. UK.",<19930308134439@mantis.co.uk>,646,,...,,Archive-name: 5 April 1993 Version: 1.2 -----B...,,,,[],[],,"[ <mathew@mantis.co.uk>, pgpinfo@mantis.co.uk.]",[atheism/introduction\nAlt-atheism-archive-nam...
2,I3150101@dbstu1.rz.tu-bs.de (Benedikt Rosenau),Re: Gospel Dating,,,,,"Technical University Braunschweig, Germany",,93,,...,,In article (Charley Wingate) writes: The arg...,>This is a new argument to me. Could you elab...,"Well, John has a quite different, not necessar...",,[],[],,"[ <65974@mimsy.umd.edu>, mangoe@cs.umd.edu ]","[discovered.\n>\n>>Interesting,]"
3,mathew <mathew@mantis.co.uk>,Re: university violating separation of church/...,,,,,"Mantis Consultants, Cambridge. UK.",,29,rusnews v1.01,...,,(...until kings become philosophers or philoso...,"Recently, RAs have been ordered (and none have...",,,[],[],,[dmn@kepler.unh.edu ],[]
4,strom@Watson.Ibm.Com (Rob Strom),"Re: [soc.motss, et al.] ""Princeton axes matchi...",,,,usa,IBM Research,,15,,...,,In article (Bob McGwier) writes: Can somebody...,"[1] HOWEVER, I hate economic terrorism and pol...",,,[],[],,"[ <N4HY.93Apr5120934@harder.ccr-p.ida.org>, , ...",[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11078,psyrobtw@ubvmsd.cc.buffalo.edu (Robert Weiss),18 Apr 93 God's Promise in Philippians 4:9,,,,,University at Buffalo,,8,,...,,"Those things, \twhich ye have both learned, an...",,,,[],[],,[],[]
11079,sandvik@newton.apple.com (Kent Sandvik),Re: 14 Apr 93 God's Promise in 1 John 1: 7,,,,,Cookamunga Tourist Bureau,,14,,...,,"In article (A.Lizard) wrote: Please, please d...",Judging from postings I've read all over Usene...,,,[],[],,"[ <RN652B5w165w@tweekco.uucp>, , alizard@tweek...",[]
11080,sandvik@newton.apple.com (Kent Sandvik),Re: Disillusioned Protestant Finds Christ,,,,,Cookamunga Tourist Bureau,,23,,...,,"In article (Jim Burrill) wrote: Jim, please, ...",If Jesus never taught the concept of the Trini...,,,[],[],,"[ <C5KxDD.K4J@boi.hp.com>, , jburrill@boi.hp.c...",[]
11081,cutter@gloster.via.mind.org (cutter),Re: Biblical Backing of Koresh's 3-02 Tape (Ci...,,,,world,"Gordian Knot, Gloster,GA",,22,,...,,() writes: And I think we ought to hold Christ...,In article (stephen For those who think David...,,,[],[],,"[netd@susie.sbc.com , <20APR199301460499@utar...",[Gladiators.\n\n\n----------------------------...


In [899]:
train

Unnamed: 0,label,body,global_index,doc_path,Subject,From,Lines,Organization,text
0,alt.atheism,Archive-name: atheism/resources\nAlt-atheism-a...,0,../data/train\alt.atheism\49960,Alt.Atheism FAQ: Atheist Resources,mathew <mathew@mantis.co.uk>,290,"Mantis Consultants, Cambridge. UK.",From: mathew <mathew@mantis.co.uk>\nSubject: A...
1,alt.atheism,Archive-name: atheism/introduction\nAlt-atheis...,1,../data/train\alt.atheism\51060,Alt.Atheism FAQ: Introduction to Atheism,mathew <mathew@mantis.co.uk>,646,"Mantis Consultants, Cambridge. UK.",From: mathew <mathew@mantis.co.uk>\nSubject: A...
2,alt.atheism,In article <65974@mimsy.umd.edu>\nmangoe@cs.um...,2,../data/train\alt.atheism\51119,Re: Gospel Dating,I3150101@dbstu1.rz.tu-bs.de (Benedikt Rosenau),93,"Technical University Braunschweig, Germany",From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro...
3,alt.atheism,dmn@kepler.unh.edu (...until kings become phil...,3,../data/train\alt.atheism\51120,Re: university violating separation of church/...,mathew <mathew@mantis.co.uk>,29,"Mantis Consultants, Cambridge. UK.",From: mathew <mathew@mantis.co.uk>\nSubject: R...
4,alt.atheism,In article <N4HY.93Apr5120934@harder.ccr-p.ida...,4,../data/train\alt.atheism\51121,"Re: [soc.motss, et al.] ""Princeton axes matchi...",strom@Watson.Ibm.Com (Rob Strom),15,IBM Research,From: strom@Watson.Ibm.Com (Rob Strom)\nSubjec...
...,...,...,...,...,...,...,...,...,...
11078,talk.religion.misc,"\n\tThose things,\n\twhich ye have both learne...",11078,../data/train\talk.religion.misc\84444,18 Apr 93 God's Promise in Philippians 4:9,psyrobtw@ubvmsd.cc.buffalo.edu (Robert Weiss),8,University at Buffalo,From: psyrobtw@ubvmsd.cc.buffalo.edu (Robert W...
11079,talk.religion.misc,"In article <RN652B5w165w@tweekco.uucp>, alizar...",11079,../data/train\talk.religion.misc\84445,Re: 14 Apr 93 God's Promise in 1 John 1: 7,sandvik@newton.apple.com (Kent Sandvik),14,Cookamunga Tourist Bureau,From: sandvik@newton.apple.com (Kent Sandvik)\...
11080,talk.religion.misc,"In article <C5KxDD.K4J@boi.hp.com>, jburrill@b...",11080,../data/train\talk.religion.misc\84446,Re: Disillusioned Protestant Finds Christ,sandvik@newton.apple.com (Kent Sandvik),23,Cookamunga Tourist Bureau,From: sandvik@newton.apple.com (Kent Sandvik)\...
11081,talk.religion.misc,netd@susie.sbc.com () writes:\n\n> In article ...,11081,../data/train\talk.religion.misc\84507,Re: Biblical Backing of Koresh's 3-02 Tape (Ci...,cutter@gloster.via.mind.org (cutter),22,"Gordian Knot, Gloster,GA",From: cutter@gloster.via.mind.org (cutter)\nSu...


In [671]:
train = pd.concat([corpus_train, structural_train], axis=1)
test = pd.concat([corpus_test, structural_test], axis=1)

#### module test

In [900]:
def checking_text(idx, write_in_local=True):
    x = train[train['global_index'] == idx]
    string = x['text'].iloc[0]
    body = x['body'].iloc[0]
    x_path = x['doc_path'].iloc[0]
    x_label = x['label'].iloc[0]
    
    if write_in_local:
        with open('E:/wyang_github/Text-Classification/text.txt', 'w', encoding='utf-8') as f:
            f.write(x_label+'\n\n')
            f.write(x_path+'\n\n')
            f.write(string)
    return string, body, x_path, x_label


# idx = 22
idx = 11082

string, body, x_path, x_label = checking_text(idx)
# header, body, others = structure_parser(string)

body_no_email, emails = email_address_parser(body)
print("\nrepr(body):   \n", repr(body))
print("\nrepr(emails):   \n", repr(emails))
print("\nrepr(body_no_email):   \n", repr(body_no_email))

body_no_binary_no_email, bytedata = bytedata_parser(body_no_email, threshold=25)
print("\nrepr(bytedata):   \n", repr(bytedata))
print("\nrepr(body_no_binary_no_email):   \n", repr(body_no_binary_no_email))

reply, previous_one, previous_two = reference_parser(body_no_binary_no_email, match_type=2)

print("\nrepr(reply):   \n", repr(reply))
print("\nrepr(previous_one):   \n", repr(previous_one))
print("\nrepr(previous_two):   \n", repr(previous_two))


repr(body):   
 'In article <1993Apr15.012537.26867@nntpd2.cxo.dec.com>, sharpe@nmesis.enet.dec.com (System PRIVILEGED Account) writes:\n>\n>In article <C5FtJt.885@sunfish.usd.edu>, rfox@charlie.usd.edu (Rich Fox, Univ of South Dakota) writes:\n>|>\n>|>In article <1993Apr10.213547.17644@rambo.atlanta.dg.com>, wpr@atlanta.dg.com (Bill Rawlins) writes:\n>|>\n>|>[earlier dialogue deleted]\n>|>\n>|>>|> Perhaps you should read it and stop advancing the Bible as evidence relating \n>|>>|> to questions of science.  \n>|>\n>|>[it = _Did Jesus exist?_ by G. A. Wells]\n>|>\n>|>>     There is a great fallacy in your statement. The question of origins is\n>|>>     based on more than science alone.  \n>|>\n>|>Nope, no fallacy.  Yep, science is best in determining how; religions handle\n>|>why and who.\n>|>\n>\n>Rich, I am curious as to why you and others award custody of the baby to\n>theists and religion?\n\nI hope I didn\'t award custody, Rich.  I purposely used "handle" in order to \navoid doin

In [896]:

def reference_parser(string, match_type=2):
    '''
    Consider reply with referencing previous email, we need to separate them to make prediction separately.
    @param 
        string: email body string
        match_type: 0 with return only main body, 1 with return main body + previous one reference, 2 with more reference
    @return: 
        reply, previous_one, previous_two in the email
    
    
    @ test with the following code
    string = ' \n\n\n\n    >>>zero email \n\n >>first email\n >second email\n reply email \n'
    reply, previous_one, previous_two = reference_parser(string, match_type=2)
    print("## reply\n", repr(reply))
    print("## previous_one\n", repr(previous_one))
    print("## previous_two\n", repr(previous_two))
    '''
    # replace \n\n as \n if we dont need to separate paragraph to analysis
    # add \n before string for >> matching.  
    previous_one, previous_two, reply = None, None, None
    string = re.sub(' ?\n{1,} ?', '\n', '\n' + string) 
    
    # extract reply with out containing >
    reply = " ".join([s for s in string.split('\n') if '>' not in s]).strip()
    
    if match_type>0:
        previous_one = " ".join(re.findall('[^>]{1}>{1}([^>]{1}[\S ]*)\n', string)).strip() # matching >
    if match_type>1:
        previous_two = " ".join(re.findall('[^>]{1}>{2}([^>]{1}[\S ]*)\n' , string)).strip() # matching >>
    
    # previous_two_more_pt = '[^>]{1}>{2,}([^>]{1}[\S ]*)\n' # matching >> or >>> more
    return reply, previous_one, previous_two


In [880]:
string = body_no_binary_no_email
# string = body
string = re.sub(' ?\n{1,} ?', '\n', '\n' + string) 

# extract reply with out containing >
reply = " ".join([s for s in string.split('\n') if '>' not in s]).strip()

In [881]:
string

'\nIn article  (Bob McGwier) writes:  |> [1] HOWEVER, I hate economic terrorism and political correctness |> worse than I hate this policy.     |> [2] A more effective approach is to stop donating |> to ANY organizating that directly or indirectly supports gay rights issues |> until they end the boycott on funding of scouts.    Can somebody reconcile the apparent contradiction between [1] and [2]?  --  Rob Strom, (914) 784-7641 IBM Research, 30 Saw Mill River Road, P.O. Box 704, Yorktown Heights, NY  10598 '

In [882]:
string.split('\n')

['',
 'In article  (Bob McGwier) writes:  |> [1] HOWEVER, I hate economic terrorism and political correctness |> worse than I hate this policy.     |> [2] A more effective approach is to stop donating |> to ANY organizating that directly or indirectly supports gay rights issues |> until they end the boycott on funding of scouts.    Can somebody reconcile the apparent contradiction between [1] and [2]?  --  Rob Strom, (914) 784-7641 IBM Research, 30 Saw Mill River Road, P.O. Box 704, Yorktown Heights, NY  10598 ']

In [873]:
reply

'Can somebody reconcile the apparent contradiction between [1] and [2]? -- Rob Strom, strom@watson.ibm.com, (914) 784-7641 IBM Research, 30 Saw Mill River Road, P.O. Box 704, Yorktown Heights, NY  10598'

### attribute selection

In [672]:
t = train.isnull().sum().sort_values()
not_used_cols = t[t > train.shape[0]*0.1].index.tolist() 
print("not_used_cols: \n", not_used_cols)

select_cols = ['label', 'body', 'global_index', 'doc_path',
               'Subject', 'From', 'Lines', 'Organization', 'text']
print('may use cols: \n', select_cols)

not_used_cols: 
may use cols: 
 ['label', 'body', 'global_index', 'doc_path', 'Subject', 'From', 'Lines', 'Organization', 'text']


In [673]:
train = train[select_cols].dropna(subset=['body'])
test = test[select_cols]

In [674]:
train.head()

Unnamed: 0,label,body,global_index,doc_path,Subject,From,Lines,Organization,text
0,alt.atheism,Archive-name: atheism/resources\nAlt-atheism-a...,0,../data/train\alt.atheism\49960,Alt.Atheism FAQ: Atheist Resources,mathew <mathew@mantis.co.uk>,290,"Mantis Consultants, Cambridge. UK.",From: mathew <mathew@mantis.co.uk>\nSubject: A...
1,alt.atheism,Archive-name: atheism/introduction\nAlt-atheis...,1,../data/train\alt.atheism\51060,Alt.Atheism FAQ: Introduction to Atheism,mathew <mathew@mantis.co.uk>,646,"Mantis Consultants, Cambridge. UK.",From: mathew <mathew@mantis.co.uk>\nSubject: A...
2,alt.atheism,In article <65974@mimsy.umd.edu>\nmangoe@cs.um...,2,../data/train\alt.atheism\51119,Re: Gospel Dating,I3150101@dbstu1.rz.tu-bs.de (Benedikt Rosenau),93,"Technical University Braunschweig, Germany",From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro...
3,alt.atheism,dmn@kepler.unh.edu (...until kings become phil...,3,../data/train\alt.atheism\51120,Re: university violating separation of church/...,mathew <mathew@mantis.co.uk>,29,"Mantis Consultants, Cambridge. UK.",From: mathew <mathew@mantis.co.uk>\nSubject: R...
4,alt.atheism,In article <N4HY.93Apr5120934@harder.ccr-p.ida...,4,../data/train\alt.atheism\51121,"Re: [soc.motss, et al.] ""Princeton axes matchi...",strom@Watson.Ibm.Com (Rob Strom),15,IBM Research,From: strom@Watson.Ibm.Com (Rob Strom)\nSubjec...


In [257]:
t = train[train['body'].apply(lambda x: 'In article' in x)]
# t.groupby('label').size()

In [242]:
t

Unnamed: 0,label,body,global_index,doc_path,Subject,From,Lines,Organization
2,alt.atheism,In article <65974@mimsy.umd.edu>\nmangoe@cs.um...,2,../data/train\alt.atheism\51119,Re: Gospel Dating,I3150101@dbstu1.rz.tu-bs.de (Benedikt Rosenau),93,"Technical University Braunschweig, Germany"
4,alt.atheism,In article <N4HY.93Apr5120934@harder.ccr-p.ida...,4,../data/train\alt.atheism\51121,"Re: [soc.motss, et al.] ""Princeton axes matchi...",strom@Watson.Ibm.Com (Rob Strom),15,IBM Research
5,alt.atheism,In article <1993Apr5.091139.823@batman.bmd.trw...,5,../data/train\alt.atheism\51122,Re: A visit from the Jehovah's Witnesses,I3150101@dbstu1.rz.tu-bs.de (Benedikt Rosenau),114,"Technical University Braunschweig, Germany"
7,alt.atheism,In article <114127@bu.edu>\njaeger@buphy.bu.ed...,7,../data/train\alt.atheism\51124,Re: An Anecdote about Islam,I3150101@dbstu1.rz.tu-bs.de (Benedikt Rosenau),28,"Technical University Braunschweig, Germany"
16,alt.atheism,In article <2942881697.0.p00168@psilink.com> p...,16,../data/train\alt.atheism\51134,Re: Don't more innocents die without the death...,bobbe@vice.ICO.TEK.COM (Robert Beauchaine),26,"Tektronix, Inc., Beaverton, OR."
...,...,...,...,...,...,...,...,...
11077,talk.religion.misc,"Okay, I'll bite. I should probably leave this ...",11077,../data/train\talk.religion.misc\84443,Re: Flaming Nazis,deane@binah.cc.brandeis.edu (David Matthew Deane),106,Brandeis University
11079,talk.religion.misc,"In article <RN652B5w165w@tweekco.uucp>, alizar...",11079,../data/train\talk.religion.misc\84445,Re: 14 Apr 93 God's Promise in 1 John 1: 7,sandvik@newton.apple.com (Kent Sandvik),14,Cookamunga Tourist Bureau
11080,talk.religion.misc,"In article <C5KxDD.K4J@boi.hp.com>, jburrill@b...",11080,../data/train\talk.religion.misc\84446,Re: Disillusioned Protestant Finds Christ,sandvik@newton.apple.com (Kent Sandvik),23,Cookamunga Tourist Bureau
11081,talk.religion.misc,netd@susie.sbc.com () writes:\n\n> In article ...,11081,../data/train\talk.religion.misc\84507,Re: Biblical Backing of Koresh's 3-02 Tape (Ci...,cutter@gloster.via.mind.org (cutter),22,"Gordian Knot, Gloster,GA"


In [255]:
print(string)

From: dbd@urartu.sdpa.org (David Davidian)
Subject: FORGED POSTING -- FORGED POSTING -- FORGED POSTING
Summary: usually generated by those who can't live with themselves! 
Organization: S.D.P.A. Center for Regional Studies
Lines: 37


THE FOLLOWING POSTING WAS FORGED IN MY NAME! PLEASE IGNORE SUCH POSTINGS!

[FORGED] Newsgroups:soc.culture.turkish,talk.politics.mideast,talk.politics.
[FORGED] soviet,soc.culture.greek
[FORGED] From: dbd@urartu.sdpa.org (David Davidian)
[FORGED] News-Software: VAX/VMS VNEWS 1.41    
[FORGED] Organization: University of Tennessee Computing Center
[FORGED] Date: Fri, 16 Apr 1993 21:36:00 GMT
[FORGED] Lines: 293
[FORGED]
[FORGED] Dear friends,
[FORGED]
[FORGED] I am a graduate student in Education at the University of Tennessee. 
[FORGED]
  .
  .
  .
[FORGED]
[FORGED]
[FORGED]                         __QUESTIONNAIRE__
[FORGED]                  Teaching Music for deaf children.
[FORGED]
[FORGED] NAME ________________________________
[FORGED] ADDRESS/ E-MAIL 

## Feature engineering

In [210]:
train[['body', 'doc_path', 'label']].sample(5).style

Unnamed: 0,body,doc_path,label
7409,"In article <19613@pitt.UUCP>, geb@cs.pitt.edu (Gordon Banks) writes: > In article <1993Apr7.221357.12533@lamont.ldgo.columbia.edu> brenner@ldgo.columbia.edu (carl brenner) writes: > >> see the ulterior motive here. It is easy for me to see it the > >> those physicians who call everything lyme and treat everything. > >> There is a lot of money involved. > > > >	You keep bringing this up. But I don't understand what's in it > >financially for the physician to go ahead and treat. Unless the physician > >has an investment in (or is involved in some kickback scheme with) the > >home infusion company, where is the financial gain for the doctor? > > Well, let me put it this way, based on my own experience. A > general practitioner with no training in infectious diseases, > by establishing links to the ""Lyme community"", treating patients > who come to him wondering about lyme or having decided they > have lyme as if they did, saying that diseases such as MS > are probably spirochetal, if not Lyme, giving talks at meetings > of users groups, validating the feelings of even delusional > patients, etc. This GP can go from being a run-of-the-mill > $100K/yr GP to someone with lots of patients in the hospital > and getting expensive infusions that need monitoring in his > office, and making lots of bread. Also getting the adulation > of many who believe his is their only hope (if not of cure, > then of control) and seeing his name in publications put out > by support groups, etc. This is a definite temptation. 	Harumph. Getting published in these newsletters is hardly something to aspire to. :-) 	I can't really argue with your logic, though I think you may be extrapolating a bit recklessly from what appears to be a sample size of one. Even if what you say about this local Pittsburgh guy is true, it is not logical or fair to conclude that this is true of all doctors who treat Lyme disease. 	By your logic, I could conclude that all of the physicians who consult for insurance companies and make money by denying benefits to Lyme patients are doing it for the money, rather than because they believe they are encouraging good medicine. I have no idea how sincere these guys are, but their motives are as suspect as the physicians you excoriate for what you believe to be indiscriminate treatment. 	I would really feel more comfortable discussing the medical issues in Lyme, rather than speculating as to the motives of the various parties involved. > ---------------------------------------------------------------------------- > Gordon Banks N3JXP | ""Skepticism is the chastity of the intellect, and > geb@cadre.dsl.pitt.edu | it is shameful to surrender it too soon."" > ---------------------------------------------------------------------------- Carl Brenner",../data/train\sci.med\58904,sci.med
8975,"I've been thinking about the idea that was raised (by Michael Covington, I think) that words mean what we think they mean, regardless of etymology. I've been reflecting on what certain words meant in my childhood and tracing how this shaped some of my attitudes. I grew up in a home where Christ was a bad word. People who were very angry said it. The word Christian meant someone who was not a Jew. It carried connotations of otherness, of threat, of enemy. It took some time to figure out that there was a connection between `Christ' and `Christian'. When I accepted Jesus, I expected to be disowned. To become a Christian meant to join the enemy. I knew others would consider me a traitor. At some level, I agreed, but was still prepared to pay this price. Like Esau, I sold my birthright. However, I made a better bargain. He only got some stew, but I got the incomparable riches of knowing Christ. As it turned out, my parents did not disown me. I found out later that they were hoping it was a phase that I would grow out of. By the time they had decided it wasn't a phase, they were sort of used to it. They didn't disown me but they didn't completely accept the situation either. For example, they didn't come to my wedding because it was in a church. When I visited my grandmother in the hospital a few days before her death, she said to me, ""As far as I'm concerned, you still are a Jew."" What she meant was that she loved me and forgave me. But I am not a Jew. I am a Christian. (I'll concede, one that likes chicken soup with matzoh balls.:-)) I do not keep kosher. I do not celebrate the Sabbath on Saturday. My sons are not circumcised. But these things are true of some people who do consider themselves Jews. It is not these rules that make people Jews; it is the heritage from the past. I gave up the past. This is why I find it hard to relate to Messianic Jews. Their experience is unlike mine. They still consider themselves Jews while following Jesus. Some would even say that I *must* do so, too. I am at a stage of my life now where I would like to have a heritage. It was not something I valued very much when I gave it. But I did have a sense that I was giving it for God. It may have been a small sacrifice. It may have been an unnecessary sacrifice. But I gave it and do not want to ask for it back. And while I don't have the heritage I was born with, I do have another. I am an outcast from the house of Israel, but I am a member of the Church. One of the things I like about being a Catholic Christian is that it is rich in tradition. It gives me a feeling of, once again, being rooted in the past. This is probably one of the reasons why I don't like it when people mess around with Christian traditions (for example, changing the name of Easter). These traditions fill an important emotional need of mine. I suppose the point of all this is that people shouldn't assume that all believers of Jewish background are the same. For some `Jewish Christian' is a good name, for others it is an oxymoron. Jayne Kulikauskas/jayen@mmalt.guild.org",../data/train\soc.religion.christian\20925,soc.religion.christian
10057,"Mr. Freeman: Please find something more constructive to do with your time rather than engaging in fantasy..... Not that I have a particular affinty to Arafat or anything. John ""Marlow ceased, and sat apart, indistinct and silent, in the pose of a  meditating Buddha. Nobody moved for a time...The offing was barred by  a black bank of clouds, and the tranquil waterway leading to the utter-  most ends of the earth flowed sombre under an overcast sky - seemed to",../data/train\talk.politics.mideast\76251,talk.politics.mideast
1287,"In article <1993Apr17.023017.17301@gmuvax2.gmu.edu> rwang@gmuvax2.gmu.edu writes:  > > Hi, everybody:  > I guess my subject has said it all. It is getting boring  > looking at those same old bmp files that came with Windows. So,  > I am wondering if there is any body has some beautiful bmp file  > I can share. Or maybe somebody can tell me some ftp site for  > some bmp files, like some scenery files, some animals files,  > etc.... I used to have some, unfortunately i delete them all.  I downloaded the CompuServe GIF of the month. A raytraced image of a golf ball next to a hole. Very nice, 640x480x256 bitmap, easily converted to a Windows BMP. If anyone wants, I could upload a copy on Cica... Eric -- +------------------------+----------------------------+------------------+ | Eric Trepanier | Internet: eric@tgm.CAM.ORG | CI$: 71042,3207 | | 55 Grenon O. +----------------------------+------------------+ | Laval (Quebec) H7N 5M3 | Everybody has a right to believe in something | | Canada / (514)663-6929 | I believe I'll have another beer! | +------------------------+-----------------------------------------------+",../data/train\comp.os.ms-windows.misc\9624,comp.os.ms-windows.misc
171,"In article <66014@mimsy.umd.edu> mangoe@cs.umd.edu (Charley Wingate) writes:  >>And what about that revelation thing, Charley? > >If you're talking about this intellectual engagement of revelation, well, >it's obviously a risk one takes. >  I see, it is not rational, but it is intellectual. Does madness qualify as intellectual engagement, too?  >>Many people say that the concept of metaphysical and religious knowledge >>is contradictive. > >I'm not an objectivist, so I'm not particularly impressed with problems of >conceptualization. The problem in this case is at least as bad as that of >trying to explain quantum mechanics and relativity in the terms of ordinary >experience. One can get some rough understanding, but the language is, from >the perspective of ordinary phenomena, inconsistent, and from the >perspective of what's being described, rather inexact (to be charitable). >  Exactly why science uses mathematics. QM representation in natural language is not supposed to replace the elaborate representation in mathematical terminology. Nor is it supposed to be the truth, as opposed to the representation of gods or religions in ordinary language. Admittedly, not every religion says so, but a fancy side effect of their inept representations are the eternal hassles between religions.  And QM allows for making experiments that will lead to results that will be agreed upon as being similar. Show me something similar in religion.  >An analogous situation (supposedly) obtains in metaphysics; the problem is >that the ""better"" descriptive language is not available. >  With the effect that the models presented are useless. And one can argue that the other way around, namely that the only reason metaphysics still flourish is because it makes no statements that can be verified or falsified - showing that it is bogus.  >>And in case it holds reliable information, can you show how you establish >>that? > >This word ""reliable"" is essentially meaningless in the context-- unless you >can show how reliability can be determined.  Haven't you read the many posts about what reliability is and how it can be acheived respectively determined?  Benedikt",../data/train\alt.atheism\51300,alt.atheism


## EDA

In [10]:
train['char_length'] = train['text'].apply(lambda x:len(x))

# c = (train['char_length'].sort_values())
# sent_cdf = c.cumsum()
# sent_pdf = c # / c.sum()
# sent_pdf.plot(kind='hist', bins=100)
# plt.xlabel("char_length")  # 需要先sort, 才能说是index of chars.
# plt.ylabel("char_cum_counts_perc")
# plt.title("MAX_DOC_LEN CDF")
# plt.show()

In [11]:
train.sort_values(by='char_length')

Unnamed: 0,global_index,doc_path,text,label,original_idx,char_length
494,494,../data/train\comp.graphics\37928,From: hl7204@eehp22 (H L)\nSubject: Re: Graphi...,comp.graphics,37928,125
2402,2402,../data/train\comp.sys.mac.hardware\51522,From: kwgst+@pitt.edu (Mr. Someone)\nSubject: ...,comp.sys.mac.hardware,51522,125
4039,4039,../data/train\rec.autos\101589,From: swdwan@napier.uwaterloo.ca (Donald Wan)\...,rec.autos,101589,131
2421,2421,../data/train\comp.sys.mac.hardware\51541,From: news@news.claremont.edu (The News System...,comp.sys.mac.hardware,51541,139
3595,3595,../data/train\misc.forsale\75911,From: ibeshir@nyx.cs.du.edu (Ibrahim)\nSubject...,misc.forsale,75911,142
...,...,...,...,...,...,...
10176,10176,../data/train\talk.politics.mideast\76392,From: dbd@urartu.sdpa.org (David Davidian)\nSu...,talk.politics.mideast,76392,62187
1362,1362,../data/train\comp.os.ms-windows.misc\9704,From: james@dlss2 (James Cummings)\nSubject: R...,comp.os.ms-windows.misc,9704,63095
2826,2826,../data/train\comp.windows.x\66322,From: ware@cis.ohio-state.edu (Peter Ware)\nSu...,comp.windows.x,66322,66459
9625,9625,../data/train\talk.politics.guns\54684,From: mjp@watson.ibm.com (Michael Phelps)\nSub...,talk.politics.guns,54684,71400


Unnamed: 0,From,Subject,Summary,Keywords,Expires,Distribution,Organization,Supersedes,Lines,X-Newsreader,...,Followups-to,X-Newsposter,X-Header,X-Cc,Oanization,body,date,delivered_to,to_domains,error_message
0,mathew <mathew@mantis.co.uk>,Alt.Atheism FAQ: Atheist Resources,"Books, addresses, music -- anything related to...","FAQ, atheism, books, music, fiction, addresses...","Thu, 29 Apr 1993 11:57:19 GMT",world,"Mantis Consultants, Cambridge. UK.",<19930301143317@mantis.co.uk>,290,,...,,,,,,Archive-name: atheism/resources\nAlt-atheism-a...,,[],[],
1,mathew <mathew@mantis.co.uk>,Alt.Atheism FAQ: Introduction to Atheism,Please read this file before posting to alt.at...,"FAQ, atheism","Thu, 6 May 1993 12:22:45 GMT",world,"Mantis Consultants, Cambridge. UK.",<19930308134439@mantis.co.uk>,646,,...,,,,,,Archive-name: atheism/introduction\nAlt-atheis...,,[],[],
2,I3150101@dbstu1.rz.tu-bs.de (Benedikt Rosenau),Re: Gospel Dating,,,,,"Technical University Braunschweig, Germany",,93,,...,,,,,,In article <65974@mimsy.umd.edu>\nmangoe@cs.um...,,[],[],
3,mathew <mathew@mantis.co.uk>,Re: university violating separation of church/...,,,,,"Mantis Consultants, Cambridge. UK.",,29,rusnews v1.01,...,,,,,,dmn@kepler.unh.edu (...until kings become phil...,,[],[],
4,strom@Watson.Ibm.Com (Rob Strom),"Re: [soc.motss, et al.] ""Princeton axes matchi...",,,,usa,IBM Research,,15,,...,,,,,,In article <N4HY.93Apr5120934@harder.ccr-p.ida...,,[],[],
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11078,psyrobtw@ubvmsd.cc.buffalo.edu (Robert Weiss),18 Apr 93 God's Promise in Philippians 4:9,,,,,University at Buffalo,,8,,...,,,,,,"\n\tThose things,\n\twhich ye have both learne...",,[],[],
11079,sandvik@newton.apple.com (Kent Sandvik),Re: 14 Apr 93 God's Promise in 1 John 1: 7,,,,,Cookamunga Tourist Bureau,,14,,...,,,,,,"In article <RN652B5w165w@tweekco.uucp>, alizar...",,[],[],
11080,sandvik@newton.apple.com (Kent Sandvik),Re: Disillusioned Protestant Finds Christ,,,,,Cookamunga Tourist Bureau,,23,,...,,,,,,"In article <C5KxDD.K4J@boi.hp.com>, jburrill@b...",,[],[],
11081,cutter@gloster.via.mind.org (cutter),Re: Biblical Backing of Koresh's 3-02 Tape (Ci...,,,,world,"Gordian Knot, Gloster,GA",,22,,...,,,,,,netd@susie.sbc.com () writes:\n\n> In article ...,,[],[],


### Regex
- From
- Subject
    - 如果 bmp 在这里面的话, 直接跳过 先不处理
- Summary
- Keywords
- Expires
- Distribution
- Organization
- Supersedes
- Lines

- Archive-name
- Alt-atheism-archive-name
- Last-modified
- Version

内容里面最长的词有多长, 如果太长了 超过100, 则认为是有乱码. 直接跳过?

In [None]:
写正则表达式
- 

In [126]:
string = t['text'][9]
string.split('\n')

['From: keith@cco.caltech.edu (Keith Allan Schneider)',
 'Subject: Re: >>>>>>Pompous ass',
 'Organization: California Institute of Technology, Pasadena',
 'Lines: 9',
 'NNTP-Posting-Host: punisher.caltech.edu',
 '',
 'kmr4@po.CWRU.edu (Keith M. Ryan) writes:',
 '',
 '>>Then why do people keep asking the same questions over and over?',
 '>Because you rarely ever answer them.',
 '',
 "Nope, I've answered each question posed, and most were answered multiple",
 'times.',
 '',
 'keith',
 '']

0        From: mathew <mathew@mantis.co.uk>\nSubject: A...
1        From: mathew <mathew@mantis.co.uk>\nSubject: A...
2        From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro...
3        From: mathew <mathew@mantis.co.uk>\nSubject: R...
4        From: strom@Watson.Ibm.Com (Rob Strom)\nSubjec...
                               ...                        
11078    From: psyrobtw@ubvmsd.cc.buffalo.edu (Robert W...
11079    From: sandvik@newton.apple.com (Kent Sandvik)\...
11080    From: sandvik@newton.apple.com (Kent Sandvik)\...
11081    From: cutter@gloster.via.mind.org (cutter)\nSu...
11082    Subject: Re: Albert Sabin\nFrom: rfox@charlie....
Name: text, Length: 11083, dtype: object

In [None]:
def info_extractor():

## Build global index
- for future information retrieve

In [19]:
# def parser():
data

['../data/train\\talk.religion.misc\\82757',
 '../data/train\\talk.religion.misc\\82758',
 '../data/train\\talk.religion.misc\\82759',
 '../data/train\\talk.religion.misc\\82760',
 '../data/train\\talk.religion.misc\\82763',
 '../data/train\\talk.religion.misc\\82766',
 '../data/train\\talk.religion.misc\\82767',
 '../data/train\\talk.religion.misc\\82771',
 '../data/train\\talk.religion.misc\\82772',
 '../data/train\\talk.religion.misc\\82774',
 '../data/train\\talk.religion.misc\\82775',
 '../data/train\\talk.religion.misc\\82776',
 '../data/train\\talk.religion.misc\\82777',
 '../data/train\\talk.religion.misc\\82778',
 '../data/train\\talk.religion.misc\\82779',
 '../data/train\\talk.religion.misc\\82781',
 '../data/train\\talk.religion.misc\\82782',
 '../data/train\\talk.religion.misc\\82783',
 '../data/train\\talk.religion.misc\\82784',
 '../data/train\\talk.religion.misc\\82785',
 '../data/train\\talk.religion.misc\\82786',
 '../data/train\\talk.religion.misc\\82787',
 '../data/

In [18]:
pd.data

['../data/train\\talk.religion.misc\\82757',
 '../data/train\\talk.religion.misc\\82758',
 '../data/train\\talk.religion.misc\\82759',
 '../data/train\\talk.religion.misc\\82760',
 '../data/train\\talk.religion.misc\\82763',
 '../data/train\\talk.religion.misc\\82766',
 '../data/train\\talk.religion.misc\\82767',
 '../data/train\\talk.religion.misc\\82771',
 '../data/train\\talk.religion.misc\\82772',
 '../data/train\\talk.religion.misc\\82774',
 '../data/train\\talk.religion.misc\\82775',
 '../data/train\\talk.religion.misc\\82776',
 '../data/train\\talk.religion.misc\\82777',
 '../data/train\\talk.religion.misc\\82778',
 '../data/train\\talk.religion.misc\\82779',
 '../data/train\\talk.religion.misc\\82781',
 '../data/train\\talk.religion.misc\\82782',
 '../data/train\\talk.religion.misc\\82783',
 '../data/train\\talk.religion.misc\\82784',
 '../data/train\\talk.religion.misc\\82785',
 '../data/train\\talk.religion.misc\\82786',
 '../data/train\\talk.religion.misc\\82787',
 '../data/