<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Package-import" data-toc-modified-id="Package-import-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Package import</a></span></li><li><span><a href="#Data-Loading" data-toc-modified-id="Data-Loading-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data Loading</a></span><ul class="toc-item"><li><span><a href="#parsing" data-toc-modified-id="parsing-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>parsing</a></span></li><li><span><a href="#attribute-selection" data-toc-modified-id="attribute-selection-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>attribute selection</a></span></li></ul></li><li><span><a href="#Feature-engineering" data-toc-modified-id="Feature-engineering-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Feature engineering</a></span></li><li><span><a href="#EDA" data-toc-modified-id="EDA-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>EDA</a></span><ul class="toc-item"><li><span><a href="#Regex" data-toc-modified-id="Regex-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Regex</a></span></li></ul></li><li><span><a href="#Build-global-index" data-toc-modified-id="Build-global-index-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Build global index</a></span></li></ul></div>

## Package import 

In [1]:
import pandas as pd 
import numpy as np
import os
from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
import mailparser



## Data Loading
- from file into DataFrame

In [135]:
def load_data_folder(path):
    '''
    @param folders: the train or test directory
    @return: document list with [doc_path, doc, label, original_idx]
    '''
    folders = glob(path+'/**')  # explore all the folder under the directory

    docs = []
    for classes in folders:
        label = classes.split('\\')[-1]
        doc_paths = glob(classes+'\\**')
        
        for doc_path in doc_paths:
            original_idx = doc_path.split('\\')[-1]
            
            with open(doc_path, encoding='UTF-8') as f:
                text = f.read()
            docs.append([doc_path, text, label, original_idx])

    print(f'\nLoaded folder under {path}: \n')
    for folder in folders:
        print(folder)
        
    return docs


corpus_train_docs = load_data_folder(path='../data/train')
corpus_test_docs = load_data_folder(path='../data/test')


Loaded folder under ../data/train: 

../data/train\alt.atheism
../data/train\comp.graphics
../data/train\comp.os.ms-windows.misc
../data/train\comp.sys.ibm.pc.hardware
../data/train\comp.sys.mac.hardware
../data/train\comp.windows.x
../data/train\misc.forsale
../data/train\rec.autos
../data/train\rec.motorcycles
../data/train\rec.sport.baseball
../data/train\rec.sport.hockey
../data/train\sci.crypt
../data/train\sci.electronics
../data/train\sci.med
../data/train\sci.space
../data/train\soc.religion.christian
../data/train\talk.politics.guns
../data/train\talk.politics.mideast
../data/train\talk.politics.misc
../data/train\talk.religion.misc

Loaded folder under ../data/test: 

../data/test\alt.atheism
../data/test\comp.graphics
../data/test\comp.os.ms-windows.misc
../data/test\comp.sys.ibm.pc.hardware
../data/test\comp.sys.mac.hardware
../data/test\comp.windows.x
../data/test\misc.forsale
../data/test\rec.autos
../data/test\rec.motorcycles
../data/test\rec.sport.baseball
../data/test

### parsing

In [136]:
corpus_train = pd.DataFrame(corpus_train_docs, columns=['doc_path', 'text', 'label', 'original_idx'])
corpus_train = corpus_train.reset_index().rename(columns={'index':'global_index'})

corpus_test = pd.DataFrame(corpus_test_docs, columns=['doc_path', 'text', 'label', 'original_idx'])
corpus_test = corpus_test.reset_index().rename(columns={'index':'global_index'})

print("original_idx duplicate count:", corpus_train.shape[0] - corpus_train.original_idx.drop_duplicates().shape[0], ' on ', corpus_train.shape[0])
print("original_idx duplicate count:", corpus_test.shape[0] - corpus_test.original_idx.drop_duplicates().shape[0], ' on ', corpus_test.shape[0])

original_idx duplicate count: 1060  on  11083
original_idx duplicate count: 770  on  7761


In [137]:
def parser(string):
    '''
    @param parser: email string
    @return: structural information for email header, body, others
    '''
    error_message = None
    header = {}
    body = None
    others = []
    try:
        mail = mailparser.parse_from_string(string)
        if mail.has_defects:  # [first line error]
            remove_first_line_string = "\n".join(string.split('\n')[1:])
            mail = mailparser.parse_from_string(remove_first_line_string)
            # print('remove_first_line_string update for ')
        header, body = mail.headers, mail.body
        others = [mail.date, mail.delivered_to, mail.to_domains, error_message]

    except Exception as error:
        error_message = error
    return header, body, others


def structural_email(data):
    '''
    @param data: data text dataframe series including all the training set or test set
    @return: structural information
    '''
    header_info = []
    body_info = []
    others_info = []
    for string in data:
        header, body, others = parser(string)
        header_info.append(header)
        body_info.append(body)
        others_info.append(others)

    a1 = pd.DataFrame.from_dict(header_info)
    a2 = pd.DataFrame(body_info, columns=['body'])
    a3 = pd.DataFrame(others_info, columns=[
                      'date', 'delivered_to', 'to_domains', 'error_message'])
    structure_email = pd.concat([a1, a2, a3], axis=1)
    return structure_email

In [None]:
structural_train = structural_email(corpus_train['text'])
structural_test = structural_email(corpus_test['text'])

In [189]:
train = pd.concat([corpus_train, structural_train], axis=1)
test = pd.concat([corpus_test, structural_test], axis=1)

### attribute selection

In [191]:
t = train.isnull().sum().sort_values()
not_used_cols = t[t > train.shape[0]*0.1].index.tolist() 
print("not_used_cols: \n", not_used_cols)

select_cols = ['label', 'body', 'global_index', 'doc_path',
               'Subject', 'From', 'Lines', 'Organization']
print('may use cols: \n', select_cols)

not_used_cols: 
may use cols: 
 ['label', 'body', 'global_index', 'doc_path', 'Subject', 'From', 'Lines', 'Organization']


In [197]:
train = train[select_cols]
test = test[select_cols]

In [198]:
train[select_cols].head()

Unnamed: 0,label,body,global_index,doc_path,Subject,From,Lines,Organization
0,alt.atheism,Archive-name: atheism/resources\nAlt-atheism-a...,0,../data/train\alt.atheism\49960,Alt.Atheism FAQ: Atheist Resources,mathew <mathew@mantis.co.uk>,290,"Mantis Consultants, Cambridge. UK."
1,alt.atheism,Archive-name: atheism/introduction\nAlt-atheis...,1,../data/train\alt.atheism\51060,Alt.Atheism FAQ: Introduction to Atheism,mathew <mathew@mantis.co.uk>,646,"Mantis Consultants, Cambridge. UK."
2,alt.atheism,In article <65974@mimsy.umd.edu>\nmangoe@cs.um...,2,../data/train\alt.atheism\51119,Re: Gospel Dating,I3150101@dbstu1.rz.tu-bs.de (Benedikt Rosenau),93,"Technical University Braunschweig, Germany"
3,alt.atheism,dmn@kepler.unh.edu (...until kings become phil...,3,../data/train\alt.atheism\51120,Re: university violating separation of church/...,mathew <mathew@mantis.co.uk>,29,"Mantis Consultants, Cambridge. UK."
4,alt.atheism,In article <N4HY.93Apr5120934@harder.ccr-p.ida...,4,../data/train\alt.atheism\51121,"Re: [soc.motss, et al.] ""Princeton axes matchi...",strom@Watson.Ibm.Com (Rob Strom),15,IBM Research


## Feature engineering

In [201]:
train[['body', 'doc_path']].sample(5).style

Unnamed: 0,body,doc_path
10746,"I produced an error last week about CHORION: >> (By the way Mr DeCenso, you really should have looked in the index of your >> Bauer-Arndt-Gingrich Greek lexicon. You would have found that the word in >> Acts for ""lot"" is ""kleros,"" not ""CHORION"" as stated by Mr Archer, and nowhere >> in the very large discussion of kleros in done the to ""Theological Dictionary >> of the New Testament"" by Bromley, is the meaning ""burial plot"" discussed. It >> discusses the forms of ""kleros"" (eg: kleros, kleroo, etc), and the various >> meanings of ""kleros"" (eg: ""plot of land,"" and ""inheritance""), but mentions >> nothing about CHORION or ""burial plot."" (Why does this not surprise me?) Thus >> it would seem to be a very good thing you dumped Archer as a reference). > > I was wrong. I admit that I do not have a handle on Greek grammar, and thus > confused ""kleros"", the second to last word in Acts 1:17 as being the plot of > land discussed. In actuality it is ""chorion"", which is the last word Acts > 1:18. Unfortunately my Greek dictionary does not discuss ""chorion"" so I > cannot report as to the nuances of the word. I abhor publishing trash (I abhor it of myself even more than I do from others, but since I do not present myself as an authority on the subject, I do not feel dishonest, though I do openly admit ignorance and incompetence in this example). Thus I felt honor bound to do a better set of research specifically on the word. First it should be noted that Greek grammar is not as tough as I first assumed (it is not nontrivial by any means, and I still am not competent with it, but it is not as opaque as I had thought). It turns out that while the Index for the Bauer-Arndt-Gingrich ""Greek Lexicon"" renders each verse in order, each word within a verse is put in greek alphabetical order. Thus while the the meaning of the verse is decipherable, the syntax is far from clear. On the other hand, a Greek-English Intralinear Bible makes things a lot more comprehendable. And yes, the word for field in Acts 1:18 is indeed ""chorion."" Now I've checked several Greek-English lexicons: 	""Greek English Lexicon of the New Testament,"" Louw and Nida 	""Robinson's Greek and English Lexicon of the New Testament"" 	""Greek English Lexicon of the New Testament,"" Grimm 	""Word study Concordance,"" Tynsdale 	""A Greek English Lexicon of the New Testament and other early Christian Writings,"" Bauer-Arndt-Gingrich 	""The New Analytical Greek Lexicon,"" Perschbacher In each case the meaning of the word Chorion was given variously as: 	A space, place, region, district, field, area, ""country place,"" land, farm, estate, ""a bit of tillage"", and similar meanings. Nowhere do any of these books mention anything about ""grave."" As some of these books go into great detail, I would be very surprised to find that these books are all inadequate and Mr Archer is the only competent scholar in Greek. I think it more likely that Mr Archer's investigations into ""contradictions"" to be once again, as your friend said it, ""lacking in substance,"" and thus Archer is again shown worthless as an expert witness (By the way Mr DeCenso, I would have honorably presented my results on this matter, even if I had found them to support Mr Archer's contentions). By the way, among these lexicons, (eg: Robinson's) is the definition of ""agros,"" the word used in Matthew 27 to describe the field bought. The word ""agros"" is defined as ""a field in the country."" Chorion is specifically noted as a synonym to agros. This is significant, as it is evidence of how silly Bullinger's exegisis was, which stated that the word for ""field"" in Matthew (ie: agros) is different from the word for ""field"" in Acts (ie: chorion), and thus we must be talking about two different fields (Of course you already admit how stupid Bullinger's exegisis is, but this was a small serendipity which drives the point home). So as of now, unless Mr DeCenso show compelling reasons to believe otherwise (eg: a reputable scholar with reputable references), I consider this particular issue closed. See Mr DeCenso, now you can go on to answer questions about the denials of Peter, the day of the Crucifixion, Tyre, and the fact that the author of Matthew quoted from the wrong prophet in discussing the ""Potter's Field.""  Later,  Dave Butler  Precise knowledge is the only true knowledge, and he who does not teach  exactly, does not teach at all.  Henry Ward Beecher  American Clergyman  as recorded by George Seldes",../data/train\talk.religion.misc\82813
5710,"In article <15APR93.14691229.0062@lafibm.lafayette.edu> VB30@lafibm.lafayette.edu (VB30) writes: >Just wondering. A friend and I were talking the other day, and >we were (for some reason) trying to come up with names of Jewish >maybe John Lowenstein. Lowenstein is NOT Jewish. However, there is a long list including Hank Greenberg, Moe Berg, Rod Carew (a convert), the Sherry brothers, Art Shamsky, and Ron Blomberg. Barry",../data/train\rec.sport.baseball\104806
8626,"In article dleonar@andy.bgsu.edu (Pixie) writes: >Pardon me, a humble atheist, but exactly what is the difference >between holding a revealed truth with blind faith as its basis (i.e. >regardless of any evidence that you may find to the contrary) as an >absolute truth, fully expecting people to believe you and arrogance? > > They sound like one and the same to me. > > I see no wisdom whatsoever in your words I'm not surprised that you see no wisdom in them. That is because your premises are wrong from the word ""Go"". You claim that Christianity is based on blind faith, but this simply is not so. Just look at the current thread on the evidence for Jesus' resurrection for evidence that Jesus was real and that he triumphed over death. Furthermore, you say that Christians hold to their beliefs ""regardless of any evidence that you may find to the contrary."" Without any evidence to support your claim, this statement is little more than an ad hominem argument. Mind you, I don't mean this as a personal attack. I'm merely pointing out the intellectual dishonesty behind condemning Christianity in this fashion. It would make much more sense if you could prove that all Christians do base their belief on empty nothings, and that they do ignore all evidence to the contrary. Only then can you expect your attack to make sense.  -- Virgilio ""Dean"" Velasco Jr, Department of Electrical Eng'g and Applied Physics CWRU graduate student, roboticist-in-training and Q wannabee  ""Bullwinkle, that man's intimidating a referee!"" | My boss is a ""Not very well. He doesn't look like one at all!"" | Jewish carpenter.",../data/train\soc.religion.christian\20576
4889,"In article craig@cellar.org (Saint Craig) writes: > shz@mare.att.com (Keeper of the 'Tude) writes: > > No anyone who is a ""true"" rider with the real riding > attitude will offer a wave, weather they are on a Harley > or on a Honda or some other bike, inless they have a > serious case of my bike is better than your and you're > too low to be acknowleged. This you'll find is the case > with most of the harley riders out here where I am, > however I still give them a wave, and ride secure in the > knowlege that I'm a better persob than they are.  ^^^^^^  perSOB, I kinda like that Most people wave or return my wave when I'm on my Harley. Other Harley riders seldom wave back to me when I'm on my duck. Squids don't wave, or return waves ever, even to each other, from what I can tell. -- Michael Manning mmanning@icomsim.com (NeXTMail accepted.) `92 FLSTF FatBoy `92 Ducati 900SS",../data/train\rec.motorcycles\104444
8384,"In article enzo@research.canon.oz.au (Enzo Liguori) writes: >Now, Space Marketing >is working with University of Colorado and Livermore engineers on >a plan to place a mile-long inflatable billboard in low-earth >orbit. NASA would provide contractual launch services. However, >since NASA bases its charge on seriously flawed cost estimates >(WN 26 Mar 93) the taxpayers would bear most of the expense. This >may look like environmental vandalism, but Mike Lawson, CEO of >Space Marketing, told us yesterday that the real purpose of the >project is to help the environment! The platform will carry ozone >monitors he explained--advertising is just to help defray costs. How could this possibly be ""environmental vandalism"" when there is no ""environment"" to vandalize up there? Since the advertising ""is just to help defray costs"", it's certainly no surprise that ""the taxpayers would bear most of the expense"". Sounds like a good idea to me, since the taxpayers would bear _all_ of the expense if they didn't do the advertising. >What do you think of this revolting and hideous attempt to vandalize >the night sky? Great idea, they should have done it long ago. >What about light pollution in observations? (I read somewhere else that >it might even be visible during the day, leave alone at night). I can't believe that a mile-long billboard would have any significant effect on the overall sky brightness. Venus is visible during the day, but nobody complains about that. Besides, it's in LEO, so it would only be visible during twilight when the sky is already bright, and even if it would have some miniscule impact, it would be only for a short time as it goes zipping across the sky. >Are protesting groups being organized in the States? No doubt. People are always looking for something to protest about, so it would be no surprise. >Really, really depressed. Well, look on the, er, bright side. Imagine the looks on the faces of people in primitive tribes out in the middle of nowhere as they look up and see a can of Budweiser flying across the sky... :-D -- Jeff Cook Jeff.Cook@FtCollinsCO.NCR.com",../data/train\sci.space\61100


## EDA

In [10]:
train['char_length'] = train['text'].apply(lambda x:len(x))

# c = (train['char_length'].sort_values())
# sent_cdf = c.cumsum()
# sent_pdf = c # / c.sum()
# sent_pdf.plot(kind='hist', bins=100)
# plt.xlabel("char_length")  # 需要先sort, 才能说是index of chars.
# plt.ylabel("char_cum_counts_perc")
# plt.title("MAX_DOC_LEN CDF")
# plt.show()

In [11]:
train.sort_values(by='char_length')

Unnamed: 0,global_index,doc_path,text,label,original_idx,char_length
494,494,../data/train\comp.graphics\37928,From: hl7204@eehp22 (H L)\nSubject: Re: Graphi...,comp.graphics,37928,125
2402,2402,../data/train\comp.sys.mac.hardware\51522,From: kwgst+@pitt.edu (Mr. Someone)\nSubject: ...,comp.sys.mac.hardware,51522,125
4039,4039,../data/train\rec.autos\101589,From: swdwan@napier.uwaterloo.ca (Donald Wan)\...,rec.autos,101589,131
2421,2421,../data/train\comp.sys.mac.hardware\51541,From: news@news.claremont.edu (The News System...,comp.sys.mac.hardware,51541,139
3595,3595,../data/train\misc.forsale\75911,From: ibeshir@nyx.cs.du.edu (Ibrahim)\nSubject...,misc.forsale,75911,142
...,...,...,...,...,...,...
10176,10176,../data/train\talk.politics.mideast\76392,From: dbd@urartu.sdpa.org (David Davidian)\nSu...,talk.politics.mideast,76392,62187
1362,1362,../data/train\comp.os.ms-windows.misc\9704,From: james@dlss2 (James Cummings)\nSubject: R...,comp.os.ms-windows.misc,9704,63095
2826,2826,../data/train\comp.windows.x\66322,From: ware@cis.ohio-state.edu (Peter Ware)\nSu...,comp.windows.x,66322,66459
9625,9625,../data/train\talk.politics.guns\54684,From: mjp@watson.ibm.com (Michael Phelps)\nSub...,talk.politics.guns,54684,71400


Unnamed: 0,From,Subject,Summary,Keywords,Expires,Distribution,Organization,Supersedes,Lines,X-Newsreader,...,Followups-to,X-Newsposter,X-Header,X-Cc,Oanization,body,date,delivered_to,to_domains,error_message
0,mathew <mathew@mantis.co.uk>,Alt.Atheism FAQ: Atheist Resources,"Books, addresses, music -- anything related to...","FAQ, atheism, books, music, fiction, addresses...","Thu, 29 Apr 1993 11:57:19 GMT",world,"Mantis Consultants, Cambridge. UK.",<19930301143317@mantis.co.uk>,290,,...,,,,,,Archive-name: atheism/resources\nAlt-atheism-a...,,[],[],
1,mathew <mathew@mantis.co.uk>,Alt.Atheism FAQ: Introduction to Atheism,Please read this file before posting to alt.at...,"FAQ, atheism","Thu, 6 May 1993 12:22:45 GMT",world,"Mantis Consultants, Cambridge. UK.",<19930308134439@mantis.co.uk>,646,,...,,,,,,Archive-name: atheism/introduction\nAlt-atheis...,,[],[],
2,I3150101@dbstu1.rz.tu-bs.de (Benedikt Rosenau),Re: Gospel Dating,,,,,"Technical University Braunschweig, Germany",,93,,...,,,,,,In article <65974@mimsy.umd.edu>\nmangoe@cs.um...,,[],[],
3,mathew <mathew@mantis.co.uk>,Re: university violating separation of church/...,,,,,"Mantis Consultants, Cambridge. UK.",,29,rusnews v1.01,...,,,,,,dmn@kepler.unh.edu (...until kings become phil...,,[],[],
4,strom@Watson.Ibm.Com (Rob Strom),"Re: [soc.motss, et al.] ""Princeton axes matchi...",,,,usa,IBM Research,,15,,...,,,,,,In article <N4HY.93Apr5120934@harder.ccr-p.ida...,,[],[],
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11078,psyrobtw@ubvmsd.cc.buffalo.edu (Robert Weiss),18 Apr 93 God's Promise in Philippians 4:9,,,,,University at Buffalo,,8,,...,,,,,,"\n\tThose things,\n\twhich ye have both learne...",,[],[],
11079,sandvik@newton.apple.com (Kent Sandvik),Re: 14 Apr 93 God's Promise in 1 John 1: 7,,,,,Cookamunga Tourist Bureau,,14,,...,,,,,,"In article <RN652B5w165w@tweekco.uucp>, alizar...",,[],[],
11080,sandvik@newton.apple.com (Kent Sandvik),Re: Disillusioned Protestant Finds Christ,,,,,Cookamunga Tourist Bureau,,23,,...,,,,,,"In article <C5KxDD.K4J@boi.hp.com>, jburrill@b...",,[],[],
11081,cutter@gloster.via.mind.org (cutter),Re: Biblical Backing of Koresh's 3-02 Tape (Ci...,,,,world,"Gordian Knot, Gloster,GA",,22,,...,,,,,,netd@susie.sbc.com () writes:\n\n> In article ...,,[],[],


### Regex
- From
- Subject
    - 如果 bmp 在这里面的话, 直接跳过 先不处理
- Summary
- Keywords
- Expires
- Distribution
- Organization
- Supersedes
- Lines

- Archive-name
- Alt-atheism-archive-name
- Last-modified
- Version

内容里面最长的词有多长, 如果太长了 超过100, 则认为是有乱码. 直接跳过?

In [None]:
写正则表达式
- 

In [126]:
string = t['text'][9]
string.split('\n')

['From: keith@cco.caltech.edu (Keith Allan Schneider)',
 'Subject: Re: >>>>>>Pompous ass',
 'Organization: California Institute of Technology, Pasadena',
 'Lines: 9',
 'NNTP-Posting-Host: punisher.caltech.edu',
 '',
 'kmr4@po.CWRU.edu (Keith M. Ryan) writes:',
 '',
 '>>Then why do people keep asking the same questions over and over?',
 '>Because you rarely ever answer them.',
 '',
 "Nope, I've answered each question posed, and most were answered multiple",
 'times.',
 '',
 'keith',
 '']

0        From: mathew <mathew@mantis.co.uk>\nSubject: A...
1        From: mathew <mathew@mantis.co.uk>\nSubject: A...
2        From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro...
3        From: mathew <mathew@mantis.co.uk>\nSubject: R...
4        From: strom@Watson.Ibm.Com (Rob Strom)\nSubjec...
                               ...                        
11078    From: psyrobtw@ubvmsd.cc.buffalo.edu (Robert W...
11079    From: sandvik@newton.apple.com (Kent Sandvik)\...
11080    From: sandvik@newton.apple.com (Kent Sandvik)\...
11081    From: cutter@gloster.via.mind.org (cutter)\nSu...
11082    Subject: Re: Albert Sabin\nFrom: rfox@charlie....
Name: text, Length: 11083, dtype: object

In [None]:
def info_extractor():

## Build global index
- for future information retrieve

In [19]:
# def parser():
data

['../data/train\\talk.religion.misc\\82757',
 '../data/train\\talk.religion.misc\\82758',
 '../data/train\\talk.religion.misc\\82759',
 '../data/train\\talk.religion.misc\\82760',
 '../data/train\\talk.religion.misc\\82763',
 '../data/train\\talk.religion.misc\\82766',
 '../data/train\\talk.religion.misc\\82767',
 '../data/train\\talk.religion.misc\\82771',
 '../data/train\\talk.religion.misc\\82772',
 '../data/train\\talk.religion.misc\\82774',
 '../data/train\\talk.religion.misc\\82775',
 '../data/train\\talk.religion.misc\\82776',
 '../data/train\\talk.religion.misc\\82777',
 '../data/train\\talk.religion.misc\\82778',
 '../data/train\\talk.religion.misc\\82779',
 '../data/train\\talk.religion.misc\\82781',
 '../data/train\\talk.religion.misc\\82782',
 '../data/train\\talk.religion.misc\\82783',
 '../data/train\\talk.religion.misc\\82784',
 '../data/train\\talk.religion.misc\\82785',
 '../data/train\\talk.religion.misc\\82786',
 '../data/train\\talk.religion.misc\\82787',
 '../data/

In [18]:
pd.data

['../data/train\\talk.religion.misc\\82757',
 '../data/train\\talk.religion.misc\\82758',
 '../data/train\\talk.religion.misc\\82759',
 '../data/train\\talk.religion.misc\\82760',
 '../data/train\\talk.religion.misc\\82763',
 '../data/train\\talk.religion.misc\\82766',
 '../data/train\\talk.religion.misc\\82767',
 '../data/train\\talk.religion.misc\\82771',
 '../data/train\\talk.religion.misc\\82772',
 '../data/train\\talk.religion.misc\\82774',
 '../data/train\\talk.religion.misc\\82775',
 '../data/train\\talk.religion.misc\\82776',
 '../data/train\\talk.religion.misc\\82777',
 '../data/train\\talk.religion.misc\\82778',
 '../data/train\\talk.religion.misc\\82779',
 '../data/train\\talk.religion.misc\\82781',
 '../data/train\\talk.religion.misc\\82782',
 '../data/train\\talk.religion.misc\\82783',
 '../data/train\\talk.religion.misc\\82784',
 '../data/train\\talk.religion.misc\\82785',
 '../data/train\\talk.religion.misc\\82786',
 '../data/train\\talk.religion.misc\\82787',
 '../data/