In [1]:
import pandas as pd
import xml.etree.ElementTree as ET
import io
import datetime

In [2]:
etree = ET.parse('../../data/raw/bc3/corpus.xml')

In [3]:
root = etree.getroot()

In [4]:
message_cols = ["Received", "From", "To", "Subject", "Text", "thread"]
thread_cols = ["name", "listno"]
message_df = pd.DataFrame(columns=message_cols);
thread_df = pd.DataFrame(columns=thread_cols);

In [5]:
def get_text(node):
    message = ''
    for sent in node.findall('Sent'):
        message += sent.text + '\n'
    return message

In [6]:
for thread in root.findall('thread'):
    thread_name = thread.find('name').text
    thread_listno = thread.find('listno').text
    thread_df = thread_df.append({'name': thread_name, 'listno': thread_listno}, ignore_index=True)
    
    for message in thread.findall('DOC'):
        message_received = message.find('Received').text
        message_from = message.find('From').text
        message_to = message.find('To').text
        message_subject = message.find('Subject').text
        message_text = get_text(message.find('Text')).strip('\n')
        message_thread = thread_listno
        message_threadname = thread_name
        message_df = message_df.append(
            {
                'Received': message_received,
                'From': message_from,
                'To': message_to,
                'Subject': message_subject,
                'Text': message_text,
                'thread': message_thread,
                'thread_name': message_threadname
            },
            ignore_index=True)

In [7]:
message_df.head()

Unnamed: 0,Received,From,To,Subject,Text,thread,thread_name
0,Tue Dec 08 07:30:52 -0800 1998,Jacob Palme <jpalme@dsv.su.se>,discuss@apps.ietf.org,Extending IETF meetings to two weeks?,"The IETF meetings tend to become too large, cr...",007-7484738,Extending IETF meetings to two weeks?
1,Wed Dec 09 20:21:11 -0800 1998,Terry Allen <tallen@sonic.net>,"discuss@apps.ietf.org,jpalme@dsv.su.se",Re: Extending IETF meetings to two weeks?,"> The IETF meetings tend to become too large, ...",007-7484738,Extending IETF meetings to two weeks?
2,Thu Dec 10 07:09:32 -0800 1998,Brian E Carpenter <brian@hursley.ibm.com>,Terry Allen <tallen@sonic.net>,Re: Extending IETF meetings to two weeks?,"Terry, \n\nWG Chairs already are asked to spec...",007-7484738,Extending IETF meetings to two weeks?
3,Thu Dec 10 11:55:56 -0800 1998,Larry Masinter <masinter@parc.xerox.com>,"<agenda@ietf.org> , <discuss@apps.ietf.org>",create 'final' IETF agenda schedule earlier?,Working groups don't seem to decide at the las...,007-7484738,Extending IETF meetings to two weeks?
4,Fri Dec 11 16:02:06 -0800 1998,Richard Shockey <rshockey@ix.netcom.com>,discuss@apps.ietf.org,Re: create 'final' IETF agenda schedule earlier?,"> For example, it would be very useful so that...",007-7484738,Extending IETF meetings to two weeks?


In [8]:
thread_df.head()

Unnamed: 0,name,listno
0,Extending IETF meetings to two weeks?,007-7484738
1,SWADEurope postcard picture,015-2625401
2,CHI 2003?,023-2964247
3,September F2F,026-4380722
4,Location of August 30-31 1999 Face to Face,043-10248963


In [9]:
message_df['Received'] = pd.to_datetime(message_df['Received'], infer_datetime_format=True)

In [10]:
message_df.head()

Unnamed: 0,Received,From,To,Subject,Text,thread,thread_name
0,1998-12-08 07:30:52-08:00,Jacob Palme <jpalme@dsv.su.se>,discuss@apps.ietf.org,Extending IETF meetings to two weeks?,"The IETF meetings tend to become too large, cr...",007-7484738,Extending IETF meetings to two weeks?
1,1998-12-09 20:21:11-08:00,Terry Allen <tallen@sonic.net>,"discuss@apps.ietf.org,jpalme@dsv.su.se",Re: Extending IETF meetings to two weeks?,"> The IETF meetings tend to become too large, ...",007-7484738,Extending IETF meetings to two weeks?
2,1998-12-10 07:09:32-08:00,Brian E Carpenter <brian@hursley.ibm.com>,Terry Allen <tallen@sonic.net>,Re: Extending IETF meetings to two weeks?,"Terry, \n\nWG Chairs already are asked to spec...",007-7484738,Extending IETF meetings to two weeks?
3,1998-12-10 11:55:56-08:00,Larry Masinter <masinter@parc.xerox.com>,"<agenda@ietf.org> , <discuss@apps.ietf.org>",create 'final' IETF agenda schedule earlier?,Working groups don't seem to decide at the las...,007-7484738,Extending IETF meetings to two weeks?
4,1998-12-11 16:02:06-08:00,Richard Shockey <rshockey@ix.netcom.com>,discuss@apps.ietf.org,Re: create 'final' IETF agenda schedule earlier?,"> For example, it would be very useful so that...",007-7484738,Extending IETF meetings to two weeks?


In [11]:
date_data = {
    'mail_sent_day': [],
    'mail_sent_date': [],
    'mail_sent_time': [],
    'mail_sent_year': []
}
tags = { '1' : 'Monday' ,'2' : 'Tuesday' ,'3' : 'Wednesday' ,'4' : 'Thursday' ,'5' : 'Friday' ,'6' : 'Saturday' ,'7' : 'Sunday' }

for timestamp in message_df.Received:
    date_data['mail_sent_day'].append(tags[str(timestamp.isoweekday())])
    date_data['mail_sent_date'].append(timestamp.date())
    date_data['mail_sent_time'].append(timestamp.time())
    date_data['mail_sent_year'].append(timestamp.year)

date_df = pd.DataFrame.from_dict(date_data, orient='columns')
date_df.index = message_df.index
    
message_df = pd.concat([message_df, date_df], axis=1)
del date_df, date_data, tags

In [12]:
message_df.head()

Unnamed: 0,Received,From,To,Subject,Text,thread,thread_name,mail_sent_day,mail_sent_date,mail_sent_time,mail_sent_year
0,1998-12-08 07:30:52-08:00,Jacob Palme <jpalme@dsv.su.se>,discuss@apps.ietf.org,Extending IETF meetings to two weeks?,"The IETF meetings tend to become too large, cr...",007-7484738,Extending IETF meetings to two weeks?,Tuesday,1998-12-08,07:30:52,1998
1,1998-12-09 20:21:11-08:00,Terry Allen <tallen@sonic.net>,"discuss@apps.ietf.org,jpalme@dsv.su.se",Re: Extending IETF meetings to two weeks?,"> The IETF meetings tend to become too large, ...",007-7484738,Extending IETF meetings to two weeks?,Wednesday,1998-12-09,20:21:11,1998
2,1998-12-10 07:09:32-08:00,Brian E Carpenter <brian@hursley.ibm.com>,Terry Allen <tallen@sonic.net>,Re: Extending IETF meetings to two weeks?,"Terry, \n\nWG Chairs already are asked to spec...",007-7484738,Extending IETF meetings to two weeks?,Thursday,1998-12-10,07:09:32,1998
3,1998-12-10 11:55:56-08:00,Larry Masinter <masinter@parc.xerox.com>,"<agenda@ietf.org> , <discuss@apps.ietf.org>",create 'final' IETF agenda schedule earlier?,Working groups don't seem to decide at the las...,007-7484738,Extending IETF meetings to two weeks?,Thursday,1998-12-10,11:55:56,1998
4,1998-12-11 16:02:06-08:00,Richard Shockey <rshockey@ix.netcom.com>,discuss@apps.ietf.org,Re: create 'final' IETF agenda schedule earlier?,"> For example, it would be very useful so that...",007-7484738,Extending IETF meetings to two weeks?,Friday,1998-12-11,16:02:06,1998


In [13]:
thread_df = thread_df.set_index('listno')

In [14]:
thread_df.head()

Unnamed: 0_level_0,name
listno,Unnamed: 1_level_1
007-7484738,Extending IETF meetings to two weeks?
015-2625401,SWADEurope postcard picture
023-2964247,CHI 2003?
026-4380722,September F2F
043-10248963,Location of August 30-31 1999 Face to Face


In [15]:
thread_data = {
    'count': [],
    'start_date': [],
    'end_date': [],
    'participants': []
}
for thread in thread_df.iterrows():
    messages = message_df.loc[message_df['thread'] == thread[0]]
    thread_data['count'].append(messages.count()[0])
    thread_data['start_date'].append(min(messages.Received))
    thread_data['end_date'].append(max(messages.Received))
    participants = []
    for message in messages.iterrows():
        if message[1].To is not None:
            participants += ([message[1].From] + message[1].To.split(','))
        else:
            participants += [message[1].From]
    thread_data['participants'].append(list(set(participants)))
    
thread_data_df = pd.DataFrame.from_dict(thread_data, orient='columns')
thread_data_df.head()
thread_data_df.index = thread_df.index
    
thread_df = pd.concat([thread_df, thread_data_df], axis=1)
del thread_data_df, thread_data

In [16]:
thread_df.head()

Unnamed: 0_level_0,name,count,start_date,end_date,participants
listno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
007-7484738,Extending IETF meetings to two weeks?,5,1998-12-08 07:30:52-08:00,1998-12-11 16:02:06-08:00,"[discuss@apps.ietf.org, <discuss@apps.ietf.or..."
015-2625401,SWADEurope postcard picture,6,2003-04-25 06:39:28-07:00,2003-04-25 15:18:38-07:00,"[""'Libby Miller'"" <Libby.Miller@bristol.ac.uk>..."
023-2964247,CHI 2003?,7,2003-03-26 00:46:07-08:00,2003-04-03 12:02:24-08:00,"[Keith Instone <keith@instone.org>, Sharon Las..."
026-4380722,September F2F,5,2003-05-30 12:15:41-07:00,2003-06-04 19:35:33-07:00,"[Francis McCabe <fgm@fla.fujitsu.com>, Monica ..."
043-10248963,Location of August 30-31 1999 Face to Face,8,1999-07-15 19:40:16-07:00,1999-07-19 11:55:20-07:00,"[""Barb Fox (Exchange)"" <bfox@Exchange.Microsof..."


In [17]:
message_df.to_pickle('bc3_mails.pkl')
thread_df.to_pickle('bc3_threads.pkl')