### Converting the Switchboard dataset into Convokit format

In [7]:
import glob

from convokit import Corpus, User, Utterance, meta_index
from swda import Transcript

#### Creating Users

Each caller is considered a user, and there are a total of 440 different callers in this dataset. Each user is identified by a numerical id, and the metadata for each user includes the following information:

- Gender (str): MALE or FEMALE
- Education (int): 0, 1, 2, 3, 9
- Birth Year (int): YYYY
- Dialect Area (str): MIXED, NEW ENGLAND, NORTH MIDLAND, NORTHERN, NYC, SOUTH MIDLAND, SOUTHERN, UNK, WESTERN

In [21]:
# Switchboard utterance files, one per conversation transcript
files = glob.glob("./swda/*/sw_*.utt.csv")
user_meta = {}

for file in files:
    trans = Transcript(file, './swda/swda-metadata.csv')
    # Every transcript has two callers; record the same four metadata
    # fields for the "from" side first and then for the "to" side.
    for side in ("from_caller", "to_caller"):
        caller_id = getattr(trans, side)
        user_meta[caller_id] = {
            field: getattr(trans, "{}_{}".format(side, field))
            for field in ("sex", "education", "birth_year", "dialect_area")
        }

Create a User object for each unique user in the dataset

In [22]:
# Wrap each caller's metadata in a User object, keyed by the caller id
corpus_users = {
    caller_id: User(name=caller_id, meta=caller_meta)
    for caller_id, caller_meta in user_meta.items()
}

Check number of users in the dataset

In [59]:
# Report how many distinct callers were collected
user_count = len(corpus_users)
print("Number of users in the data = {}".format(user_count))

Number of users in the data = 440


In [19]:
# Show the metadata stored for one caller (user 1632) as an example
example_user = corpus_users[1632]
example_user.meta

{'sex': 'FEMALE',
 'education': 2,
 'birth_year': 1962,
 'dialect_area': 'WESTERN'}

#### Creating Utterances

Utterances are found in the `utterances` attribute of each Transcript object, and the text of each utterance is in its `text` field. There are 221,616 utterances in total.

Each Utterance object has the following fields:

- id: the unique id of the utterance
- user: the User giving the utterance
- root: id of the root utterance of the conversation
- reply_to: id of the utterance this replies to
- timestamp: timestamp of the utterance (not applicable in Switchboard)
- text: text of the utterance
- tag: the DAMSL act-tag of the utterance (stored in the utterance's `meta`)
- pos: the part-of-speech tagged portion of the utterance (stored in the utterance's `meta`)

In [56]:
utterance_corpus = {}


def _utterance_id(conversation_no, transcript_index):
    """Build the "<conversation_no>-<transcript_index>" id of an utterance."""
    return str(conversation_no) + "-" + str(transcript_index)


# Walk over every transcript file
for file in files:

    trans = Transcript(file, './swda/swda-metadata.csv')
    utts = trans.utterances
    # The utterance at index 0 is used as the root of the conversation
    root = _utterance_id(trans.conversation_no, 0)

    # Walk over every utterance of this transcript
    for utt in utts:

        idx = _utterance_id(utt.conversation_no, utt.transcript_index)
        text = utt.text

        # A caller label containing 'A' maps to the from-caller,
        # anything else to the to-caller
        user = trans.from_caller if 'A' in utt.caller else trans.to_caller

        # The first utterance replies to nothing; every other utterance
        # replies to the one with the preceding transcript index
        if utt.transcript_index == 0:
            reply_to = None
        else:
            reply_to = _utterance_id(utt.conversation_no,
                                     utt.transcript_index - 1)

        # Act-tag and POS information are carried as utterance metadata
        meta = {'tag': utt.act_tag, 'pos': utt.pos}

        # Timestamp is passed as None (not applicable in Switchboard)
        utterance_corpus[idx] = Utterance(idx, corpus_users[user], root,
                                          reply_to, None, text, meta)

In [57]:
# Flatten the id -> Utterance mapping into a plain list of Utterance objects
utterance_list = list(utterance_corpus.values())

Check number of utterances in the dataset

In [60]:
# Report how many utterances were collected across all transcripts
utterance_count = len(utterance_corpus)
print("Number of utterances in the data = {}".format(utterance_count))

Number of utterances in the data = 221616


In [53]:
# Show one Utterance object (id '4325-0') as an example
example_utterance = utterance_corpus['4325-0']
example_utterance

Utterance({'id': '4325-0', 'user': User([('name', 1632)]), 'root': 4325, 'reply_to': None, 'timestamp': None, 'text': 'Okay.  /', 'meta': {'tag': 'o', 'pos': 'Okay/UH ./.'}})

#### Creating corpus from list of utterances

In [58]:
# Build the corpus from the full list of utterances
swda_corpus = Corpus(
    version=1,
    utterances=utterance_list,
)

In [62]:
# Report how many conversations the corpus contains
conversation_count = len(swda_corpus.get_conversation_ids())
print("number of conversations in the dataset = {}".format(conversation_count))

number of conversations in the dataset = 1155


In [69]:
convo_ids = swda_corpus.get_conversation_ids()
# Print the utterance ids of the first two conversations as a sample
for position, conversation_id in enumerate(convo_ids[:2]):
    print("sample conversation {}:".format(position))
    conversation = swda_corpus.get_conversation(conversation_id)
    print(conversation.get_utterance_ids())

sample conversation 0:
['4325-0', '4325-1', '4325-2', '4325-3', '4325-4', '4325-5', '4325-6', '4325-7', '4325-8', '4325-9', '4325-10', '4325-11', '4325-12', '4325-13', '4325-14', '4325-15', '4325-16', '4325-17', '4325-18', '4325-19', '4325-20', '4325-21', '4325-22', '4325-23', '4325-24', '4325-25', '4325-26', '4325-27', '4325-28', '4325-29', '4325-30', '4325-31', '4325-32', '4325-33', '4325-34', '4325-35', '4325-36', '4325-37', '4325-38', '4325-39', '4325-40', '4325-41', '4325-42', '4325-43', '4325-44', '4325-45', '4325-46', '4325-47', '4325-48', '4325-49', '4325-50', '4325-51', '4325-52', '4325-53', '4325-54', '4325-55', '4325-56', '4325-57', '4325-58', '4325-59', '4325-60', '4325-61', '4325-62', '4325-63', '4325-64', '4325-65', '4325-66', '4325-67', '4325-68', '4325-69', '4325-70', '4325-71', '4325-72', '4325-73', '4325-74', '4325-75', '4325-76', '4325-77', '4325-78', '4325-79', '4325-80', '4325-81', '4325-82', '4325-83', '4325-84', '4325-85', '4325-86', '4325-87', '4325-88', '4325-8

#### Updating corpus level metadata

In [71]:
# Collect per-conversation information, keyed by conversation number
swda_meta = {}
for file in files:
    trans = Transcript(file, './swda/swda-metadata.csv')
    idx = trans.conversation_no
    swda_meta[idx] = {
        'filename': trans.ptd_basename,
        'talk_day': trans.talk_day,
        'topic_description': trans.topic_description,
        'length': trans.length,
        'prompt': trans.prompt,
    }

# Attach the collected information and a name to the corpus-level metadata
swda_corpus.meta['metadata'] = swda_meta
swda_corpus.meta['name'] = "The Switchboard Dialog Act Corpus"

In [72]:
# Display the corpus-level metadata that was just attached
corpus_level_meta = swda_corpus.meta
corpus_level_meta

{'name': 'The Switchboard Dialog Act Corpus',
 'metadata': {4325: {'filename': '4/sw4325',
   'talk_day': datetime.datetime(1992, 3, 23, 0, 0),
   'topic_description': 'CHILD CARE',
   'length': 5,
   'prompt': 'FIND OUT WHAT CRITERIA THE OTHER CALLER WOULD USE IN SELECTING CHILD CARE SERVICES FOR A PRESCHOOLER.  IS IT EASY OR DIFFICULT TO FIND SUCH CARE?'},
  4330: {'filename': '4/sw4330',
   'talk_day': datetime.datetime(1992, 3, 23, 0, 0),
   'topic_description': 'DRUG TESTING',
   'length': 5,
   'prompt': 'HOW DO YOU FEEL ABOUT THE PRACTICE OF SOME COMPANIES OR GOVERNMENT AGENCIES TESTING EMPLOYEES OR PROSPECTIVE EMPLOYEES FOR DRUGS?  IS RANDOM SPOT TESTING JUSTIFIED?  WHAT LIMITS SHOULD THERE BE'},
  4103: {'filename': '4/sw4103',
   'talk_day': datetime.datetime(1992, 3, 9, 0, 0),
   'topic_description': 'FAMILY FINANCE',
   'length': 5,
   'prompt': 'DOES YOUR FAMILY KEEP A MONTHLY BUDGETCAN YOU GIVE A GENERAL DESCRIPTION OF YOUR PROCEDURES'},
  4327: {'filename': '4/sw4327',
 

#### Saving created corpus

In [73]:
# Write the corpus out under ./datasets with the name "swda-corpus"
swda_corpus.dump(
    "swda-corpus",
    base_path="./datasets",
)

Check that the metadata fields available in the saved corpus can be inspected directly

In [75]:
# Inspect the metadata index of the corpus saved under ./datasets.
# `meta_index` is imported in the import cell at the top of the notebook so
# that all dependencies are declared in one place.
meta_index(filename="./datasets/swda-corpus")

{'utterances-index': {'tag': "<class 'str'>", 'pos': "<class 'str'>"},
 'users-index': {'sex': "<class 'str'>",
  'education': "<class 'int'>",
  'birth_year': "<class 'int'>",
  'dialect_area': "<class 'str'>"},
 'conversations-index': {},
 'overall-index': {'name': "<class 'str'>", 'metadata': 'bin'},
 'version': 1}