# Dataset Generation Script
This notebook contains the code to generate the dataset for the Inbox Guardian classification task.  It uses the Gmail API to scrape all emails in a user's inbox with the labels "Unimportant," "Normal," and "Urgent," cleans and preprocesses them, and writes them to a CSV file.  Note that this script requires you to log in to your Google account to run: it reads an OAuth client file (credentials.json) and generates a token.json file on first login. I have not included my versions of these files, to protect the privacy of my account.

In [1]:
import base64
import csv
import io
import os
import os.path
import re

import pandas as pd
from bs4 import BeautifulSoup
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

In [4]:
# this cell handles logging into your Google account
# NOTE(review): every Gmail call visible in this notebook is a read (labels.list,
# messages.list, messages.get), so the gmail.modify scope may be unnecessary -- confirm
# before narrowing it.
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly','https://www.googleapis.com/auth/gmail.modify']

# Both files live one directory above the notebook.
TOKEN_PATH = '../token.json'
CREDENTIALS_PATH = '../credentials.json'

creds = None
# The file token.json stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
if os.path.exists(TOKEN_PATH):
    creds = Credentials.from_authorized_user_file(TOKEN_PATH, SCOPES)

# If there are no (valid) credentials available, refresh them or let the user log in.
if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        # your creds file here. Please create json file as here https://cloud.google.com/docs/authentication/getting-started
        flow = InstalledAppFlow.from_client_secrets_file(CREDENTIALS_PATH, SCOPES)
        creds = flow.run_local_server(port=0)
    # Save the credentials for the next run. This happens after a refresh as well as
    # after a fresh login, so the token on disk is always the most recent one.
    with open(TOKEN_PATH, 'w') as token:
        token.write(creds.to_json())

In [5]:
# Build the Gmail client and fetch every label attached to the account.
service = build("gmail", "v1", credentials=creds)
results = service.users().labels().list(userId="me").execute()
labels = results.get("labels", [])

# Sanity check: confirm the Urgent, Normal and Unimportant labels exist, and show the
# internal label ids that are needed later to fetch the messages filed under each label.
if labels:
    print("Labels:")
    for label in labels:
        print(f'{label["name"]}: {label["id"]}')
else:
    print("No labels found.")


Labels:
CHAT: CHAT
SENT: SENT
INBOX: INBOX
IMPORTANT: IMPORTANT
TRASH: TRASH
DRAFT: DRAFT
SPAM: SPAM
CATEGORY_FORUMS: CATEGORY_FORUMS
CATEGORY_UPDATES: CATEGORY_UPDATES
CATEGORY_PERSONAL: CATEGORY_PERSONAL
CATEGORY_PROMOTIONS: CATEGORY_PROMOTIONS
CATEGORY_SOCIAL: CATEGORY_SOCIAL
STARRED: STARRED
UNREAD: UNREAD
Normal: Label_3234030140869167843
Urgent: Label_502346772578880845
Unimportant: Label_6179828275384677292


In [6]:
def _find_body_data(part):
    """Return the first non-empty base64 body found in a MIME part tree, or None.

    The part's own body is checked first, then its child parts in order (depth first).
    """
    body = part.get("body") or {}
    if body.get("data"):
        return body["data"]
    for child in part.get("parts") or []:
        found = _find_body_data(child)
        if found:
            return found
    return None


def decode_message(msg):
    """
    This function handles converting a message retrieved via the Gmail API to a human readable format.  The message body
    is located in a different part of the payload depending on the number/type of attachments, so the payload tree is
    searched for the first part that actually carries body data, which is then decoded from base64 to plain text.

    Args:
        msg: The message returned by the Gmail API to decode
    Returns:
        text: A plaintext version of the message body, or "" when the message has no body data
              or the data cannot be decoded as base64 / UTF-8
    """

    message_raw = _find_body_data(msg.get("payload") or {})
    if not message_raw:
        # e.g. an empty email, or one whose parts hold no inline body data
        return ""

    # decode message body to plain text
    try:
        raw = message_raw.encode("ascii") if isinstance(message_raw, str) else message_raw
        # restore any '=' padding that was stripped, which the decoder would otherwise reject
        raw += b"=" * (-len(raw) % 4)
        return base64.urlsafe_b64decode(raw).decode("utf-8")
    except (ValueError, TypeError):
        # binascii.Error and UnicodeError are both ValueError subclasses; unlike
        # BaseException this lets KeyboardInterrupt / SystemExit propagate
        return ""

In [7]:
def parse_metadata(msg):
    """
    Parses out the sender and subject from the headers of a message returned by the Gmail API

    Header names are compared case-insensitively, so "From", "from" and "FROM" are all
    recognised. If a header appears more than once, the last occurrence wins.

    Args:
        msg: The message returned by the Gmail API to decode
    Returns:
        ret: A list [sender, subject]; an entry stays " " when its header is absent
    """

    ret = [" ", " "]

    headers = (msg.get("payload") or {}).get("headers") or []

    # loop over all message metadata
    for header in headers:
        name = str(header.get("name", "")).lower()
        if name == "from":
            ret[0] = header.get("value", " ")
        elif name == "subject":
            ret[1] = header.get("value", " ")
    return ret
        

In [8]:
def read_messages(messages, label):
    """
    Reads over all messages belonging to a specific Gmail Label, and converts them to csv formatted lines

    Each message is fetched in full through the module-level `service` client, then turned into one
    row with the columns Sender, Subject, Body, Meta (sender + subject), Full (meta + body) and Label.
    Rows are produced with csv.writer, so a field containing a double quote is quoted and escaped
    instead of corrupting the file.

    Args:
        messages: The messages belonging to a specific Gmail label to read over
        label: The label to assign each message (we only read messages belonging to a single label in this function, so all messages should get the same label)
    Returns:
        text: A csv formatted string containing information about all messages in messages
    """

    def strip_special(value):
        # collapse runs of newlines, tabs and commas to one space so a field stays on one line
        return re.sub('[\n\r\t,]+', ' ', value)

    buffer = io.StringIO()
    writer = csv.writer(buffer, lineterminator='\n')

    for message in messages:
        # fetch more specific information about each message
        msg = service.users().messages().get(userId='me', id=message['id']).execute()

        # get sender/subject and parse out special characters
        metadata = parse_metadata(msg)
        sender = strip_special(metadata[0])
        subject = strip_special(metadata[1])

        # filter out special characters via regular expressions, html code via Beautiful soup.
        # The parser is named explicitly so the output does not depend on which parsers are installed.
        html = strip_special(decode_message(msg))
        # strip again after parsing: HTML entities can decode to commas or newlines
        body = strip_special(BeautifulSoup(html, 'html.parser').get_text())

        meta = f'{sender} {subject}' # sender and subject concatenated
        full = f'{meta} {body}' # sender, subject, and body concatenated

        writer.writerow([sender, subject, body, meta, full, label])

    print(f'Processed {len(messages)} documents')
    return buffer.getvalue()


In [9]:
# retrieve all messages associated with each label
# NOTE: these ids belong to the labels printed by the label-listing cell for the author's
# account ("Unimportant", "Urgent", "Normal"); replace them with the ids printed for yours.
UNIMPORTANT_LABEL_ID = 'Label_6179828275384677292'
URGENT_LABEL_ID = 'Label_502346772578880845'
NORMAL_LABEL_ID = 'Label_3234030140869167843'


def list_messages_with_label(label_id, page_size=500):
    """
    Lists every message carrying a given Gmail label, following pagination

    Args:
        label_id: The internal Gmail id of the label to list
        page_size: The maximum number of messages to request per API call
    Returns:
        collected: A list of the message stubs returned by the API across all pages
    """
    collected = []
    page_token = None
    while True:
        params = {'userId': 'me', 'labelIds': [label_id], 'maxResults': page_size}
        if page_token:
            params['pageToken'] = page_token
        response = service.users().messages().list(**params).execute()
        collected.extend(response.get('messages', []))
        # a missing nextPageToken means this was the last page
        page_token = response.get('nextPageToken')
        if not page_token:
            return collected


spamMessages = list_messages_with_label(UNIMPORTANT_LABEL_ID)
urgentMessages = list_messages_with_label(URGENT_LABEL_ID)
normalMessages = list_messages_with_label(NORMAL_LABEL_ID)

In [10]:
# write messages to a csv
# note this csv is NOT shuffled - we will do this during model training
# extra emails come from email chains, where each individual email counts as its own message

# Mode "w" truncates any existing file, so no separate delete step is needed.
# The encoding is explicit because pd.read_csv reads the file back as UTF-8 by default
# and email text contains non-ASCII characters.
with open("fullDataset.csv", "w", encoding="utf-8") as f:
    f.write("Sender,Subject,Body,Meta,Full,Label\n")
    f.write(read_messages(spamMessages, 0))
    f.write(read_messages(normalMessages, 1))
    f.write(read_messages(urgentMessages, 2))

  body = BeautifulSoup(re.sub('[\n\r\t,]+', ' ', decode_message(msg))).get_text()


Processed 262 documents
Processed 184 documents
Processed 90 documents


In [11]:
# sanity check - load the dataset that was just written and preview its first 5 rows
df = pd.read_csv("fullDataset.csv")
df.head(5)

Unnamed: 0,Sender,Subject,Body,Meta,Full,Label
0,Womens Network Dartmouth <Womens.Network.Dartm...,YOU'RE INVITED: SPA NIGHT TOMORROW 8PM,As the term concludes take a study break and ...,Womens Network Dartmouth <Womens.Network.Dartm...,Womens Network Dartmouth <Womens.Network.Dartm...,0
1,Dartmouth Bikes <Dartmouth.Bikes@dartmouth.edu>,Open Bike Shop Hours (Free Lube!) Tuesday Morn...,Open shop hours are available for the communit...,Dartmouth Bikes <Dartmouth.Bikes@dartmouth.edu...,Dartmouth Bikes <Dartmouth.Bikes@dartmouth.edu...,0
2,Collis Governing Board <Collis.Governing.Board...,microbrews <3,CGB Microbrews where: One Wheelock (Collis) wh...,Collis Governing Board <Collis.Governing.Board...,Collis Governing Board <Collis.Governing.Board...,0
3,The New York Times <nytdirect@nytimes.com>,Breaking news: $1 billion donation to provide ...,Breaking news: $1 billion donation to provide ...,The New York Times <nytdirect@nytimes.com> Bre...,The New York Times <nytdirect@nytimes.com> Bre...,0
4,Quizlet <newsletter@lifecycle.quizlet.com>,7-week study streak ✅,You're on a roll! 96 Quizlet /* RESETS ...,Quizlet <newsletter@lifecycle.quizlet.com> 7-w...,Quizlet <newsletter@lifecycle.quizlet.com> 7-w...,0


In [12]:
# sanity check - report how many rows the dataset contains
print(df.shape[0])

536
