In [1]:
#installing mailbox module

!pip install mailbox



In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import mailbox  # mailbox is a python module that helps access and manipulate files of difeerent mailbox formats like mbox, MailDi and, MH.

In [2]:
# path to the mbox file on my local machine
# r" is used to prevent Unicode error from \U 

mboxfile = r"C:\Users\VICTOR JOY\Downloads\takeout-20201104T005831Z-001\Takeout\Mail\All mail Including Spam and Trash.mbox"
mbox = mailbox.mbox(mboxfile)
mbox

<mailbox.mbox at 0x1d7edb768e0>

In [6]:
# mailbox interface dictionary-like meaning it contains key-value pairs.
# here we search for all the keys in the mbox file we collected

for key in mbox[0].keys():
    print(key)

X-GM-THRID
X-Gmail-Labels
Delivered-To
Received
X-Google-Smtp-Source
X-Received
ARC-Seal
ARC-Message-Signature
ARC-Authentication-Results
Return-Path
Received
Received-SPF
Authentication-Results
DKIM-Signature
To
From
Reply-To
Subject
Message-ID
List-Unsubscribe
Feedback-ID
MIME-Version
Content-Type
Date


In [7]:
import csv  

In [9]:
# converting the mailbox into a csv file
# csv file name = mailbox.csv
# encoding = utf-8 is used to prevent unicode errors. Most webpages use utf-8 to represent unicodes, meaning there are unicodes in our mail, without this, our code wouls give errors
# we're using a with statement because we want to do something to the data

with open('mailbox.csv', 'w', encoding="utf-8") as outputfile:  # output file is used to refer to the file objects
  writer = csv.writer(outputfile) #csv writer reads and writes documents in dictionary form
  writer.writerow(['subject','from','date','to','label','thread'])
    
    
  for message in mbox:
    writer.writerow([message['subject'], message['from'],  message['date'], message['to'],  message['X-Gmail-Labels'], message['X-GM-THRID']])

In [10]:
#reading the csv file and naming features/columns

dfs = pd.read_csv('mailbox.csv', names=['subject', 'from', 'date', 'to', 'label', 'thread'])

In [11]:
dfs.dtypes

subject    object
from       object
date       object
to         object
label      object
thread     object
dtype: object

In [12]:
#date is of type "object" but we want to change it to type "date-time" so we can have a proper analysis of our e-mail data

dfs['date'] = dfs['date'].apply(lambda x: pd.to_datetime(x, errors='coerce', utc=True))

In [None]:
'''errors{‘ignore’, ‘raise’, ‘coerce’}, default ‘raise’

If ‘raise’, then invalid parsing will raise an exception.
If ‘coerce’, then invalid parsing will be set as NaN.
If ‘ignore’, then invalid parsing will return the input.'''


'''utc: Boolean value, Returns time in UTC if True.'''

In [13]:
dfs.dtypes

subject                 object
from                    object
date       datetime64[ns, UTC]
to                      object
label                   object
thread                  object
dtype: object

In [14]:
# find the Nan values from date

dfs = dfs[dfs['date'].notna()]

In [15]:
# saving the dataframe into a csv file

dfs.to_csv('gmail.csv')

# DESCRIPTIVE STATISTICS

In [16]:
dfs

Unnamed: 0,subject,from,date,to,label,thread
1,"=?UTF-8?Q?Joy,_Save_=E2=82=A650,000_Off_Your_S...",ALATbyWema <noreply@alat.ng>,2020-10-31 11:02:45+00:00,victoriousjvictor@gmail.com,"Inbox,Category Updates,Unread",1682064975966880594
2,=?utf-8?Q?19=20Gifts=20for=20Your=20=27Emily=2...,=?utf-8?Q?Fashionista?= <newsletters@fashionis...,2020-11-03 16:59:46+00:00,<victoriousjvictor@gmail.com>,"Inbox,Category Promotions,Unread",1682359422775130217
3,=?utf-8?Q?=F0=9F=93=98_SLA_Book_Club:_Zikora_b...,She Leads Africa <content@sheleadsafrica.org>,2020-10-31 17:08:31+00:00,victoriousjvictor@gmail.com,"Inbox,Category Updates,Unread",1682087987251912700
4,Flash Points: U.S. election violence is a clea...,Foreign Policy <noreply@crm.foreignpolicy.com>,2020-10-30 13:12:41+00:00,"""victoriousjvictor@gmail.com"" <victoriousjvict...","Inbox,Category Updates,Unread",1681982557987556596
5,Debit Alert: XXXXXX5010,"""FCMB"" <ebusiness@fcmb.com>",2020-10-30 11:06:41+00:00,victoriousjvictor@gmail.com,"Inbox,Category Personal,Unread",1681974518294368770
...,...,...,...,...,...,...
30103,Debit Alert: XXXXXX5010,"""FCMB"" <ebusiness@fcmb.com>",2020-01-17 18:27:08+00:00,victoriousjvictor@gmail.com,"Inbox,Category Updates,Unread",1656001007669102716
30104,Google Alert - Fashion,Google Alerts <googlealerts-noreply@google.com>,2020-07-24 07:04:33+00:00,victoriousjvictor@gmail.com,"Inbox,Category Updates,Unread",1673080890461859067
30105,There's A New Kind Of Inequality. And It's Not...,"""Pocket"" <noreply@getpocket.com>",2019-12-10 18:24:39+00:00,victoriousjvictor@gmail.com,"Inbox,Category Promotions,Unread",1652558801022797654
30106,Google Alert - New technology,Google Alerts <googlealerts-noreply@google.com>,2020-10-26 20:00:07+00:00,victoriousjvictor@gmail.com,"Inbox,Category Updates,Unread",1681645799109025785


In [18]:
dfs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30107 entries, 1 to 30107
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype              
---  ------   --------------  -----              
 0   subject  30096 non-null  object             
 1   from     30107 non-null  object             
 2   date     30107 non-null  datetime64[ns, UTC]
 3   to       30099 non-null  object             
 4   label    30107 non-null  object             
 5   thread   30107 non-null  object             
dtypes: datetime64[ns, UTC](1), object(5)
memory usage: 1.6+ MB


In [19]:
dfs.columns

Index(['subject', 'from', 'date', 'to', 'label', 'thread'], dtype='object')

In [25]:
# using regex to pick out only email addresses from dfs["from"]


import re

def extract_email_ID(string):
  email = re.findall(r'<(.+?)>', string)  # check the string and search if it contains <....>
  if not email:
    email = list(filter(lambda y: '@' in y, string.split()))  # lamba function to check if the string contains @
  return email[0] if email else np.nan  # if it doesn't contain @ return nan else the email address is returned


# applying the function to the dataframe
dfs['from'] = dfs['from'].apply(lambda x: extract_email_ID(x))

In [24]:
#output of the extract_email_ID function

dfs

Unnamed: 0,subject,from,date,to,label,thread
1,"=?UTF-8?Q?Joy,_Save_=E2=82=A650,000_Off_Your_S...",noreply@alat.ng,2020-10-31 11:02:45+00:00,victoriousjvictor@gmail.com,"Inbox,Category Updates,Unread",1682064975966880594
2,=?utf-8?Q?19=20Gifts=20for=20Your=20=27Emily=2...,newsletters@fashionista.com,2020-11-03 16:59:46+00:00,<victoriousjvictor@gmail.com>,"Inbox,Category Promotions,Unread",1682359422775130217
3,=?utf-8?Q?=F0=9F=93=98_SLA_Book_Club:_Zikora_b...,content@sheleadsafrica.org,2020-10-31 17:08:31+00:00,victoriousjvictor@gmail.com,"Inbox,Category Updates,Unread",1682087987251912700
4,Flash Points: U.S. election violence is a clea...,noreply@crm.foreignpolicy.com,2020-10-30 13:12:41+00:00,"""victoriousjvictor@gmail.com"" <victoriousjvict...","Inbox,Category Updates,Unread",1681982557987556596
5,Debit Alert: XXXXXX5010,ebusiness@fcmb.com,2020-10-30 11:06:41+00:00,victoriousjvictor@gmail.com,"Inbox,Category Personal,Unread",1681974518294368770
...,...,...,...,...,...,...
30103,Debit Alert: XXXXXX5010,ebusiness@fcmb.com,2020-01-17 18:27:08+00:00,victoriousjvictor@gmail.com,"Inbox,Category Updates,Unread",1656001007669102716
30104,Google Alert - Fashion,googlealerts-noreply@google.com,2020-07-24 07:04:33+00:00,victoriousjvictor@gmail.com,"Inbox,Category Updates,Unread",1673080890461859067
30105,There's A New Kind Of Inequality. And It's Not...,noreply@getpocket.com,2019-12-10 18:24:39+00:00,victoriousjvictor@gmail.com,"Inbox,Category Promotions,Unread",1652558801022797654
30106,Google Alert - New technology,googlealerts-noreply@google.com,2020-10-26 20:00:07+00:00,victoriousjvictor@gmail.com,"Inbox,Category Updates,Unread",1681645799109025785


In [None]:
# refactoring timezones to do proper analysis

In [29]:
import datetime
import pytz   # python module used to change time zones accurately

def refactor_timezone(x):
    gmt_1 = pytz.timezone('Africa/Lagos') # my time zone. Please note, other time zones exist
    return x.astimezone(gmt_1)

In [28]:
dfs['date'] = dfs['date'].apply(lambda x: refactor_timezone(x))  # applying the timezone to the dataframe

In [30]:
dfs

Unnamed: 0,subject,from,date,to,label,thread
1,"=?UTF-8?Q?Joy,_Save_=E2=82=A650,000_Off_Your_S...",noreply@alat.ng,2020-10-31 12:02:45+01:00,victoriousjvictor@gmail.com,"Inbox,Category Updates,Unread",1682064975966880594
2,=?utf-8?Q?19=20Gifts=20for=20Your=20=27Emily=2...,newsletters@fashionista.com,2020-11-03 17:59:46+01:00,<victoriousjvictor@gmail.com>,"Inbox,Category Promotions,Unread",1682359422775130217
3,=?utf-8?Q?=F0=9F=93=98_SLA_Book_Club:_Zikora_b...,content@sheleadsafrica.org,2020-10-31 18:08:31+01:00,victoriousjvictor@gmail.com,"Inbox,Category Updates,Unread",1682087987251912700
4,Flash Points: U.S. election violence is a clea...,noreply@crm.foreignpolicy.com,2020-10-30 14:12:41+01:00,"""victoriousjvictor@gmail.com"" <victoriousjvict...","Inbox,Category Updates,Unread",1681982557987556596
5,Debit Alert: XXXXXX5010,ebusiness@fcmb.com,2020-10-30 12:06:41+01:00,victoriousjvictor@gmail.com,"Inbox,Category Personal,Unread",1681974518294368770
...,...,...,...,...,...,...
30103,Debit Alert: XXXXXX5010,ebusiness@fcmb.com,2020-01-17 19:27:08+01:00,victoriousjvictor@gmail.com,"Inbox,Category Updates,Unread",1656001007669102716
30104,Google Alert - Fashion,googlealerts-noreply@google.com,2020-07-24 08:04:33+01:00,victoriousjvictor@gmail.com,"Inbox,Category Updates,Unread",1673080890461859067
30105,There's A New Kind Of Inequality. And It's Not...,noreply@getpocket.com,2019-12-10 19:24:39+01:00,victoriousjvictor@gmail.com,"Inbox,Category Promotions,Unread",1652558801022797654
30106,Google Alert - New technology,googlealerts-noreply@google.com,2020-10-26 21:00:07+01:00,victoriousjvictor@gmail.com,"Inbox,Category Updates,Unread",1681645799109025785


In [32]:
# dropping "to" feature because all mails were sent to me. duh!!!

dfs.drop("to", axis = 1, inplace = True)

In [33]:
dfs

Unnamed: 0,subject,from,date,label,thread
1,"=?UTF-8?Q?Joy,_Save_=E2=82=A650,000_Off_Your_S...",noreply@alat.ng,2020-10-31 12:02:45+01:00,"Inbox,Category Updates,Unread",1682064975966880594
2,=?utf-8?Q?19=20Gifts=20for=20Your=20=27Emily=2...,newsletters@fashionista.com,2020-11-03 17:59:46+01:00,"Inbox,Category Promotions,Unread",1682359422775130217
3,=?utf-8?Q?=F0=9F=93=98_SLA_Book_Club:_Zikora_b...,content@sheleadsafrica.org,2020-10-31 18:08:31+01:00,"Inbox,Category Updates,Unread",1682087987251912700
4,Flash Points: U.S. election violence is a clea...,noreply@crm.foreignpolicy.com,2020-10-30 14:12:41+01:00,"Inbox,Category Updates,Unread",1681982557987556596
5,Debit Alert: XXXXXX5010,ebusiness@fcmb.com,2020-10-30 12:06:41+01:00,"Inbox,Category Personal,Unread",1681974518294368770
...,...,...,...,...,...
30103,Debit Alert: XXXXXX5010,ebusiness@fcmb.com,2020-01-17 19:27:08+01:00,"Inbox,Category Updates,Unread",1656001007669102716
30104,Google Alert - Fashion,googlealerts-noreply@google.com,2020-07-24 08:04:33+01:00,"Inbox,Category Updates,Unread",1673080890461859067
30105,There's A New Kind Of Inequality. And It's Not...,noreply@getpocket.com,2019-12-10 19:24:39+01:00,"Inbox,Category Promotions,Unread",1652558801022797654
30106,Google Alert - New technology,googlealerts-noreply@google.com,2020-10-26 21:00:07+01:00,"Inbox,Category Updates,Unread",1681645799109025785


In [42]:
# creating a new feature to show if the the mail is from me or not

myemail = 'victoriousjvictor@gmail.com'
dfs['label'] = dfs['from'].apply(lambda x: 'sent' if x==myemail else 'inbox')

dfs.head(50)  # checking to see if it actually works and it did

Unnamed: 0,subject,from,date,label,thread
1,"=?UTF-8?Q?Joy,_Save_=E2=82=A650,000_Off_Your_S...",noreply@alat.ng,2020-10-31 12:02:45+01:00,inbox,1682064975966880594
2,=?utf-8?Q?19=20Gifts=20for=20Your=20=27Emily=2...,newsletters@fashionista.com,2020-11-03 17:59:46+01:00,inbox,1682359422775130217
3,=?utf-8?Q?=F0=9F=93=98_SLA_Book_Club:_Zikora_b...,content@sheleadsafrica.org,2020-10-31 18:08:31+01:00,inbox,1682087987251912700
4,Flash Points: U.S. election violence is a clea...,noreply@crm.foreignpolicy.com,2020-10-30 14:12:41+01:00,inbox,1681982557987556596
5,Debit Alert: XXXXXX5010,ebusiness@fcmb.com,2020-10-30 12:06:41+01:00,inbox,1681974518294368770
6,Debit Alert: XXXXXX5010,ebusiness@fcmb.com,2020-10-30 12:06:32+01:00,inbox,1681974518294368770
7,Debit Alert: XXXXXX5010,ebusiness@fcmb.com,2020-10-30 12:04:50+01:00,inbox,1681974518294368770
8,=?utf-8?Q?The=20Complicated=20Legacy=20of=20=2...,newsletters@fashionista.com,2020-10-30 16:59:52+01:00,inbox,1681993458151248157
9,Credit Alert: XXXXXX5010,ebusiness@fcmb.com,2020-11-03 16:36:07+01:00,inbox,1682353967689599723
10,"Joy, you can still do something for your future!!",mary@astromary.com,2020-10-31 03:25:37+01:00,inbox,1682032441188755972
