In [60]:
import re
import pandas as pd
import numpy as np

In [61]:
def startsWithDateTime(s):
    pattern = r'^([0-9]|[0-2][0-9]|(3)[0-1])(\/)(((0)[0-9])|((1)[0-2]))(\/)(\d{2}|\d{4}), ([0-9][0-9]):([0-9][0-9]) -'
    result = re.match(pattern, s)
    if result:
        return True
    return False

def startsWithAuthor(s):
    patterns = [
        r'([\w]+):',                        # First Name
        r'([\w]+[\s]+[\w]+):',              # First Name + Last Name
        r'([\w]+[\s]+[\w]+[\s]+[\w]+):',    # First Name + Middle Name + Last Name
        r'([+]\d{3} \d{3} \d{3} \d{4})',    # Mobile Number (Nigeria)
        r'([+]\d{2} \d{5} \d{5}):',         # Mobile Number (India)
        r'([+]\d{2} \d{3} \d{3} \d{4}):',   # Mobile Number (US)
        r'([+]\d{2} \d{4} \d{7})'           # Mobile Number (Europe)
    ]
    pattern = '^' + '|'.join(patterns)
    # print(pattern)
    result = re.match(pattern, s)
    if result:
        return True
    return False

def getDataPoint(line):
    # line = 18/06/17, 22:47 - Loki: Why do you have 2 numbers, Banner?
    
    splitLine = line.split(' - ') # splitLine = ['18/06/17, 22:47', 'Loki: Why do you have 2 numbers, Banner?']
    dateTime = splitLine[0] # dateTime = '18/06/17, 22:47'
    date, time = dateTime.split(', ') # date = '18/06/17'; time = '22:47'
    message = ' '.join(splitLine[1:]) # message = 'Loki: Why do you have 2 numbers, Banner?'
    
    if startsWithAuthor(message): # True
        splitMessage = message.split(': ') # splitMessage = ['Loki', 'Why do you have 2 numbers, Banner?']
        author = splitMessage[0] # author = 'Loki'
        message = ' '.join(splitMessage[1:]) # message = 'Why do you have 2 numbers, Banner?'
    else:
        author = None
    return date, time, author, message

def breakUp(s):
    first = s.split(' - ')
    dateTime = first[0]
    authorMsg = first[1]

    date, time = dateTime.split(', ')
    third = authorMsg.split(': ')
    author = third[0]
    msg = third[1]
    
    # print('\nDate: ', date)
    # print('\nTime: ', time)
    # print('\nAuthor: ', author)
    # print('\nMessage: ', msg.encode('utf-8'))

    return date, time, author, msg

In [62]:
parsedData = [] # List to keep track of data so it can be used by a Pandas dataframe

# 1st dataset
conversationPath = './data/WhatsApp Chat with TeamApt.txt' 

with open(conversationPath, encoding="utf-8") as fp:
    fp.readline() # Skipping first line of the file (usually contains information about end-to-end encryption)
        
    messageBuffer = [] # Buffer to capture intermediate output for multi-line messages
    date, time, author = None, None, None # Intermediate variables to keep track of the current message being processed
    
    while True:
        line = fp.readline() 
        if not line: # Stop reading further if end of file has been reached
            break
        line = line.strip() # Guarding against erroneous leading and trailing whitespaces
        if startsWithDateTime(line): # If a line starts with a Date Time pattern, then this indicates the beginning of a new message
            if len(messageBuffer) > 0: # Check if the message buffer contains characters from previous iterations
                parsedData.append([date, time, author, ' '.join(messageBuffer)]) # Save the tokens from the previous message in parsedData
            messageBuffer.clear() # Clear the message buffer so that it can be used for the next message
            date, time, author, message = getDataPoint(line) # Identify and extract tokens from the line
            messageBuffer.append(message) # Append message to buffer
        else:
            messageBuffer.append(line) # If a line doesn't start with a Date Time pattern, then it is part of a multi-line message. So, just append to buffer
            
df = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message'])
print(df.shape)
df.head()

(5070, 4)


Unnamed: 0,Date,Time,Author,Message
0,17/06/2018,13:27,"+234 812 269 0137 created group ""TeamApt""",
1,26/03/2019,09:09,,Ope TeamApt added you
2,26/03/2019,09:21,+234 815 381 7868,"Happy birthday @2348032148369 Long life, prosp..."
3,26/03/2019,09:44,+234 802 119 1069,Happy birthday @2348032148369 . More years!
4,26/03/2019,09:58,+234 812 269 0137 added +234 817 346 7910,


In [63]:
cols = ['Author', 'Message']

for col in cols:
    df.loc[df[col] == None, col] = np.nan
    df.loc[df[col] == '<Media omitted>', col] = np.nan
    df.loc[df[col] == '', col] = np.nan
    
df.isnull().sum()

Date         0
Time         0
Author      86
Message    306
dtype: int64

In [64]:
df = df.dropna()
print(df.shape)
df.head(10)

(4678, 4)


Unnamed: 0,Date,Time,Author,Message
2,26/03/2019,09:21,+234 815 381 7868,"Happy birthday @2348032148369 Long life, prosp..."
3,26/03/2019,09:44,+234 802 119 1069,Happy birthday @2348032148369 . More years!
5,26/03/2019,10:28,+234 810 237 5562,happy birthday @2348032148369 . Long life and ...
6,26/03/2019,10:28,Solomon Amadi,Happy birthday Paschal Ezenwankwo. God's bless...
7,26/03/2019,10:32,+234 816 611 7441,Happy Birthday @2348032148369 ...More Blessings
8,26/03/2019,10:49,+234 905 354 8871,Happy Birthday @2348032148369. Have a good one.
9,26/03/2019,10:50,+234 816 901 3692,Happy Birthday @P.... more blessings
10,26/03/2019,11:35,Paschal TeamApt,Thanks for your wishes ladies and gentlemen
11,26/03/2019,11:40,Chinaza Emenike TeamApt,"Hello Aptians, Gentle reminder that Saturday ..."
12,26/03/2019,11:41,Chinaza Emenike TeamApt,More information to be passed along as the wee...


In [65]:
df.to_csv('./data/cleaned_teamapt_whatsapp_group.csv', index=None)