In [1]:
import pandas as pd
import json
import numpy as np
from datetime import datetime
from datetime import timedelta
from IPython.display import display


<div style="background-color:#FFA726;padding:10px;border-radius:5px;text-align: center;">
    <h2>WhatsApp Chat Data Cleaning</h2>
</div>

This notebook's purpose is to clean WhatsApp message data in preparation for training a language model that can learn to generate replies in my personal style.

To create usable training pairs, the raw sequential chat data is processed into (input, response) format — where the input consists of one or more messages sent to me, and the response is my reply.


### Define the Data Frame data_set
 - This Data Frame will be filled with a 'message' as the independent variable and 'response' as the dependent variable

In [2]:
# Accumulates the final (message, reply) training pairs; later cells append to it.
data_set = pd.DataFrame()

<div style="background-color:#FFA726;padding:10px;border-radius:5px;text-align: center;">
    <h2> Function For Privacy </h2>
</div>

- This chat bot is built from my own personal messages, and I do not want anyone to be able to read them when I show the progress of the Data Frame. Before a Data Frame is displayed, I therefore hide all sensitive information, such as:
    - the messages sent
    - the name of the contact I'm messaging

In [3]:
def show_df(df):
    """
    Display the first 10 rows of a Data Frame with all sensitive information
    replaced by generic values.

    The input frame is never modified: a copy of its head is masked and passed
    to ``display``. Only the columns that are actually present are masked, so
    the function works at every stage of the cleaning pipeline:

    - ``sender_of_replied_to_message``: any name other than "You" becomes "Friend"
    - ``message_replied_to``: becomes "Random message replied to <n>"
    - ``message_info``: the text up to and including "]" is kept; the sender
      becomes "Nic" (for "Nic" / "You") or "Friend" (anyone else)
    - ``message``: becomes "Message <n>"

    Missing values (None / NaN) are left as they are.

    Args:
    df (Pandas DataFrame) :The data frame that I want to print
    """

    sender_col = "sender_of_replied_to_message"
    replied_col = "message_replied_to"
    info_col = "message_info"
    message_col = "message"
    own_names = ("Nic", "You")

    reply_message_count = 0
    message_count = 0

    output = df.head(10).reset_index(drop=True).copy()
    present = set(output.columns)

    for i in range(len(output)):
        if sender_col in present:
            sender = output.at[i, sender_col]
            if pd.notna(sender) and sender != "You":
                output.at[i, sender_col] = "Friend"

        if replied_col in present and pd.notna(output.at[i, replied_col]):
            output.at[i, replied_col] = f"Random message replied to {reply_message_count}"
            reply_message_count += 1

        if info_col in present and pd.notna(output.at[i, info_col]):
            info = str(output.at[i, info_col])

            # The sender sits between "] " and the first ":" that follows it
            begining_index = info.find(']') + 2
            end_index = info.find(':', begining_index)
            name_of_sender = info[begining_index:end_index]

            shown_name = "Nic" if name_of_sender in own_names else "Friend"
            output.at[i, info_col] = f"{info[0:begining_index - 1]} {shown_name}:"

        if message_col in present and pd.notna(output.at[i, message_col]):
            output.at[i, message_col] = f"Message {message_count}"
            message_count += 1

    display(output)

<div style="background-color:#FFA726;padding:10px;border-radius:5px;text-align: center;">
    <h2>Reading In The Data </h2>
</div>

- I want to make each chat its own Data Frame so that I can remove chats that don't meet the criteria for being used as observations to fine-tune the language model
- I thus don't want to join all the different chats' messages into one Data Frame just yet


In [4]:
# Load the scraped export: one JSON entry per chat
with open("whatsapp_messages.json", "r", encoding="utf-8") as export_file:
    raw_data = json.load(export_file)

In [5]:
# One Data Frame per chat, in the same order as the export
dfs = [pd.DataFrame(chat) for chat in raw_data]

In [6]:
# Number of chats read from the export
print(len(dfs))

239


<div style="background-color:#FFA726;padding:10px;border-radius:5px;text-align: center;">
    <h2>Cleaning The Data</h2>
</div>

In [7]:
# Inspect the raw structure of one chat: columns are still the scraper's positional labels 0-7
dfs[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       1 non-null      object
 1   1       1 non-null      object
 2   2       48 non-null     bool  
 3   3       46 non-null     object
 4   4       45 non-null     object
 5   5       48 non-null     bool  
 6   6       48 non-null     bool  
 7   7       48 non-null     bool  
dtypes: bool(4), object(4)
memory usage: 1.8+ KB


## Removing Chats with Only One Message

- Some chats contain only a single message.
- This occurs when the last activity in the chat was over a year ago:
    - WhatsApp Web only retains messages sent within the past year.
    - If a chat has been inactive for more than a year, WhatsApp Web will display only the final message sent in that conversation.
- These isolated messages cannot be used as input for training the language model, and are therefore excluded from the dataset.


In [8]:
# Keep only the chats that contain more than one message
non_empty_dfs = [chat_df for chat_df in dfs if len(chat_df) > 1]

In [9]:
# Compare the chat count before and after removing single-message chats
print(f"Number of chats originally: {len(dfs)} \nNumber of chats after removing chats now: {len(non_empty_dfs)}")

Number of chats originally: 239 
Number of chats after removing chats now: 117


In [10]:
# From here on `dfs` refers to the filtered list (same list object as non_empty_dfs)
dfs = non_empty_dfs

## Removing 'Empty' Information

- The scraper was not designed to detect or save voice notes or files.
- As a result, when such messages are encountered, no information is recorded about them.
- These entries are effectively empty and are removed from the dataset.


In [11]:
# Positional column labels as delivered by the scraper. Text-like fields count
# as empty when they are null, the boolean media flags when they are False.
text_columns = [0, 1, 3, 4]
flag_columns = [2, 5, 6, 7]

for chat_index, chat_df in enumerate(dfs):
    # A row is 'empty' when the scraper recorded nothing at all for it
    empty = (
        chat_df[text_columns].isna().all(axis=1) &
        chat_df[flag_columns].eq(False).all(axis=1)
    )

    empty_index = chat_df[empty].index

    # drop() returns a new frame, so store it back in the list
    # (the original index labels of the surviving rows are kept)
    dfs[chat_index] = chat_df.drop(index=empty_index)

In [12]:
# Replace the scraper's positional column labels (0-7) with descriptive names
col_names = [
    "sender_of_replied_to_message",
    "message_replied_to",
    "media_in_message_replied_to",
    "message_info",
    "message",
    "image_in_message",
    "sticker_in_message",
    "video_in_message",
]

for chat_df in dfs:
    chat_df.columns = col_names

In [13]:
# Preview one chat with the new column names (private content is masked by show_df)
show_df(dfs[1])

Unnamed: 0,sender_of_replied_to_message,message_replied_to,media_in_message_replied_to,message_info,message,image_in_message,sticker_in_message,video_in_message
0,,,False,"[12:01, 4/25/2024] Nic:",Message 0,False,False,False
1,,,False,"[12:15, 4/25/2024] Friend:",Message 1,False,False,False
2,,,False,"[12:15, 4/25/2024] Friend:",Message 2,False,False,False
3,,,False,"[20:20, 4/27/2024] Nic:",Message 3,False,False,False
4,,,False,"[11:59, 5/10/2024] Friend:",Message 4,False,False,False
5,,,False,"[11:59, 5/10/2024] Friend:",Message 5,False,False,False
6,,,False,"[14:02, 5/13/2024] Friend:",Message 6,False,False,False
7,,,False,"[14:02, 5/13/2024] Friend:",Message 7,False,False,False
8,,,False,"[14:02, 5/13/2024] Friend:",Message 8,False,False,False
9,,,False,"[09:20, None] Nic:",,True,False,False


## Getting A List Of All The Contact Names

In [14]:
def _sender_name(message_info):
    """
    Extract the sender from a '[time, date] Sender:' header.

    Trailing colons are removed however many there are (some headers were
    collected with two). Returns None when the value is not a string, has no
    ']' or has no sender after it.
    """
    if not isinstance(message_info, str):
        return None

    _, bracket, tail = message_info.partition(']')
    if not bracket:
        return None

    return tail.strip().rstrip(':').strip() or None


# Names under which my own messages appear in the headers
own_names = ('Nic', 'You')

# One entry per chat, aligned with `dfs`: the first sender that is not me,
# or None when nobody else ever sent a message in that chat
contact_names = []

for chat_df in dfs:
    senders = (_sender_name(info) for info in chat_df['message_info'])
    contact_names.append(
        next((s for s in senders if s is not None and s not in own_names), None)
    )

## Removing Unresponsive Contacts

- Some contacts could not be identified in the dataset.
- In each of these cases, the contact never replied to any of my messages.
- This typically occurred in one-sided interactions, such as when I reached out (e.g., to a fellow student) and received no response.
- Since these chats contain no reply and therefore provide no usable input for modeling a response pattern, they are excluded from the dataset.


In [15]:
# Positions (within contact_names / dfs) of the chats for which no contact was found
index_of_contactless_chats = [
    position
    for position, contact in enumerate(contact_names)
    if contact is None
]
        

In [16]:
# Positions of the chats without an identifiable contact
index_of_contactless_chats

[7, 60, 63, 100]

In [17]:
# Look at one of the contactless chats (63 is one of the positions reported above for this export)
show_df(dfs[63])

Unnamed: 0,sender_of_replied_to_message,message_replied_to,media_in_message_replied_to,message_info,message,image_in_message,sticker_in_message,video_in_message
0,,,False,"[12:07, None] Nic:",,False,False,False
1,,,False,"[12:07, None] Nic:",,False,False,False
2,,,False,,,True,False,False


### Removing these chats' data frames from the list of data frames

In [18]:
# Drop the chats whose position is listed in index_of_contactless_chats
dfs = [
    chat_df
    for position, chat_df in enumerate(dfs)
    if position not in index_of_contactless_chats
]

In [19]:
# Remove the same positions from contact_names so it stays aligned with dfs
contact_names = [
    contact
    for position, contact in enumerate(contact_names)
    if position not in index_of_contactless_chats
]

## Removing Non-Text Message Information

- This project focuses solely on text-based communication.
- Therefore, columns that indicate whether a message included media (e.g., images or files) are removed.
- Additionally, any rows that exclusively contain media content and no textual data are also excluded from the dataset.


In [20]:
text_columns = ['sender_of_replied_to_message', 'message_replied_to', 'message_info', 'message']

for position, chat_df in enumerate(dfs):
    # Keep the text-related columns only, then discard every row without message text
    dfs[position] = chat_df[text_columns].dropna(subset=['message'])

In [21]:
# Exploratory: split one message header on ',' to see its parts.
# NOTE(review): splt_info does not appear to be used by any later cell shown here.
splt_info = dfs[1].iloc[0]['message_info'].split(',')

### Splitting the Message DataFrame into Two Groups

To prepare the dataset for training, the messages are divided into two distinct groups based on conversational structure:

1. **Group 1**: Messages that I sent as direct replies to specific messages received from someone else.  
   - These pairs can be used to train the model on clear, one-to-one response behavior — making them ideal for learning how I typically respond to specific prompts or questions.
   
   

2. **Group 2**: Sequences of messages that are not direct replies, but are part of a broader conversation thread.  
   - These sequences help the model learn the flow of conversation over time, including how I communicate across multiple turns when there's no single message being responded to.

Separating the messages in this way allows for more nuanced training and evaluation, as the model can be exposed to both structured reply behavior and natural multi-turn conversation dynamics.

In [22]:
def _sent_by_me(message_info):
    """Return True when a '[time, date] Sender:' header names me ('Nic' / 'You') as the sender."""
    if not isinstance(message_info, str):
        return False

    sender = message_info.partition(']')[2].strip().rstrip(':').strip()
    return sender in ('Nic', 'You')


direct_reply_frames = []

for i, chat_df in enumerate(dfs):
    contact = contact_names[i]

    # Every message that quotes (replies to) another message
    is_reply = chat_df['message_replied_to'].notna()

    # A usable pair needs both: the quoted message came from the contact AND
    # the reply itself was sent by me. Without the second check a contact
    # quoting their own message would end up as one of "my" replies.
    quotes_contact = chat_df['sender_of_replied_to_message'] == contact
    sent_by_me = chat_df['message_info'].map(_sent_by_me).astype(bool)

    direct_reply_frames.append(chat_df[is_reply & quotes_contact & sent_by_me])

    # All quoting messages leave the chat (used or not); what remains is
    # processed later as sequential conversation and needs only two columns
    dfs[i] = chat_df.loc[~is_reply, ['message_info', 'message']]

# Add the direct responses to the data_set in a single concat
data_set = pd.concat([data_set, *direct_reply_frames], ignore_index=True)

### Reducing the Dataset to Essential Columns

- For the purposes of training the language model, only the **message** and its corresponding **reply** are needed.
- All other metadata (such as sender identity, timestamps, and media indicators) is not relevant to the model’s objective and is therefore removed.
- The final dataset is structured with two columns:  
  - **`Message`**: the input (a message sent to me)  
  - **`Reply`**: the response (my direct reply to that message)

- These message–reply pairs are primarily sourced from the group of messages that are direct responses to a message I received.
- This ensures that each pair in the dataset reflects a real conversational exchange where the model can learn how I typically respond to specific inputs.


In [23]:
# Final column names: the quoted message becomes 'Message', my answer becomes 'Reply'
cols = ['Message', 'Reply']
metadata_columns = ['sender_of_replied_to_message', 'message_info']

data_set = data_set.drop(columns=metadata_columns)
data_set.columns = cols

## Processing the Remaining Messages

- The remaining messages that are not direct replies may still be valuable if they are part of a coherent message sequence.
- Often, a single idea or response is split across multiple consecutive messages sent within a short time frame.
- To include these in the dataset, I group together sequential messages based on their timestamps:
    - If two or more messages are sent close together in time, they are combined into a single input or reply.
- This allows these message sequences to be used as valid **message–reply pairs** for training the model, even if they weren’t directly linked by a reply indicator.


In [24]:
def _parse_message_info(message_info):
    """
    Split a '[HH:MM, M/D/YYYY] Sender:' header into its parts.

    Returns a dict with the keys 'Them' (0 when the sender is me, 1 otherwise),
    'Time', 'day', 'month' and 'year' (all strings except 'Them'), or None when
    the header does not have that shape (for example a missing date).
    """
    if not isinstance(message_info, str) or ']' not in message_info:
        return None

    stamp, _, sender = message_info.partition(']')
    time_part, _, date_part = stamp.partition(',')

    date_fields = date_part.strip().split('/')
    if len(date_fields) != 3:
        return None
    month, day, year = date_fields

    # Headers were collected with one or two trailing colons, so compare the
    # bare name, the same way the contact names are extracted
    sender = sender.strip().rstrip(':').strip()

    return {
        'Them': 0 if sender in ('Nic', 'You') else 1,
        'Time': time_part[1:],  # drop the leading '['
        'day': day,
        'month': month,
        'year': year,
    }


header_columns = ['Them', 'Time', 'day', 'month', 'year']

for i, chat_df in enumerate(dfs):
    parsed = [_parse_message_info(info) for info in chat_df['message_info']]

    # Rows whose header cannot be parsed have no usable timestamp and are left out
    usable = pd.Series([fields is not None for fields in parsed], index=chat_df.index, dtype=bool)

    header_fields = pd.DataFrame(
        [fields for fields in parsed if fields is not None],
        columns=header_columns,
    )

    # Concatenate the message text with its parsed header, row for row
    dfs[i] = pd.concat(
        [chat_df.loc[usable, 'message'].reset_index(drop=True), header_fields],
        axis=1,
    )

In [25]:
for j, chat_df in enumerate(dfs):
    # Turn the separate year / month / day / time strings of every message into
    # one datetime, which allows the time comparisons made when pairing messages
    timestamps = [
        datetime.strptime(f"{year}:{month}:{day}:{clock}", '%Y:%m:%d:%H:%M')
        for year, month, day, clock in zip(
            chat_df['year'], chat_df['month'], chat_df['day'], chat_df['Time']
        )
    ]

    # Replace the whole 'Time' column at once (no row-by-row writes into a
    # column of strings) and drop the now redundant date parts
    dfs[j] = (
        chat_df
        .drop(columns=['day', 'month', 'year'])
        .assign(Time=timestamps)
    )
    
    

## Joining Sequential Messages To Finish Building The Final Dataset

To preserve as much meaningful conversational data as possible, the following rules are implemented to combine messages into valid **message–reply pairs**:

1. The sequence begins with the **first message sent by the other person**.
    - If they send multiple messages within a **30-minute window**, these are grouped together as a single input, with each message on its own line.

2. If I reply **within 100 hours** of the first message, my response is recorded as the corresponding **reply**.

3. Any additional messages I send **within 30 minutes of my first reply** are also considered part of the reply and are joined together, one message per line.
    - If I send another message **after** this 30-minute window, it is assumed to be the start of a **new conversation** and is not included in the previous reply.

> **Note**: If no reply is sent within 100 hours, the original message (and its group) is discarded, as it cannot be paired meaningfully with a response.


In [26]:
# Consecutive messages from the same side are merged when they fall within
# this long of the first message of the group
BURST_WINDOW = timedelta(minutes=30)

# My first reply must arrive within this long of their first message.
# NOTE(review): confirm this is the intended reply window.
REPLY_WINDOW = timedelta(hours=100)


def _pair_messages(senders, times, texts):
    """
    Turn one chat into (message, reply) pairs.

    Args:
    senders (list) :1 for a message from the contact, 0 for one of mine
    times (list)   :send time of every message (values that can be subtracted)
    texts (list)   :text of every message

    Returns:
    list of (message, reply) tuples; the messages inside each side are joined
    with a newline. A group from the contact without a reply of mine inside
    REPLY_WINDOW produces no pair.
    """
    pairs = []
    total = len(texts)
    i = 0

    while i < total:
        # A pair can only start with a message from Them
        if senders[i] != 1:
            i += 1
            continue

        first_input_time = times[i]
        input_msg = [texts[i]]
        i += 1

        # Gather any additional messages from Them within the burst window
        while i < total and senders[i] == 1 and times[i] - first_input_time <= BURST_WINDOW:
            input_msg.append(texts[i])
            i += 1

        # Look for my first reply. A further message from Them ends the search;
        # the outer loop then starts a new pair from that message.
        while i < total and senders[i] == 0:
            if times[i] - first_input_time > REPLY_WINDOW:
                i += 1  # my message came too late to count as the reply
                continue

            first_output_time = times[i]
            output_msg = [texts[i]]
            i += 1

            # Gather any further replies within the burst window of the first reply
            while i < total and senders[i] == 0 and times[i] - first_output_time <= BURST_WINDOW:
                output_msg.append(texts[i])
                i += 1

            pairs.append(("\n".join(input_msg), "\n".join(output_msg)))
            break  # done with this pair

    return pairs


all_pairs = []

for chat_df in dfs:
    all_pairs.extend(
        _pair_messages(
            chat_df['Them'].tolist(),
            chat_df['Time'].tolist(),
            chat_df['message'].tolist(),
        )
    )

# Build the final frame once, instead of growing it chat by chat
joined_df = pd.DataFrame(all_pairs, columns=['Message', 'Reply'])
    

### Example of a Message and Response Obtained Using the Above Rules

In [27]:
# Show one example pair. The row position is hard-coded, so it refers to a
# different pair (or may not exist) whenever the data or the rules above change.
# NOTE: unlike show_df, this displays the real message text.
display(joined_df.iloc[1807])

Message    Added you. Please let me know if you got it. 
Reply                         I did get it thank you !!!
Name: 1807, dtype: object

## Merging Sequentially Joined Pairs with Direct Reply Pairs

Now that we have constructed message–reply pairs from sequential message groups, we can combine them with the direct reply pairs created earlier.

By merging both types:
- We ensure the dataset captures a wide range of conversational patterns — from clearly defined, one-to-one replies to more natural, multi-turn exchanges.
- This enriches the training data and allows the language model to generalize better across different messaging contexts.


In [28]:
# Append the sequential pairs to the direct-reply pairs (both use the columns 'Message' and 'Reply').
# Running this cell more than once appends joined_df again.
data_set = pd.concat([data_set, joined_df], ignore_index=True)

<div style="background-color:#FFA726;padding:10px;border-radius:5px;text-align: center;">
    <h2>Exporting The Final Data Set</h2>
</div>

In [29]:
# Write the final message-reply pairs to disk, without the row index
data_set.to_csv("message_reply_pairs.csv", index=False)