# Study 3: WhatsApp Data Donation Preparation

In [103]:
#Packages

%pip install pyspellchecker

#### Steps

1. The Donate Files are imported from PORT [Jump to WhatsApp Data Section](#1-loading-the-donated-whatsapp-data)
2. The Files are preprocessed, for example, removing PORT artifacts
4. The Dutch chat messages are exported and translated via Deepl. The English translations are imported again
5. The English chats are spellchecked and matched with the original Chat DF

## 1. Loading the Donated WhatsApp data

In [3]:
import os

os.chdir('C:/Users/77197jsc/OneDrive - Erasmus University Rotterdam/Documents/Study 3/Data/whatsdata_january_2024')

import os
import zipfile
import re
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk

pd.set_option('display.max_rows', 10)

def is_seven_digit_number(string):
    pattern = r'^\d{7}$'
    match = re.match(pattern, string)
    return match is not None

def process_zip(zip_path,donation):
    print(f"ZIP: {zip_path}")
    
    pcount = 0
    totalInstaCount = 0  
    totalWhatsCount = 0
    
    data_list = []
    whatsapp_summary = []
    whatsapp_details = []
    

    # For the regex of the text files 
    datetimes = []
    sources = []
    messages = []
    data = {}
    
    with zipfile.ZipFile(zip_path, 'r') as zip_file:
        # Get all folder names in the ZIP file
        folder_names = set()
        for item in zip_file.infolist():
            if item.is_dir():
                folder_name_temp = item.filename.strip('/').split('/')[-1]
                if is_seven_digit_number(folder_name_temp):
                    pcount+=1
                    folder_names.add(item.filename.strip('/'))
  
        # Process each folder and its files
        for folder_name in folder_names:
#             print(f"Folder: {folder_name}")
            countTracking = 0
            instaPerParticipant = 0
            whatsPerParticipant = 0
            for item in zip_file.infolist():
                if item.filename.startswith(folder_name + '/'):
                    if not item.is_dir():
                        file_name = os.path.basename(item.filename)
#                         modified_time = os.path.getmtime(item.filename)
#                         print(modified_time)
                        # tracking is both for instagram and whatsapp
                        if('tracking' in file_name):
                            countTracking+=1
    
                        elif (donation=='i' and 'Instagram' in file_name):
                            instaPerParticipant+=1
#                             if file_name.endswith('.json'):
#                                 print(f"   File: {file_name}")
#                                 with zip_file.open(item) as json_file:
#                                     json_data = json.load(json_file)
#                                     # Process JSON data here
#                                     print(json_data)

                        elif (donation=='w' and 'Whatsapp' in file_name):  
                            whatsPerParticipant+=1
                            # processing the whatsapp files
                            if file_name.endswith('.json'):
                                with zip_file.open(item) as json_file:
                                    json_data = json.load(json_file)
                                    # the json_data is a list of data. The list always ends with an element about user_omissions. 
                                    # The other elements are the messages data. One element contains at most 5000 entries. So in case of donating more than 
                                    # 5000 rows of whatsapp data, new element is created that stores the next 5000 entries, etc..
                                    # As a result a more complex json data handling is necessary.
                                    num_parts = len(json_data)
                                    counter = num_parts
                                    dif = num_parts-counter
                                    total_len = 0
                                    while(counter > 0):
                                        
                                        dif = num_parts-counter
                                        # keep collecting the textual data until you reach the counter 0   
                                        if(counter>1):
                                            total_len = total_len + len(json_data[num_parts-counter]['zip_content: '+str(dif)])
                                            for message in json_data[num_parts-counter]['zip_content: '+str(dif)]:       
#                                                 result = re.search(r'\[(.*?)\]', message['chat_message']).group(1)

#                                                 if result:
#                                                     extracted_string = result
#                                                     print(extracted_string)
                                                result = re.search(r'\[(.*?)\]', message['chat_message'])                 
                                                whatsapp_details.append([folder_name.split('/')[-1],file_name,total_len,result.group(1),message['chat_message'][result.end() + 1:]])
                                        else:
                                            #when counter is 0, we have collected all the possible data, the user omissions are then also added as json_data[dif]. 
                                            # in this state, dif is equal to num_parts so its the last element of the json array (always user omissions).
                                            whatsapp_summary.append([folder_name.split('/')[-1],file_name,total_len,json_data[dif]['user_omissions']])
                                        counter = counter - 1
                                
                            elif (file_name.endswith('.txt')):
#                                 print('found a text file')

                                try:
                                    with zip_file.open(item) as text_file:
                                        # Read and print the content line by line
                                        for line_bytes in text_file:
                                            # Convert bytes to string and strip newline
                                            line = line_bytes.decode('utf-8').strip()

                                            reg = r'^\s*\[(.*?)\]\s*([^:]+):\s*(.*)'
#                                             r'\[(\d{2}/\d{2}/\d{4}, \d{2}:\d{2}:\d{2})\] (\w+): (.+)'
                                            match = re.match(reg, line)
                                            if match:
                                                datetime, name, message = match.groups()
                                                datetimes.append(datetime)
                                                sources.append(name)
                                                messages.append(message)
#                                                 print(f"DateTime: {datetime}, Name: {name}, Message: {message}")
                                            else:
                                                print(f"Line does not match the expected format: {line}")

#                                             print(line + '\n')
                                except FileNotFoundError:
                                    print(f"File '{zip_filename}' not found.")
                                except PermissionError:
                                    print(f"Permission denied to open file '{zip_filename}'.")
                                except Exception as e:
                                    print(f"An error occurred: {str(e)}")
                                # Create a DataFrame from the extracted components
                                data = {
                                    'datetime': datetimes,
                                    'source': sources,
                                    'message': messages
                                }
                                

                                    
            # participant folder loop end, get the summaries 
            if (instaPerParticipant!=0):
                totalInstaCount+=1

            if (whatsPerParticipant!=0):
                totalWhatsCount+=1
                            
            if(donation=='w'):
#                 print('   Total Whatsapp files:', whatsPerParticipant)
                data_list.append([folder_name.split('/')[-1],whatsPerParticipant,countTracking])
            elif(donation=='i'):
#                 print('   Total Instagram files:', instaPerParticipant)
                data_list.append([folder_name.split('/')[-1],instaPerParticipant,countTracking])

#             print('   Total tracking files:', countTracking)

          
            

         # Process individual files here
        print('Total Participants:', pcount)
        if(donation=='i'):
            print('Participants with Insta files:',totalInstaCount,'\n')
            df_data = pd.DataFrame(data_list, columns=['Movez_code', 'InstaFiles', 'LogFiles'])

        elif(donation=='w'):
            print('Participants with Whats files:',totalWhatsCount,'\n')
            df_data = pd.DataFrame(data_list, columns=['Movez_code', 'WhatsFiles', 'LogFiles'])
            df_whatsapp_summary = pd.DataFrame(whatsapp_summary, columns=['Movez_code', 'FileName', 'Number of messages','Omissions'])
            df_whatsapp_details = pd.DataFrame(whatsapp_details, columns=['Movez_code', 'FileName', 'Number of messages','Date','Message'])
        
        #create the dataframe of the txt files - will be empty if no txt is available 
        df_txt_whatsapp = pd.DataFrame(data)
        
        return df_data,df_whatsapp_summary,df_whatsapp_details,df_txt_whatsapp


In [8]:
df_whats,df_whatsapp_summary,df_whatsapp_details,df_txt_whatsapp = process_zip('C:/Users/77197jsc/OneDrive - Erasmus University Rotterdam/Documents/Study 3/Data/whatsdata_october_2024.zip','w')

ZIP: C:/Users/77197jsc/OneDrive - Erasmus University Rotterdam/Documents/Study 3/Data/whatsdata_october_2024.zip
Total Participants: 285
Participants with Whats files: 201 



In [10]:
#Dropping duplicate messages due to multiple upload
df_whatsapp_details = df_whatsapp_details.drop_duplicates("Message")

In [11]:
#Aggregating all messages per user
results_df = df_whatsapp_details.groupby('Movez_code').agg({'Message': lambda x: '; '.join(x),
                                          'Number of messages': 'first'}).reset_index()

In [1]:
# Getting some summary statistics on the chats

results_df["No_char"] = results_df['Message'].apply(len)
results_df["No_words"] = results_df['Message'].apply(lambda x: len(x.split()))

print("The average number of words per chat is", round(results_df["No_words"].mean(), 2), '.')

NameError: name 'results_df' is not defined

## 2. Preprocessing of the Chat files

In [14]:
# Long words above 25 are not likely actual words but rather artifacts like "hahahahahha..."

def filter_long_words(message):
    # Split the string into words
    words = message.split()
    
    # Filter out words longer than 25 characters
    filtered_words = [word for word in words if len(word) <= 25]
    
    # Join the filtered words back into a single string
    return ' '.join(filtered_words)

In [15]:
# We remover all PORT artifacts in the text

import re

# List of artifacts to remove
artifacts = [
    '<Media weggelaten>', 'media weggelaten', '<Media omitted>', 'media omitted',
    'audio omitted', 'video omitted', 'image omitted', 'audio weggelaten',
    'video weggelaten', 'afbeelding weggelaten', '<NAAM>', '<URL>',
    'Visitekaartje weggelaten', 'sticker weggelaten', 'document weggelaten',
    'GIF weggelaten', 't0ehurjkdfncvm00tuhfjcnv'
]

# Replace all artifacts
pattern = '|'.join(re.escape(a) for a in artifacts)
results_df['Message'] = results_df['Message'].str.replace(pattern, '', regex=True)

# Remove quotes (including curly or backtick types)
results_df['Message'] = results_df['Message'].str.replace(r"['\"’‘`]", '', regex=True)

# Apply your custom word filter
results_df['Message'] = results_df['Message'].apply(filter_long_words)


In [17]:
print(
    "The mean number of donated messages per person is:",
    round(results_df["Number of messages"].mean(), 2),
    "with SD =",
    round(results_df["Number of messages"].std(), 2)
)

print(
    "The mean number of donated words per person is:",
    round(results_df["No_words"].mean(), 2),
    "with SD =",
    round(results_df["No_words"].std(), 2)
)

The mean number of donated messages per person is: 695.51 with SD = 1079.66
The mean number of donated words per person is: 7967.94 with SD = 23307.14


In [22]:
# Exporting Dutch Original chats

# Set your export folder path
export_folder = "C:/Users/77197jsc/OneDrive - Erasmus University Rotterdam/Documents/Study 3/Data/whatsdata_january_2024/Original Chats/"  # change this to your actual path
os.makedirs(export_folder, exist_ok=True)  # create folder if it doesn't exist

# Iterate over each row in the DataFrame
for i, message in enumerate(results_df["Message"]):
    file_path = os.path.join(export_folder, f"m{i}.txt")
    
    # Write to file, ensure it's string and strip trailing quotes
    with open(file_path, "w", encoding="utf-8") as f:
        cleaned = str(message).strip().rstrip('"').rstrip("'")
        f.write(cleaned)

## 3. Lets import the English translated files from Deepl

In [89]:
import os

folder_path = "C:/Users/77197jsc/OneDrive - Erasmus University Rotterdam/Documents/Study 3/Data/whatsdata_january_2024/Translated Chats/"  # replace with your actual folder path
messages_dict = {}

for i in range(197):
    filename = f"m{i}.txt"
    full_path = os.path.join(folder_path, filename)
    
    with open(full_path, encoding="utf-8") as f:
        lines = [line.strip().rstrip('"').rstrip("'") for line in f if line.strip()]
        messages_dict[f"m{i}"] = lines


In [90]:
#Creating an empty column for the English message translations
results_df["Message_en"] = results_df.apply(lambda _: '', axis=1)

In [91]:
# Sort the dictionary keys to ensure correct order (e.g., m0, m1, ..., m195)
ordered_messages = [messages_dict[f"m{i}"] for i in range(len(messages_dict))]

# If values are lists of lines, join them into a single string per message
ordered_messages = ['\n'.join(msg) if isinstance(msg, list) else msg for msg in ordered_messages]

# Assign to the DataFrame column
results_df["Message_en"] = ordered_messages

In [104]:
#Check if the merging was correct
pd.set_option('display.max_rows', None)
results_df.head(10)

Unnamed: 0,Movez_code,Message,Number of messages,No_char,No_words,Message_en,Message_checked
0,1016110,Maar voel weer goed dus kan miegen weer pesten...,26,2105,387,But feel good again so can bully miegen again;...,But feel good again so can bully miegen again;...
1,1049218,Hahahaha isg; Lekka; Je ben al 00 min aan het ...,551,4667,824,Hahahaha isg; Lekka; You've been pooping for 0...,Hahahaha isg; Lekka; You've been pooping for 0...
2,1054448,welke kleur heb jij van de jumbo highliter?; m...,591,145926,12281,what colour do you have from the jumbo highlit...,what colour do you have from the jumbo highlit...
3,1065025,Ja leuk bij wie; Ik kan vanaf 00 uur; Kan ook ...,7,721,134,Yeah nice with who; I can from 00:; Can do lat...,Yeah nice with who; I can from 00:; Can do lat...
4,1065863,Wanneer ben jij op school?; Ik ben om 0.00 ong...,729,63631,1758,When will you be at school; I'll be there at 0...,When will you be at school; I'll be there at 0...
5,1068790,Ich mag niet meer; Sorry maar ik mag niet mee;...,68,4302,823,Ich may no longer; Sorry but I'm not allowed; ...,Ich may no longer; Sorry but I'm not allowed; ...
6,1069478,Misschien als het mag van jou ouders kunnen we...,187,19973,3604,"Maybe if your parents allow it, we can stay ov...","Maybe if your parents allow it, we can stay ov..."
7,1169428,Wil je samen naar fietsen; Dan om 00:00 bij mi...,131,10763,1986,Do you want to go to cycling together; Then at...,Do you want to go to cycling together; Then at...
8,1264477,knorrr; Ik hoop over een half uurtje maar het ...,1005,51162,9236,knorrr; I hope in half an hour but it could be...,knorrr; I hope in half an hour but it could be...
9,1317241,Erm ; Gasps; NO PHOTOSHOP; see how Im talking ...,1180,71494,12909,Erm ; Gasps; NO PHOTOSHOP; see how Im talking ...,Erm ; Gasps; NO PHOTOSHOP; see how Im talking ...


## 4. Spellcheck the Donated Chats

In [105]:
# Lets run a spellchecker on our donated chats
from spellchecker import SpellChecker

# Initialize SpellChecker with British English
spell = SpellChecker(language='en')

# Define a function to correct spelling
def spell_check(text):
    words = text.split()
    corrected_words = [spell.correction(word) if word in spell else word for word in words]
    return " ".join(corrected_words)


In [97]:
results_df['Message_checked'] = results_df['Message_en'].apply(spell_check)

In [106]:
# lets check how many very short donation we have and filter them out

print("These are all donations:", len(results_df))
results_df2 = results_df[results_df['Message_en'].str.split().str.len() >= 100]
print("These are all donations over 100 words:",len(results_df2))


These are all donations: 197
These are all donations over 100 words: 184


In [107]:
#look into the data
results_df2.head()

Unnamed: 0,Movez_code,Message,Number of messages,No_char,No_words,Message_en,Message_checked
0,1016110,Maar voel weer goed dus kan miegen weer pesten...,26,2105,387,But feel good again so can bully miegen again;...,But feel good again so can bully miegen again;...
1,1049218,Hahahaha isg; Lekka; Je ben al 00 min aan het ...,551,4667,824,Hahahaha isg; Lekka; You've been pooping for 0...,Hahahaha isg; Lekka; You've been pooping for 0...
2,1054448,welke kleur heb jij van de jumbo highliter?; m...,591,145926,12281,what colour do you have from the jumbo highlit...,what colour do you have from the jumbo highlit...
3,1065025,Ja leuk bij wie; Ik kan vanaf 00 uur; Kan ook ...,7,721,134,Yeah nice with who; I can from 00:; Can do lat...,Yeah nice with who; I can from 00:; Can do lat...
4,1065863,Wanneer ben jij op school?; Ik ben om 0.00 ong...,729,63631,1758,When will you be at school; I'll be there at 0...,When will you be at school; I'll be there at 0...


In [108]:
#Saving the long and the shorter DF to two files
results_df.to_csv('messages_final_long.csv', encoding='utf-8', index=False)
results_df2.to_csv('messages_final_short.csv', encoding='utf-8', index=False)