In [3]:
import os

data_path = "data"
files = os.listdir(data_path)

print("Chat files found:")
for f in files:
    print(f)

Chat files found:
chat1.txt
chat10.txt
chat11.txt
chat2.txt
chat3.txt
chat4.txt
chat5.txt
chat6.txt
chat7.txt
chat8.txt
chat9.txt


In [4]:
# Combine all chat files

import os

data_path = "data"

all_chats = ""

for file in os.listdir(data_path):
    if file.endswith(".txt"):
        with open(os.path.join(data_path, file), "r", encoding="utf-8") as f:
            all_chats += f.read() + "\n"

print("Total characters loaded:", len(all_chats))

Total characters loaded: 225585


In [5]:
# parse chat text into structured data

import re
import pandas as pd

# WhatsApp message pattern 
pattern = r'(\d{1,2}/\d{1,2}/\d{2,4}), (\d{1,2}:\d{2}\s?[apAPmM]*) - (.*?): (.*)'

matches = re.findall(pattern, all_chats)

df = pd.DataFrame(matches, columns=["date", "time", "user", "message"])

print("Total messages:", df.shape[0])
df.head()

Total messages: 1453


Unnamed: 0,date,time,user,message
0,1/16/26,7:03 PM,Pushpa Mam,<Media omitted>
1,1/16/26,7:03 PM,Pushpa Mam,<Media omitted>
2,1/16/26,7:06 PM,+91 79784 20932,This message was deleted
3,1/17/26,7:35 PM,Pushpa Mam,
4,1/30/26,10:54 AM,+91 79784 20932,<Media omitted>


In [7]:
df.shape

(1453, 4)

In [8]:
# clean the dataset

# remove media and deleted messages
df = df[~df["message"].str.contains("<Media omitted>", na=False)]
df = df[~df["message"].str.contains("deleted", case=False, na=False)]

# remove empty messages
df = df[df["message"].str.strip() != ""]

df.reset_index(drop=True, inplace=True)

print("Messages after cleaning:", df.shape[0])
df.head()

Messages after cleaning: 978


Unnamed: 0,date,time,user,message
0,1/12/26,9:19 AM,Priyanshu Rout,ML ka class ajj hai?
1,2/10/26,12:59 PM,+91 77600 72210,*Follow this link to Join the WhatsApp Group* ...
2,2/10/26,11:43 PM,+91 77600 72210,*Follow this link to Join the WhatsApp Group* ...
3,2/13/26,10:37 PM,+91 89709 16930,*TRAINING + INTERNSHIP PROGRAM* ...
4,2/18/26,12:23 AM,+91 77600 72210,*TRAINING + INTERNSHIP PROGRAM* ...


In [9]:
# anonymize users (replace names & numbers)

# create unique user list
unique_users = df["user"].unique()

# create mapping
user_map = {user: f"User{i+1}" for i, user in enumerate(unique_users)}

# replace names with labels
df["user"] = df["user"].map(user_map)

df.head()

Unnamed: 0,date,time,user,message
0,1/12/26,9:19 AM,User1,ML ka class ajj hai?
1,2/10/26,12:59 PM,User2,*Follow this link to Join the WhatsApp Group* ...
2,2/10/26,11:43 PM,User2,*Follow this link to Join the WhatsApp Group* ...
3,2/13/26,10:37 PM,User3,*TRAINING + INTERNSHIP PROGRAM* ...
4,2/18/26,12:23 AM,User2,*TRAINING + INTERNSHIP PROGRAM* ...


In [10]:
df.to_csv("data/whatsapp_cleaned_data.csv", index=False)
print("Dataset saved successfully")

Dataset saved successfully
