In [11]:
from pathlib import Path
from loguru import logger
import pandas as pd
from datetime import datetime

Read in the file

In [12]:
import tomllib

# Parse the project configuration from the TOML file one directory up.
configfile = Path("../config.toml").resolve()
config = tomllib.loads(configfile.read_bytes().decode())

# The input file name comes from the "inputpath" key and lives in the processed-data folder.
processed = Path("../data/processed")
datafile = processed / config["inputpath"]

# Only a warning is logged when the file is missing; nothing is raised here.
datafile_missing = not datafile.exists()
if datafile_missing:
    logger.warning(
        f"{datafile} does not exist. Maybe first run src/preprocess.py, or check the timestamp!"
    )

In [13]:
# Load the CSV, parsing the "timestamp" column as datetimes, then preview the first rows.
df = pd.read_csv(filepath_or_buffer=datafile, parse_dates=["timestamp"])
df.head(n=5)

Unnamed: 0,timestamp,author,message
0,2021-09-09 21:35:48,1 KLu PMO 3 (Pax-Zone) ðŸŒ´,â€ŽBerichten en oproepen worden end-to-end versl...
1,2021-09-09 21:35:48,Steenbergen KMA,â€ŽSteenbergen KMA heeft deze groep gemaakt
2,2021-09-09 21:35:48,1 KLu PMO 3 (Pax-Zone) ðŸŒ´,â€ŽJe bent toegevoegd
3,2023-04-10 17:23:01,Anke KMA,"Voor degene die naar de bullenparade gaan, kun..."
4,2023-04-10 17:27:18,Anke KMA,*lijken


Check the datatypes. Note the timestamp type!

In [14]:
# Display the dtype of every column in the loaded frame.
df.dtypes

timestamp    datetime64[ns]
author               object
message              object
dtype: object

Sometimes, author names have a tilde in front of them, along with a Unicode narrow no-break space (U+202F). Let's clean that.

In [15]:
import re

# Regex for a leading "~" immediately followed by U+202F.
clean_tilde = r"^~\u202f"
tilde_prefix = re.compile(clean_tilde)
# Strip that prefix from every author name.
df["author"] = df["author"].map(lambda name: tilde_prefix.sub("", name))

Let's check how many unique authors we have

In [16]:
# A notebook cell only displays its last expression, so a bare len(...) placed before
# `df` is computed and thrown away. Print the count explicitly so it is actually shown.
# dropna=False keeps the result equal to len(df.author.unique()).
n_authors = df["author"].nunique(dropna=False)
print(f"Number of unique authors: {n_authors}")
# Show the frame after the author clean-up.
df

Unnamed: 0,timestamp,author,message
0,2021-09-09 21:35:48,1 KLu PMO 3 (Pax-Zone) ðŸŒ´,â€ŽBerichten en oproepen worden end-to-end versl...
1,2021-09-09 21:35:48,Steenbergen KMA,â€ŽSteenbergen KMA heeft deze groep gemaakt
2,2021-09-09 21:35:48,1 KLu PMO 3 (Pax-Zone) ðŸŒ´,â€ŽJe bent toegevoegd
3,2023-04-10 17:23:01,Anke KMA,"Voor degene die naar de bullenparade gaan, kun..."
4,2023-04-10 17:27:18,Anke KMA,*lijken
...,...,...,...
217,2025-01-07 10:00:28,Wendy KMA,ðŸ˜‚ðŸ˜‚ðŸ˜‚ðŸ˜‚ðŸ˜‚ðŸ˜‚ðŸ˜‚ðŸ˜‚
218,2025-01-07 10:00:51,Ilse KMA,Hahaha nice!
219,2025-01-07 10:00:55,Melissa,Geweldig haha
220,2025-01-07 10:00:56,Wendy KMA,Mag hopen dat je de prijs hebt gewonnen


Let's make the authors anonymous

In [17]:
import json
from wa_analyzer.humanhasher import humanize

# Map every distinct author name to the value returned by humanize().
authors = df.author.unique()
anon = {k: humanize(k) for k in authors}

# Invert the mapping (pseudonym -> original name). If humanize() returned the same
# pseudonym for two different authors, the inversion keeps only one of them, so the
# sanity check has to look at `ref`: `anon` is keyed by the unique authors and therefore
# always has exactly len(authors) entries.
ref = {v: k for k, v in anon.items()}
assert len(ref) == len(authors), "you lost some authors!"

# sort alphabetically by pseudonym:
ref_sorted = {k: ref[k] for k in sorted(ref)}

# we save a reference file so we can look up the original author names if we want to;
# it is only written once the check above has passed.
reference_file = processed / "anon_reference.json"
with open(reference_file, "w", encoding="utf-8") as f:
    json.dump(ref_sorted, f)

In [18]:
# Look up each message's author in the `anon` mapping and store the result in a new column.
df["anon_author"] = df["author"].map(anon)


In [19]:
# Display the frame, now including the "anon_author" column.
df

Unnamed: 0,timestamp,author,message,anon_author
0,2021-09-09 21:35:48,1 KLu PMO 3 (Pax-Zone) ðŸŒ´,â€ŽBerichten en oproepen worden end-to-end versl...,prismatic-baboon
1,2021-09-09 21:35:48,Steenbergen KMA,â€ŽSteenbergen KMA heeft deze groep gemaakt,lighthearted-kookabura
2,2021-09-09 21:35:48,1 KLu PMO 3 (Pax-Zone) ðŸŒ´,â€ŽJe bent toegevoegd,prismatic-baboon
3,2023-04-10 17:23:01,Anke KMA,"Voor degene die naar de bullenparade gaan, kun...",plaid-gazelle
4,2023-04-10 17:27:18,Anke KMA,*lijken,plaid-gazelle
...,...,...,...,...
217,2025-01-07 10:00:28,Wendy KMA,ðŸ˜‚ðŸ˜‚ðŸ˜‚ðŸ˜‚ðŸ˜‚ðŸ˜‚ðŸ˜‚ðŸ˜‚,gamesome-mosquito
218,2025-01-07 10:00:51,Ilse KMA,Hahaha nice!,flamboyant-raven
219,2025-01-07 10:00:55,Melissa,Geweldig haha,laughing-cat
220,2025-01-07 10:00:56,Wendy KMA,Mag hopen dat je de prijs hebt gewonnen,gamesome-mosquito


We can now drop the original author column

In [20]:
# Remove the column holding the original author names.
df = df.drop(columns=["author"])

Check if it's gone

In [21]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,timestamp,message,anon_author
0,2021-09-09 21:35:48,â€ŽBerichten en oproepen worden end-to-end versl...,prismatic-baboon
1,2021-09-09 21:35:48,â€ŽSteenbergen KMA heeft deze groep gemaakt,lighthearted-kookabura
2,2021-09-09 21:35:48,â€ŽJe bent toegevoegd,prismatic-baboon
3,2023-04-10 17:23:01,"Voor degene die naar de bullenparade gaan, kun...",plaid-gazelle
4,2023-04-10 17:27:18,*lijken,plaid-gazelle


And let's rename the column

In [22]:
# Give the "anon_author" column the name "author".
df = df.rename(columns={"anon_author": "author"})

In [23]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,timestamp,message,author
0,2021-09-09 21:35:48,â€ŽBerichten en oproepen worden end-to-end versl...,prismatic-baboon
1,2021-09-09 21:35:48,â€ŽSteenbergen KMA heeft deze groep gemaakt,lighthearted-kookabura
2,2021-09-09 21:35:48,â€ŽJe bent toegevoegd,prismatic-baboon
3,2023-04-10 17:23:01,"Voor degene die naar de bullenparade gaan, kun...",plaid-gazelle
4,2023-04-10 17:27:18,*lijken,plaid-gazelle


In my case, the first line is a header, saying messages are encrypted. Let's remove that. Your data might be different, so double check if you also want to remove the first line!

In [24]:
# Drop the row whose index label is 0.
df = df.drop(index=0)

Let's check:

In [25]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,timestamp,message,author
1,2021-09-09 21:35:48,â€ŽSteenbergen KMA heeft deze groep gemaakt,lighthearted-kookabura
2,2021-09-09 21:35:48,â€ŽJe bent toegevoegd,prismatic-baboon
3,2023-04-10 17:23:01,"Voor degene die naar de bullenparade gaan, kun...",plaid-gazelle
4,2023-04-10 17:27:18,*lijken,plaid-gazelle
5,2023-04-13 12:33:36,Ha! Aangezien dinsdag 25 April beter uitkomt v...,plaid-gazelle


Let's find emojis in the text and add that as a feature.

In [26]:
import re

# Character class of emoji code points.
#
# The previous catch-all range U+24C2..U+1F251 also covered CJK ideographs, Hangul and many
# other ordinary characters, so non-emoji text was flagged as containing emoji. It is
# replaced by the specific sub-ranges below. Two blocks above U+1F6FF that were not covered
# at all (which hold many frequently used faces and gestures) are added.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # Dingbats
    "\U00002B05-\U00002B07\U00002B1B\U00002B1C\U00002B50\U00002B55"  # arrows, squares, star, circle
    "\U000024C2"  # circled M
    "\U0001F004\U0001F0CF"  # mahjong red dragon, joker
    "\U0001F170-\U0001F19A"  # squared letters and words
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "]+"
)


def has_emoji(text):
    """Return True if ``text`` contains at least one character matched by ``emoji_pattern``.

    Args:
        text: The message to inspect. Values that are not ``str`` (for example the NaN
            pandas uses for a missing message) are treated as containing no emoji.

    Returns:
        bool: True when an emoji character is found, False otherwise.
    """
    if not isinstance(text, str):
        # re.search raises TypeError on non-strings; a missing message has no emoji.
        return False
    return emoji_pattern.search(text) is not None


# Flag every message for which has_emoji() returns a truthy result.
df["has_emoji"] = df["message"].map(has_emoji)

Let's create a timestamp for a new, unique, filename.

In [27]:
# Current local time, formatted to the second, becomes part of the output filename.
now = f"{datetime.now():%Y%m%d-%H%M%S}"
output = processed.joinpath(f"whatsapp-{now}.csv")
output

PosixPath('../data/processed/whatsapp-20250228-131711.csv')

Let's save the file both as a csv and as a parquet file.
Parquet has some advantages:
- it's about 100x faster to read and write
- datatypes are preserved (e.g. the timestamp type). You will lose this in a csv file.
- file size is much smaller

The advantage of csv is that you can easily peek at the data in a text editor.

In [28]:
# Write the frame twice without its index: as CSV at `output`, and as Parquet at the
# same path with the suffix swapped to ".parq".
parquet_output = output.with_suffix(".parq")
df.to_csv(output, index=False)
df.to_parquet(parquet_output, index=False)

Now, go to `config.toml` and change the value of "current" to the name of the parquet file you just created.
This makes it easier to use the same file everywhere, without the need to continuously retype the name if you change it.