In [45]:
import re
from collections import defaultdict

file_path = '/Users/melissamartinez/Downloads/_chat 4.txt'

# Invisible formatting characters that commonly appear in WhatsApp exports.
# They carry no text of their own, so they are deleted outright.
#
# NBSP (U+00A0) is intentionally NOT in this list: it is whitespace, and
# deleting it glues the neighbouring words together ("a\u00a0b" -> "ab"),
# which undercounts words. It is left for the whitespace collapse in
# normalize_line(), which turns it into a single ordinary space.
BIDI_CHARS = [
    '\u200e', '\u200f',  # LRM, RLM
    '\u202a', '\u202b', '\u202c', '\u202d', '\u202e',  # embedding/overrides
    '\ufeff',             # BOM
]

# Deletion table built once so each line is cleaned in a single pass.
_STRIP_TABLE = str.maketrans('', '', ''.join(BIDI_CHARS))

# WhatsApp message header: "[D/M/YY, <anything>] Sender: message".
# Group 1 is the sender (text between the closing bracket and the first
# following colon); group 2 is the rest of the line.
pattern = re.compile(r'^\[\s*\d{1,2}/\d{1,2}/\d{2,4},\s*.*?\]\s*(.+?):\s*(.*)$')

# Placeholder phrases (lower-case) that mark a message as media, by kind.
IMAGE_KEYWORDS = {'image omitted', '<media omitted>', 'media omitted', 'photo omitted'}
STICKER_KEYWORDS = {'sticker omitted'}
VIDEO_KEYWORDS = {'video omitted'}
AUDIO_KEYWORDS = {'audio omitted', 'voice message omitted', 'audio message omitted'}


def normalize_line(line: str) -> str:
    """Return *line* with invisible marks removed and whitespace collapsed.

    Every character in ``BIDI_CHARS`` is deleted, then each run of
    whitespace (including no-break spaces, tabs and the trailing newline)
    is replaced by one ASCII space and the ends are stripped. A line that
    contains nothing but such characters comes back as ``''``.
    """
    cleaned = line.translate(_STRIP_TABLE)
    # For str patterns, \s matches Unicode whitespace, so NBSP becomes ' '.
    return re.sub(r'\s+', ' ', cleaned).strip()

# Per-sender tallies, created with zeroed counters on first access.
#   texts    - messages not recognised as a media placeholder
#   stickers / images / videos / audio - media placeholders, by kind
#   words    - whitespace-separated words in text messages and in their
#              continuation lines
#   nacky    - how many of those words are exactly "nacky" (lower-cased)
stats = defaultdict(lambda: {
    "texts": 0,
    "stickers": 0,
    "images": 0,
    "videos": 0,
    "audio": 0,
    "words": 0,
    "nacky": 0
})

# Sender of the most recent header line. Lines that do not match the header
# pattern are credited to this sender as a continuation of their message.
current_sender = None


def _tally_words(counts: dict, text_lower: str) -> None:
    """Add the word total and the "nacky" total of *text_lower* to *counts*."""
    words = text_lower.split()
    counts["words"] += len(words)
    counts["nacky"] += words.count("nacky")


with open(file_path, 'r', encoding='utf-8') as f:
    for raw_line in f:
        line = normalize_line(raw_line)
        if not line:
            continue

        m = pattern.match(line)
        if not m:
            # Continuation of the previous message (lines seen before any
            # header have no sender and are ignored).
            if current_sender:
                _tally_words(stats[current_sender], line.lower())
            continue

        sender, message = m.groups()
        message_lower = message.lower().strip()
        current_sender = sender
        counts = stats[sender]

        # Identify message type; the first matching category wins.
        if any(k in message_lower for k in STICKER_KEYWORDS):
            counts["stickers"] += 1
        elif any(k in message_lower for k in IMAGE_KEYWORDS):
            counts["images"] += 1
        elif any(k in message_lower for k in VIDEO_KEYWORDS):
            counts["videos"] += 1
        elif any(k in message_lower for k in AUDIO_KEYWORDS):
            counts["audio"] += 1
        else:
            # Plain text: count the message, its words, and "nacky" hits.
            counts["texts"] += 1
            _tally_words(counts, message_lower)


print(f"{'Sender':<35} {'Texts':>6} {'Words':>8} {'Images':>7} {'Stickers':>9} {'Videos':>7} {'Audio':>6}")
print("-" * 90)
for sender, c in stats.items():
    print(f"{sender:<35} {c['texts']:6} {c['words']:8} {c['images']:7} {c['stickers']:9} {c['videos']:7} {c['audio']:6}")


print(f"{'Sender':<35} {'nacky':>8}")
print("-" * 90)
for sender, c in stats.items():
    print(f"{sender:<35} {c['nacky']:8}")

# Average words per text message, per sender.
for sender, counts in stats.items():
    # A sender who only posted media has zero texts; report 0.0 instead of
    # raising ZeroDivisionError and aborting the rest of the report.
    wpm = counts['words'] / counts['texts'] if counts['texts'] else 0.0
    print(f"{sender: <35} {wpm}")

Sender                               Texts    Words  Images  Stickers  Videos  Audio
------------------------------------------------------------------------------------------
SIA ❤️💛💙                                10       88       0         0       0      0
Melissa Martinez                      8535    86560     536        39      15      1
Anusha S                                 2       18       0         0       0      0
Kathy Tejada                            54      536       0         0       0      0
Madison Sousa                         2329    20673      39        12       1      1
Isa Sirriani                           166     3368       6         0       0      0
Vivian Paguada                         769     7820      36         4       0      1
Wendy Holguin                          639     6096      23         0       0      0
Caro Garcia                            583     3915      38         0       3      0
Isa Ramos                             4591    327