In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import re, string, datetime, emoji, math, time, json
from collections import defaultdict
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.image as mpimg
from typing import Sequence, Tuple
from PIL import Image, ImageFont, ImageDraw
from pilmoji import Pilmoji
from markdown_pdf import MarkdownPdf, Section
from google import genai

# --------------------------------------------------------------------
# --------------------------------------------------------------------

# User configuration: edit these values before running the notebook.
# Everything in this first group is required for the main statistical report.
WHATSAPP_GROUP_NAME = "<your group name>"       # shown on the cover page and used in output file names
DATA_EXPORTED_BY = "<your name>"                # shown on the cover page
NAME_OF_WHATSAPP_FILE = '<your file name>.txt'  # exported chat file that gets parsed
# First and last day (both inclusive) of the date range covered by the per-day statistics.
STARTING_DAY_OF_SAMPLE = datetime.date(1999,12,31)  # (year, month, day)
ENDING_DAY_OF_SAMPLE = datetime.date(2004,1,31)     # (year, month, day)
DATA_CUTOFF_STR = "31st Jan 2004 <Edit this to be your real date>"  # free text printed on the cover page

# Optional: only needed if you want to generate the individual text report.
GEMINI_API_KEY = ''
# If you choose to generate that report, be mindful that your chat text will be sent to
# Google Gemini, which may process your data for training purposes under its privacy policy.
# The first PDF (the statistical report) is produced entirely on your device.



# --------------------------------------------------------------------
# --------------------------------------------------------------------

In [None]:
# Never truncate rows when pandas objects are displayed.
pd.set_option('display.max_rows', None)

# Font file used for every PNG page rendered with Pillow.
TTF_FONTFILE_PATH = "DejaVuSans.ttf"

# Multi-page PDF that collects the report pages; it stays open until the
# cell that calls PDF_EXPORT.close().
PDF_EXPORT = PdfPages(f"{WHATSAPP_GROUP_NAME}_Statistical Inferences.pdf")

# Load the whole exported chat into memory as one string.
with open(NAME_OF_WHATSAPP_FILE, mode='r', encoding="utf8") as chat_file:
    raw_text = chat_file.read()

In [None]:
def set_first_of_month_xticks(plt, keys):
    """Label only the positions whose date falls on the first of a month.

    Parameters
    ----------
    plt : object exposing ``xticks(positions, labels)`` (normally pyplot)
    keys : iterable of ISO date strings, in plotting order. Each key is first
        parsed with its last five characters removed; if that is not a valid
        ISO string the whole key is parsed instead.

    The tick at index ``i`` is labelled ``"<abbreviated month>\\n<year>"`` when
    ``keys[i]`` is the first day of a month; every other tick is dropped.
    """
    positions = []
    labels = []
    for index, key in enumerate(keys):
        try:
            parsed = datetime.datetime.fromisoformat(key[:-5])
        except ValueError:
            parsed = datetime.datetime.fromisoformat(key)
        if parsed.day != 1:
            continue
        positions.append(index)
        labels.append(parsed.strftime("%b\n%Y"))
    plt.xticks(positions, labels)

In [None]:
# Accumulators filled while walking through the exported chat.
msgs_per_date = defaultdict(int)                 # ISO date -> counted messages
msgs_per_person = defaultdict(int)               # author -> counted messages
total_characters_per_person = defaultdict(int)   # author -> characters written
daily_msg_count_per_person = defaultdict(lambda: defaultdict(int))  # author -> ISO date -> messages
hourly = {str(i):0 for i in range(24)}           # hour of day ("0".."23") -> messages
words = defaultdict(int)                         # lower-cased word -> occurrences
emojis_per_person = defaultdict(int)             # author -> emojis used
emojis = defaultdict(int)                        # emoji -> occurrences in the whole file
media = defaultdict(int)                         # author -> image/video attachments
deleted = defaultdict(int)                       # author -> deleted messages
edited = defaultdict(int)                        # author -> edited messages
all_person_messages = defaultdict(list)          # author -> list of message texts


# Drop the invisible left-to-right marks the export contains and swap the
# square brackets around timestamps for stand-in characters. The stand-ins
# (よ, 々, 末) are assumed never to occur in the chat itself.
working_text = raw_text.replace('\u200e','').replace('[','よ').replace(']','々')
# Mark the start of every message (newline followed by a bracketed dd/dd/dd date)
# with 末 so that multi-line messages stay in one piece. Raw strings keep `\d`
# from being treated as an (invalid) string escape.
working_text = re.sub(r"\n(よ\d\d/\d\d/\d\d, )", r"末\1", working_text)

# Debug aid: dump the delimited text, one marked message per line group.
data = '\n末'.join(working_text.split("末"))
with open('temp.txt', 'w', encoding="utf-8") as f:
    f.write(data)

for entry in working_text.split("末"):
    try:
        dt, a = entry.split('々 ', 1)
        person, msg = a.split(": ", 1)
    except ValueError:
        continue # Only system messages

    date_time_obj = datetime.datetime.strptime(dt, 'よ%d/%m/%y, %I:%M:%S\u202f%p')
    hour = str(date_time_obj.hour)
    date_iso = date_time_obj.date().isoformat()

    # Use the first word of the sender name; when that word is a single
    # character, fall back to the second word if there is one. A one-word or
    # empty name no longer raises IndexError.
    name_parts = person.split()
    author = name_parts[0] if name_parts else person
    if len(author) == 1 and len(name_parts) > 1:
        author = name_parts[1]

    # Compare against the stripped text so a trailing newline (e.g. on the
    # last message of the file) cannot hide a placeholder message.
    stripped_msg = msg.strip()
    if stripped_msg in ('video omitted', 'image omitted'):
        media[author] += 1
        continue
    if stripped_msg in ("This message was deleted.", "You deleted this message."):
        deleted[author] += 1
        continue
    if msg.endswith("<This message was edited>"):
        edited[author] += 1
        msg = msg.replace("<This message was edited>","")

    msgs_per_date[date_iso] += 1
    msgs_per_person[author] += 1
    total_characters_per_person[author] += len(msg)
    daily_msg_count_per_person[author][date_iso] += 1
    hourly[hour] += 1
    all_person_messages[author].append(msg)
    for raw in msg.split():
        word = raw.strip(string.punctuation).lower()
        if not word:
            continue
        words[word] += 1
        emojis_per_person[author] += len(emoji.emoji_list(raw))


# Global emoji ranking is taken over the whole export, not per message.
for entry in emoji.emoji_list(raw_text):
    emojis[entry['emoji']] += 1

# Adding empty dates
def add_empty_dates(dict):
    """Return per-day counts for every day of the sample window.

    Parameters
    ----------
    dict : mapping of ISO date string -> count. (The parameter keeps its
        original name for compatibility even though it shadows the builtin.)

    Returns
    -------
    A new plain dict with one entry per day from STARTING_DAY_OF_SAMPLE to
    ENDING_DAY_OF_SAMPLE (both inclusive), in chronological order. Days that
    are absent from the input get 0; input days outside the window are dropped.

    The input is only read: `.get` is used instead of indexing so that a
    defaultdict is not silently filled with zero entries and a plain dict
    does not raise KeyError.
    """
    one_day = datetime.timedelta(days=1)
    filled = {}
    day = STARTING_DAY_OF_SAMPLE
    while day <= ENDING_DAY_OF_SAMPLE:
        key = day.isoformat()
        filled[key] = dict.get(key, 0)
        day += one_day
    return filled

# Pad the overall counts and each person's counts so that every day of the
# sample window has an entry.
msgs_per_date = add_empty_dates(msgs_per_date)
for person, per_day_counts in daily_msg_count_per_person.items():
    daily_msg_count_per_person[person] = add_empty_dates(per_day_counts)

In [None]:
def make_black_png_with_centered_text_lines(
        lines: Sequence[Tuple[str, int]],
        filename: str = "text_page.png",
        *,
        width: int        = 3500,     # px
        height: int | None = 2000,     # px; auto-size if None
        line_spacing: float = 1.25,   # 1.0 = tight, >1 = looser
        padding: int       = 80,      # px margin top/bottom & left/right
        font_path: str | None = None  # font file; None -> TTF_FONTFILE_PATH
) -> str:
    """
    Render multiple lines of text (each with its own font size) into
    a black PNG and return the saved path.

    Parameters
    ----------
    lines : sequence of (text, font_size)
        e.g. [("Title", 60), ("subtitle", 40), ("small note", 24)]
    filename : output file name
    width / height : canvas size; if height is None it is set to the height
        of the text block plus `padding` above and below
    line_spacing : inter-line multiplier applied to each line's font metrics
    padding : horizontal margin every line must fit inside; also the vertical
        margin when the height is auto-sized
    font_path : TrueType/OTF font file; when omitted the notebook-wide
        TTF_FONTFILE_PATH is used

    Raises
    ------
    ValueError
        If any line is wider than ``width - 2 * padding`` pixels.
    """
    font_file = font_path or TTF_FONTFILE_PATH

    # ── 1. measure every line ──────────────────────────────────────────────
    # A throwaway 1x1 image is enough to get a drawing context for measuring.
    dummy = Image.new("RGB", (1, 1))
    draw_dummy = ImageDraw.Draw(dummy)

    line_metrics = []  # (text, font, width_px, height_px)
    max_line_px = width - 2 * padding

    for txt, sz in lines:
        font = ImageFont.truetype(font_file, sz)
        w = draw_dummy.textlength(txt, font=font)
        if w > max_line_px:
            raise ValueError(
                f'"{txt}" (size {sz}) is wider than the allowed {max_line_px}px. '
                "Either provide a larger `width`, reduce `padding`, "
                "or use shorter text."
            )
        ascent, descent = font.getmetrics()
        h = int((ascent + descent) * line_spacing)
        line_metrics.append((txt, font, w, h))

    # ── 2. canvas height & creation ────────────────────────────────────────
    text_block_h = sum(h for *_, h in line_metrics)
    if height is None:
        height = text_block_h + 2 * padding

    img = Image.new("RGB", (width, height), color="black")
    draw = ImageDraw.Draw(img)

    # ── 3. render lines centred ────────────────────────────────────────────
    # The block is centred vertically; each line is centred horizontally.
    y = (height - text_block_h) // 2
    for txt, font, w, h in line_metrics:
        x = (width - w) // 2
        draw.text((x, y), txt, font=font, fill="white")
        y += h

    img.save(filename)
    return filename



# (text, font size) pairs for the cover page; the empty strings act as
# vertical spacers between the sections.
cover_lines = [
    ("Statistical Inferences", 120),
    (f'based on "{WHATSAPP_GROUP_NAME}" chat messages', 90),
    ("", 120),
    (f"Data Cut-off: {DATA_CUTOFF_STR}", 90),
    ("", 120),
    (f"Data Exported by: {DATA_EXPORTED_BY}", 60),
    ("", 120),
    ("Prepared by: Madhav AK | DA24B", 60),
]
cover_png = make_black_png_with_centered_text_lines(
    cover_lines,
    filename=f"{WHATSAPP_GROUP_NAME}_cover.png",
)

In [None]:
def add_png_page(path, dpi=300):
    """
    Append an image file to the report PDF as a page of its own.

    The image at `path` is read with matplotlib, placed on a figure sized so
    that one image pixel maps to one figure pixel at `dpi`, drawn without
    axes or margins, saved into PDF_EXPORT and then closed.
    """
    pixels = mpimg.imread(path)
    height_px, width_px = pixels.shape[:2]

    # Figure size is in inches, hence pixels / dpi.
    page = plt.figure(figsize=(width_px / dpi, height_px / dpi), dpi=dpi)

    # One axes covering the whole figure, with ticks and frame hidden.
    canvas = page.add_axes([0, 0, 1, 1])
    canvas.axis("off")
    canvas.imshow(pixels)

    PDF_EXPORT.savefig(page)
    plt.close(page)  # release the figure once it is in the PDF


# The cover is the first page of the report.
add_png_page(cover_png)

In [None]:
def standard_plot(title, x, y, color, special_month_xticks=True, rotation=0):
    """Draw a vertical bar chart, append it to the report PDF and show it.

    Parameters
    ----------
    title : chart title
    x : iterable of category labels (ISO date strings when
        `special_month_xticks` is True)
    y : iterable of bar heights, same length and order as `x`
    color : matplotlib colour for the bars
    special_month_xticks : when True, only first-of-month dates are labelled
        on the x-axis (via set_first_of_month_xticks)
    rotation : rotation of the x tick labels in degrees
    """
    # Materialise once: dict views and one-shot iterables are read more than
    # once below (bars, then tick labels).
    categories = list(x)
    heights = list(y)

    plt.style.use('dark_background')
    fig = plt.figure(figsize =(16, 8))
    plt.bar(categories, heights, color=color)
    if special_month_xticks:
        set_first_of_month_xticks(plt, categories)
    plt.title(title, fontsize=16)
    plt.xticks(rotation=rotation)
    PDF_EXPORT.savefig(fig, bbox_inches="tight")
    plt.show()
    # Same housekeeping as add_png_page: do not let report figures pile up.
    plt.close(fig)
    
def horizontal_tall_standard_plot(title, x, y, color, special_month_xticks=True, rotation=0):
    """Draw a tall horizontal bar chart, append it to the report PDF and show it.

    Parameters
    ----------
    title : chart title
    x : iterable of category labels; the first one ends up at the top
    y : iterable of bar lengths, same length and order as `x`
    color : matplotlib colour for the bars
    special_month_xticks : when True, `x` must hold ISO date strings and only
        first-of-month dates are labelled on the category axis
    rotation : rotation of the category (y-axis) tick labels in degrees
    """
    # barh draws bottom-to-top, so reverse to keep the first category on top.
    categories = list(x)[::-1]
    lengths = list(y)[::-1]

    plt.style.use('dark_background')
    fig = plt.figure(figsize =(8, 13))
    plt.barh(categories, lengths, color=color)
    if special_month_xticks:
        # In a horizontal chart the categories sit on the y-axis. The helper
        # only knows how to call `.xticks`, so hand it an adapter that routes
        # the call to pyplot's yticks (labelling the value axis would be wrong).
        class _CategoryAxisTicks:
            xticks = staticmethod(plt.yticks)
        set_first_of_month_xticks(_CategoryAxisTicks, categories)
    plt.title(title, fontsize=16)
    plt.yticks(rotation=rotation)
    PDF_EXPORT.savefig(fig, bbox_inches="tight")
    plt.show()
    # Same housekeeping as add_png_page: do not let report figures pile up.
    plt.close(fig)

In [None]:
# Daily activity over the whole sample window (red bars, month-start ticks).
standard_plot(
    "Messages Per Day",
    msgs_per_date.keys(),
    msgs_per_date.values(),
    (1,0,0),
)

In [None]:
# Roll the per-day counts up into "<month>\n<year>" buckets, keeping
# chronological order (msgs_per_date is already ordered by day).
monthly = {}
for day_iso, day_count in msgs_per_date.items():
    month = datetime.date.fromisoformat(day_iso).strftime('%b\n%Y')
    monthly[month] = monthly.get(month, 0) + day_count
standard_plot(
    "Messages Per Month",
    monthly.keys(),
    monthly.values(),
    (1,0,0),
    False,
)

In [None]:
# One bar per hour of the day, labelled "<hour>-<hour + 1>".
hour_labels = [f"{item}-{int(item)+1}" for item in hourly.keys()]
standard_plot("Messages Per Hour", hour_labels, hourly.values(), (0,0,1), False)

In [None]:
title = "Total Messages Sent"
# Largest sender first.
ranking = sorted(msgs_per_person.items(), key=lambda pair: pair[1], reverse=True)
wedge_sizes = [count for _name, count in ranking]
wedge_labels = [f"{name} | {count}" for name, count in ranking]
# Pull every other wedge outwards so neighbouring labels overlap less.
wedge_offsets = [0 if idx % 2 else 0.1 for idx in range(len(msgs_per_person))]

fig = plt.figure(figsize =(10, 10))
plt.pie(x=wedge_sizes, labels=wedge_labels, explode=wedge_offsets)
plt.title(title, fontsize=16)
PDF_EXPORT.savefig(fig, bbox_inches="tight")
plt.show()

In [None]:
# Mean number of characters per counted message, for every sender.
average_msg_length = {
    person: total_characters_per_person[person] / message_count
    for person, message_count in msgs_per_person.items()
}
# Keep the 20 senders with the longest average message.
longest_first = sorted(average_msg_length.items(), key=lambda pair: pair[1], reverse=True)
sorted_dict = dict(longest_first[:20])
standard_plot(
    "Average message length",
    sorted_dict.keys(),
    sorted_dict.values(),
    (0,1,1),
    False,
    rotation=20,
)

In [None]:
title = "Number of Media Attachments"
# Biggest media sender first.
ranking = sorted(media.items(), key=lambda pair: pair[1], reverse=True)
wedge_sizes = [count for _name, count in ranking]
wedge_labels = [f"{name} | {count}" for name, count in ranking]
# Alternate wedges are pushed outwards for readability.
wedge_offsets = [0 if idx % 2 else 0.1 for idx in range(len(media))]

fig = plt.figure(figsize =(10, 10))
plt.pie(x=wedge_sizes, labels=wedge_labels, explode=wedge_offsets)
plt.title(title, fontsize=16)
PDF_EXPORT.savefig(fig, bbox_inches="tight")
plt.show()

In [None]:
title = "Number of deleted messages"
# Person with the most deleted messages first.
ranking = sorted(deleted.items(), key=lambda pair: pair[1], reverse=True)
wedge_sizes = [count for _name, count in ranking]
wedge_labels = [f"{name} | {count}" for name, count in ranking]
# Alternate wedges are pushed outwards for readability.
wedge_offsets = [0 if idx % 2 else 0.1 for idx in range(len(deleted))]

fig = plt.figure(figsize =(10, 10))
plt.pie(x=wedge_sizes, labels=wedge_labels, explode=wedge_offsets)
plt.title(title, fontsize=16)
PDF_EXPORT.savefig(fig, bbox_inches="tight")
plt.show()

In [None]:
# Share of each person's messages that were deleted.
# Deleted messages are not included in msgs_per_person (the parser skips them
# before counting), so they are added back into the denominator here. This
# keeps the value within 0-100 %, avoids a ZeroDivisionError for someone who
# deleted everything they wrote, and `.get` avoids inserting phantom people
# into the msgs_per_person defaultdict.
deleted_ratio = {
    person: deleted_count * 100 / (msgs_per_person.get(person, 0) + deleted_count)
    for person, deleted_count in deleted.items()
    if deleted_count > 0
}
# Top 20 by deleted share.
sorted_dict = dict(sorted(deleted_ratio.items(), key=lambda item: item[1], reverse=True)[:20])
horizontal_tall_standard_plot("Percentage of Msgs deleted", sorted_dict.keys(), sorted_dict.values(), (1,1,0), False)

In [None]:
title = "Number of edited messages"
# Person with the most edited messages first.
ranking = sorted(edited.items(), key=lambda pair: pair[1], reverse=True)
wedge_sizes = [count for _name, count in ranking]
wedge_labels = [f"{name} | {count}" for name, count in ranking]
# Alternate wedges are pushed outwards for readability.
wedge_offsets = [0 if idx % 2 else 0.1 for idx in range(len(edited))]

fig = plt.figure(figsize =(10, 10))
plt.pie(x=wedge_sizes, labels=wedge_labels, explode=wedge_offsets)
plt.title(title, fontsize=16)
PDF_EXPORT.savefig(fig, bbox_inches="tight")
plt.show()

In [None]:
# Edited messages as a percentage of each person's counted messages.
editer_ratio = {
    person: edited_count * 100 / msgs_per_person[person]
    for person, edited_count in edited.items()
}
# Top 20 by edited share.
highest_first = sorted(editer_ratio.items(), key=lambda pair: pair[1], reverse=True)
sorted_dict = dict(highest_first[:20])
horizontal_tall_standard_plot(
    "Percentage of Msgs edited",
    sorted_dict.keys(),
    sorted_dict.values(),
    (1,1,0),
    False,
)

In [None]:
title = "Number of Emojis Sent"
# Heaviest emoji user first.
ranking = sorted(emojis_per_person.items(), key=lambda pair: pair[1], reverse=True)
wedge_sizes = [count for _name, count in ranking]
wedge_labels = [f"{name} | {count}" for name, count in ranking]
# Alternate wedges are pushed outwards for readability.
wedge_offsets = [0 if idx % 2 else 0.1 for idx in range(len(emojis_per_person))]

fig = plt.figure(figsize =(10, 10))
plt.pie(x=wedge_sizes, labels=wedge_labels, explode=wedge_offsets)
plt.title(title, fontsize=16)
PDF_EXPORT.savefig(fig, bbox_inches="tight")
plt.show()

In [None]:
# The 15 most frequent emojis as (emoji, count) pairs, most used first.
by_count_desc = sorted(emojis.items(), key=lambda pair: pair[1], reverse=True)
ranked_emojis = by_count_desc[:15]
print("Most Used Emojis:")
for glyph, count in ranked_emojis:
    print(glyph, '', count)

In [None]:
def emojis_to_png(emojis,
                  outfile,
                  *,
                  top_n=15,
                  font_size=72,
                  font_path=None,
                  padding=40,
                  line_spacing=1.15):
    """Render an emoji ranking ("<emoji>  <count>" per line) to a PNG file.

    Parameters
    ----------
    emojis : mapping of emoji -> count
    outfile : path of the image to write
    top_n : number of entries (highest counts first) to include
    font_size : size of the text font in points
    font_path : font file used for the counts; when omitted the notebook-wide
        TTF_FONTFILE_PATH is used (previously this argument was ignored)
    padding : margin in pixels around the text block
    line_spacing : line height as a multiple of `font_size`

    An empty `emojis` mapping produces a blank image of `2 * padding` pixels
    per side instead of raising.
    """
    # The font only has to cover the ASCII digits; Pilmoji draws the emoji.
    font = ImageFont.truetype(font_path or TTF_FONTFILE_PATH, font_size)

    # ── layout pass ───────────────────────────────────────────────────────
    ranked  = sorted(emojis.items(), key=lambda t: t[1], reverse=True)[:top_n]
    line_h  = int(font_size * line_spacing)

    # A throwaway 1x1 image is enough to get a drawing context for measuring.
    dummy   = Image.new("RGB", (1, 1))
    draw    = ImageDraw.Draw(dummy)
    # default=0 keeps max() from raising when there is nothing to rank.
    max_w   = max((draw.textlength(f"{g}  {c}", font=font) for g, c in ranked),
                  default=0)

    # Image.new needs integer dimensions; textlength returns a float.
    w = int(math.ceil(max_w + padding * 2))
    h = int(line_h * len(ranked) + padding * 2)

    # ── render pass ───────────────────────────────────────────────────────
    im = Image.new("RGBA", (w, h), "black")
    with Pilmoji(im) as pilmoji:
        y = padding
        for glyph, count in ranked:
            pilmoji.text((padding, y),
                         f"{glyph}  {count}",
                         font=font,
                         fill="white")
            y += line_h

    im.save(outfile)
    print(f"✅  Saved {outfile}")


# Render the emoji ranking to an image and append it to the report PDF.
emoji_outfile = f"{WHATSAPP_GROUP_NAME}_emoji_stats.png"
emoji_counts = dict(ranked_emojis)
emojis_to_png(emoji_counts, outfile=emoji_outfile)
add_png_page(emoji_outfile)

In [None]:
# Close the multi-page PDF; no further pages can be appended to PDF_EXPORT
# after this cell has run.
PDF_EXPORT.close()
print("Saved statistical inferences pdf")

In [None]:
# Stop early when no key was configured. (The check used to be inverted: it
# raised when a key WAS present and carried on with an empty key.)
if not GEMINI_API_KEY:
    raise ValueError ("You didnt put a gemini api key")
client = genai.Client(api_key=GEMINI_API_KEY)

# author -> text returned by the model
llm_output_dict = {}

# One request per person, containing every message that person sent.
for person, all_msgs in all_person_messages.items():
    all_text = '\n'.join(all_msgs)
    query = f"Here are all the messages sent by a particular person:\n{all_text}\n\nBased on all this data, point out any strengths, and any flaws/vulneribilities of this person. Note this is an informat text chat, so don't mention anything regarding clarity of comminication, or informal language."
    llm_response = client.models.generate_content(model="gemini-2.0-flash", contents=query).text
    llm_output_dict[person] = llm_response
    print(person, end = ' ')  # progress indicator
    time.sleep(2)             # pause between consecutive requests

# Persist the responses so the following cells can run without calling the API again.
with open(f"{WHATSAPP_GROUP_NAME} person best and worst.json", 'w') as f:
    json.dump(llm_output_dict, f)

In [None]:
# Load the stored model responses (author -> summary text).
with open(f"{WHATSAPP_GROUP_NAME} person best and worst.json", 'r') as f:
    llm_output_dict = json.load(f)

# Most active sender first.
order_of_people = sorted(msgs_per_person.items(), key= lambda item: item[1], reverse=True)

# One markdown section per person. People without a stored summary (e.g. the
# JSON comes from an earlier run, or the response was empty) are skipped
# instead of raising KeyError or writing the text "None".
sections = []
for person, _message_count in order_of_people:
    summary = llm_output_dict.get(person)
    if summary is None:
        continue
    sections.append(f"# {person}:\n{summary}\n\n\n")
single_text = "".join(sections)

with open(f"{WHATSAPP_GROUP_NAME} person best and worst.md", 'w', encoding="utf-8") as f:
    f.write(single_text)

In [None]:
# Read back the markdown report written by the previous cell.
markdown_path = f"{WHATSAPP_GROUP_NAME} person best and worst.md"
with open(markdown_path, mode='r', encoding="utf-8") as markdown_file:
    markdown_content = markdown_file.read()

# Convert it to a single-section PDF without a table of contents.
pdf = MarkdownPdf()
pdf.meta["title"] = 'Title'
report_section = Section(markdown_content, toc=False)
pdf.add_section(report_section)
pdf.save(f'{WHATSAPP_GROUP_NAME} Individual Strength-Weaknesses-Flaws.pdf')