# Whatstics

### This script summarizes a WhatsApp chat history.

<br>

🚨🚨🚨🚨🚨 CAUTION 🚨🚨🚨🚨🚨

This script uses the OpenAI API and may cost you A LOT OF MONEY. Please be aware of that.

<br>

# 1. First, configure the date limits and the paths

For example:
- START_DATE = "2024-08-23"
- END_DATE = "2024-08-23"
- FILE_PATH = "../history/chat.txt"
- SUMMARY_PATH = "../summary/output.txt"

The start and end dates are inclusive and may be equal (to summarize a single day).

🚨🚨🚨🚨🚨 CAUTION 🚨🚨🚨🚨🚨

The bigger the date range, the higher the cost.

In [None]:
# Where the exported WhatsApp chat is read from and where the summary is written.
FILE_PATH = "../history/conversa.txt"
SUMMARY_PATH = "../summary/summary.txt"

# Date window to summarize, as YYYY-MM-DD strings; both ends are inclusive.
START_DATE = "2024-08-23"
END_DATE = "2024-08-23"

In [None]:
from dotenv import load_dotenv

# Load environment variables from a local .env file.
# NOTE(review): presumably this supplies the OpenAI API key used by the
# ChatOpenAI client created later in the notebook — confirm.
load_dotenv()


In [None]:
import pandas as pd
import re


def add_mesage_to_df(messages, date, time, id, message, type_="message", subtype=None):
    """Append one chat entry to ``messages`` and return that same list.

    Args:
        messages: List of row dicts; it is extended in place.
        date: Value stored under ``"Date"``.
        time: Value stored under ``"Time"``.
        id: Value stored under ``"Id"`` (the sender/actor).
        message: Value stored under ``"Message"``.
        type_: Value stored under ``"Type"``; defaults to ``"message"``.
        subtype: Value stored under ``"Action_type"``; defaults to ``None``.

    Returns:
        The ``messages`` list that was passed in, now one row longer.
    """
    row = dict(
        Date=date,
        Time=time,
        Id=id,
        Message=message,
        Type=type_,
        Action_type=subtype,
    )
    messages.append(row)
    return messages


def manage_action(messages, current_date, current_time, id, rest_of_line):
    """Record a group event (join, leave, pin, attachment, ...) found on a chat line.

    The event kind is chosen by the first known Portuguese marker phrase that
    occurs in ``rest_of_line``. The text in front of that phrase is stored as
    the actor, and one row with ``Type="action"`` and no message text is
    appended to ``messages``.

    Args:
        messages: List of row dicts; it is extended in place.
        current_date: Date string of the line being processed.
        current_time: Time string of the line being processed.
        id: Unused. The actor is always re-derived from ``rest_of_line``; the
            parameter is kept so existing callers keep working.
        rest_of_line: Text of the chat line after the date/time prefix.

    Raises:
        AssertionError: If ``rest_of_line`` contains none of the known markers.
    """
    # Order matters: the first marker found in the line decides the action type.
    action_markers = (
        ("entrou usando o link de convite deste grupo", "entry"),
        ("saiu", "exit"),
        ("mudou a descrição do grupo", "description_change"),
        ("fixou uma mensagem", "message_pin"),
        ("mudou as configurações", "group_settings_change"),
        ("(arquivo anexado)", "file_attach"),
        ("criou o grupo", "group_create"),
        ("foi adicionado(a)", "add"),
    )

    for marker, action_type in action_markers:
        if marker in rest_of_line:
            # Split on the very phrase that matched, so the actor is extracted
            # correctly even when the line does not continue as expected
            # (e.g. "mudou as configurações" without "do grupo").
            actor = rest_of_line.split(marker)[0].strip()
            break
    else:
        print("Error line:", rest_of_line)
        # Raised explicitly (not via ``assert``) so it still fires under ``-O``.
        raise AssertionError(f"Unrecognized action line: {rest_of_line!r}")

    add_mesage_to_df(
        messages, current_date, current_time, actor, None, "action", subtype=action_type
    )


def parse_whatsapp_history(file_path):
    """Parse an exported WhatsApp chat text file into a DataFrame.

    A line of the form ``dd/mm/yyyy HH:MM - <rest>`` starts a new entry. If
    ``<rest>`` contains the invisible left-to-right mark it is handed to
    ``manage_action`` as a group event; otherwise, if it contains ``":"``, the
    text before the first colon is the sender and the remainder is the message.
    Lines without a date/time prefix are appended to the current message.

    Args:
        file_path: Path of the UTF-8 encoded chat export.

    Returns:
        A DataFrame with the columns ``Date``, ``Time``, ``Id``, ``Message``,
        ``Type``, ``Action_type``, ``Order`` (1-based row number) and
        ``DateTime`` (``Date`` and ``Time`` parsed as day-first timestamps).
        The frame is empty, but has all columns, when nothing was parsed.
    """
    # Rows extracted so far.
    messages = []

    # Captures the date, the time and everything after the " - " separator.
    date_and_time_pattern = re.compile(r"(\d{2}/\d{2}/\d{4}) (\d{2}:\d{2}) - (.*)")

    # State of the entry currently being accumulated.
    current_date = None
    current_time = None
    current_id = None
    current_message = None

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            match = date_and_time_pattern.match(line)
            if not match:
                # Continuation of a multi-line message. Text that appears
                # before the first dated line has no entry to attach to.
                if current_message is not None:
                    current_message += "\n" + line.strip()
                continue

            # A new entry begins: flush the message gathered so far, if any.
            if current_message:
                add_mesage_to_df(
                    messages, current_date, current_time, current_id, current_message
                )
            current_message = ""
            current_date, current_time, rest_of_line = match.groups()

            # The original source used an invisible literal here; it is spelled
            # out as U+200E (left-to-right mark) so the check is readable.
            if "\u200e" in rest_of_line:
                manage_action(
                    messages, current_date, current_time, current_id, rest_of_line
                )
            elif ":" in rest_of_line:
                current_id, message = rest_of_line.split(":", 1)
                current_message = message.strip()

    # Flush the last message; skip it when there is no pending text so that no
    # empty row is emitted (e.g. empty file, or file ending on an action line).
    if current_message:
        add_mesage_to_df(
            messages, current_date, current_time, current_id, current_message, "message"
        )

    # Explicit columns keep the schema stable even when no rows were parsed.
    df = pd.DataFrame(
        messages,
        columns=["Date", "Time", "Id", "Message", "Type", "Action_type"],
    )

    df["Order"] = range(1, len(df) + 1)

    # The export uses day-first dates; an explicit format prevents pandas from
    # guessing month-first for ambiguous values such as 05/08/2024.
    df["DateTime"] = pd.to_datetime(
        df["Date"].astype(str) + " " + df["Time"].astype(str),
        format="%d/%m/%Y %H:%M",
    )

    return df

In [None]:
# Usage example: parse the export, then preview the rows that are not messages.
history = parse_whatsapp_history(FILE_PATH)
history.loc[history["Type"] != "message"].head()

In [None]:
# Print pandas' summary of the parsed history DataFrame.
history.info()

In [None]:
# Report the earliest and latest timestamps found in the parsed history.
print(
    "Min DateTime: {!s}\nMax DateTime: {!s}".format(
        history["DateTime"].min(), history["DateTime"].max()
    )
)

In [None]:
from datetime import datetime, timedelta

# Turn the configured dates into a half-open window [start, end): the end bound
# is moved one day forward so that END_DATE itself is included.
start_datetime = datetime.strptime(START_DATE, "%Y-%m-%d")
end_datetime = timedelta(days=1) + datetime.strptime(END_DATE, "%Y-%m-%d")

print(start_datetime, end_datetime)

# Keep only chat messages whose timestamp falls inside the window.
messages_df = history.loc[
    history["Type"].eq("message")
    & history["DateTime"].ge(start_datetime)
    & history["DateTime"].lt(end_datetime)
]
print(messages_df.shape[0])
messages_df.tail()

In [None]:
# Report the earliest and latest timestamps among the selected messages.
print(
    "Min DateTime: {!s}\nMax DateTime: {!s}".format(
        messages_df["DateTime"].min(), messages_df["DateTime"].max()
    )
)

In [None]:
from pprint import pprint

# Flatten the selected messages into one string, separated by single spaces.
# Only the message text is included (no sender or timestamp).
full_chat_str = " ".join(messages_df["Message"].tolist())

# Preview a 1000-character slice of the text passed to the model later on.
full_chat_str[1000:2000]

In [None]:
from pprint import pprint

# prompts from: https://github.com/omer-go/group_chat_summarizer
# Many thanks!

# Dates are shown in the prompt as dd/mm/YYYY.
start_datetime_prompt = start_datetime.strftime("%d/%m/%Y")
end_datetime_prompt = datetime.strptime(END_DATE, "%Y-%m-%d").strftime("%d/%m/%Y")

print(start_datetime_prompt, end_datetime_prompt)

# A single day reads "em <date>"; a span of days reads "de <start> a <end>".
date_range_prompt = (
    f"em {start_datetime_prompt}"
    if start_datetime_prompt == end_datetime_prompt
    else f"de {start_datetime_prompt} a {end_datetime_prompt}"
)

SUMMARY_PROMPT = f"""Por favor, resuma o seguinte bate-papo de grupo do WhatsApp com base nos tópicos discutidos. As conversas ocorreram {date_range_prompt}. Para cada tópico, inclua seu título e um resumo em tópicos. Os tópicos devem incluir informações detalhadas e relevantes para nossa turma de Graduação em Inteligência Artificial. Inclua no resumo todas as recomendações sobre empresas, serviços específicos, vagas de emprego, recomendações de ferramentas computacionais, recursos educacionais e contatos (links de redes sociais, email ou telefones). Inclua também todos os links que foram compartilhados."""
NEWSLETTER_PROMPT = """Por favor, forneça um parágrafo para abrir uma newsletter cobrindo os seguintes tópicos:"""

pprint(SUMMARY_PROMPT)

In [None]:
def print_model(model):
    """Print a heading, the model's ``dict()`` representation, and blank lines.

    Args:
        model: Any object exposing a ``dict()`` method whose result is printed.
    """
    print("Using model: ")
    print(model.dict())
    print('\n\n')

In [None]:
import langchain
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser

# Alternative model: llm = ChatOpenAI(model="gpt-4o")
llm = ChatOpenAI(model="gpt-4o-mini")
print_model(llm)

# Prompt layout: the instruction, then the chat text inside a fenced block.
# The original template had "\{", an invalid escape sequence that left a stray
# backslash in the prompt; a newline after the opening fence is used instead.
code_prompt = PromptTemplate(
    template="{instruction}\nAs mensagens: ```\n{messages}```",
    input_variables=["instruction", "messages"],
)

# Pipe the formatted prompt into the chat model.
code_chain = code_prompt | llm

# NOTE(review): presumably turns on LangChain's verbose debug output — confirm.
langchain.debug = True


In [None]:
# Run the chain with the summary instruction and the flattened chat text.
results = code_chain.invoke(
    dict(instruction=SUMMARY_PROMPT, messages=full_chat_str)
)
results

In [None]:
from pathlib import Path

pprint(results.content)

# Save to file first, so a clipboard failure cannot lose the generated summary.
# The output directory is created if missing, and UTF-8 is used explicitly so
# the result does not depend on the platform's default encoding.
summary_file = Path(SUMMARY_PATH)
summary_file.parent.mkdir(parents=True, exist_ok=True)
with open(summary_file, "w", encoding="utf-8") as file:
    file.write(results.content)

# copy to clipboard:
import pyperclip
pyperclip.copy(results.content)

In [None]:
# Show the total token count reported in the response's usage metadata.
results.usage_metadata['total_tokens']

In [None]:
# Print pandas' summary of the filtered messages DataFrame.
messages_df.info()