In [None]:
from google.colab import files
uploaded = files.upload()
file_name = list(uploaded.keys())[0]
print("Uploaded file name:", file_name)


Saving sample_chat_with_all_features.txt to sample_chat_with_all_features.txt
Uploaded file name: sample_chat_with_all_features.txt


In [None]:
!pip install dateparser emoji transformers --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import re
import pandas as pd
from datetime import datetime, timedelta
import dateparser
import emoji
from transformers import pipeline

def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')

def extract_links(message):
    if not isinstance(message, str): return None
    return re.findall(r"https?://\S+|www\.\S+", message) or None

def extract_deadline(message):
    match = re.search(r"\bby\s+([a-zA-Z0-9 ,:-]+)", message, re.IGNORECASE)
    if match:
        parsed = dateparser.parse(match.group(1), settings={"RELATIVE_BASE": datetime.now()})
        return parsed.date() if parsed else None
    return None

def extract_meeting_info(msg):
    time = re.search(r"\d{1,2}(:\d{2})?\s?(AM|PM)", msg, re.IGNORECASE)
    location = re.search(r"(Room\s*\d+|at\s+\w+|in\s+\w+)", msg, re.IGNORECASE)
    reason = re.search(r"(for|regarding)\s+(.+?)[\.\n]", msg, re.IGNORECASE)
    return {
        "time": time.group(0) if time else None,
        "location": location.group(0) if location else None,
        "reason": reason.group(2) if reason else None
    }


In [None]:
chat_pattern = r"\[(\d{1,2}/\d{1,2}/\d{2,4}), (\d{1,2}:\d{2}:\d{2} [APM]{2})\] (.*?): (.*)"
chat_data = []

with open(file_name, "r", encoding="utf-8") as file:
    for line in file:
        match = re.match(chat_pattern, line)
        if match:
            chat_data.append(match.groups())

df = pd.DataFrame(chat_data, columns=["Date", "Time", "Sender", "Message"])
df["Message"] = df["Message"].apply(remove_emojis)
df.head()


Unnamed: 0,Date,Time,Sender,Message
0,26/06/24,10:15:45 AM,Sr. Aishwarya,Team meeting at 5 PM in Room 402 regarding Te...
1,26/06/24,10:18:02 AM,ECES 2024-25,Don't forget to submit your poster designs by ...
2,26/06/24,10:20:33 AM,Sr. Shashank,@Manvitha Can you present the slides for the e...
3,26/06/24,10:23:11 AM,ECES 2024-25,Here's the drive link for the resources: https...
4,26/06/24,10:25:55 AM,Sr. Aishwarya,"Btw, did you guys see what happened in the caf..."


In [None]:
# Feature extraction
df["Links"] = df["Message"].apply(extract_links)
df["Deadline"] = df["Message"].apply(lambda x: extract_deadline(x) if isinstance(x, str) else None)
df["Mentioned"] = df["Message"].str.contains("Manvitha", case=False, na=False)

meeting_info = df["Message"].apply(extract_meeting_info)
df["Meeting_Location"] = meeting_info.apply(lambda x: x["location"])
df["Meeting_Time"] = meeting_info.apply(lambda x: x["time"])
df["Meeting_Reason"] = meeting_info.apply(lambda x: x["reason"])

keywords = ["exam", "test", "placement", "deadline", "submission", "important"]
df["Keyword_Match"] = df["Message"].apply(lambda msg: any(k in msg.lower() for k in keywords))

df["Priority_Score"] = (
    df["Mentioned"].astype(int) * 3 +
    df["Deadline"].notnull().astype(int) * 3 +
    df["Meeting_Location"].notnull().astype(int) * 2 +
    df["Links"].apply(lambda x: any(".pdf" in link for link in x) if x else False).astype(int) * 1 +
    df["Keyword_Match"].astype(int) * 2
)


In [None]:
# ✅ Smart cleaner
def clean_and_expand(msg):
    if not isinstance(msg, str) or msg.strip() == "":
        return ""
    msg = emoji.replace_emoji(msg, replace="")
    msg = re.sub(r"\s+", " ", msg)
    if "@manvitha" in msg.lower():
        if "present" in msg.lower():
            return "Manvitha was asked to present the slides."
        elif "remind" in msg.lower():
            return "Manvitha was asked to remind others."
        elif "submit" in msg.lower():
            return "Manvitha was reminded to submit the material."
        else:
            return "Manvitha was mentioned for a task."
    msg = msg.replace("Here's the", "The team shared the")
    msg = msg.replace("drive.google.com", "a Google Drive link")
    msg = msg.strip()
    return msg if msg.endswith(('.', '?', '!')) else msg + "."

df["Cleaned_Sentences"] = df["Message"].apply(clean_and_expand)
priority_msgs = df[df["Priority_Score"] > 1]["Cleaned_Sentences"].dropna().tolist()
chat_text = " ".join(priority_msgs)[:1024]

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
summary = summarizer(chat_text, max_length=120, min_length=40, do_sample=False)[0]['summary_text']

print("📋 Chat Summary:\n", summary)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu
Your max_length is set to 120, but your input_length is only 79. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)


📋 Chat Summary:
 Team meeting at 5 PM in Room 402 regarding Tech Fest. Don't forget to submit your poster designs by Friday! Manvitha was asked to present the slides. Group photo tomorrow at 3 PM near the seminar hall.


In [8]:
import pandas as pd
import dateparser
from datetime import datetime, timedelta
from google.colab import files

# ✅ Sample DataFrame (skip if you already have df)
# df = pd.read_csv("your_dataframe.csv")  # or your existing df from WhatsApp

# ✅ Step 1: Safe deadline parser function
def safe_parse_date(message):
    if isinstance(message, str) and "by" in message.lower():
        parsed = dateparser.parse(message, settings={"RELATIVE_BASE": datetime.now()})
        if parsed:
            return parsed.date()
    return None

# ✅ Step 2: Apply deadline parsing safely
df["Parsed_Deadline"] = df["Message"].apply(safe_parse_date)

# ✅ Step 3: Get today's and tomorrow's deadlines
today = datetime.now().date()
tomorrow = today + timedelta(days=1)

today_df = df[df["Parsed_Deadline"] == today]
tomorrow_df = df[df["Parsed_Deadline"] == tomorrow]

# ✅ Step 4: Priority score (optional — add if not already in df)
df["Priority_Score"] = (
    df["Parsed_Deadline"].notnull().astype(int)
    + df["Message"].str.contains("@manvitha", case=False, na=False).astype(int)
)

# ✅ Step 5: Filter important messages (you can use your own logic too)
important_df = df[df["Priority_Score"] > 0]

# ✅ Step 6: Export all 3 as CSVs
important_df.to_csv("important_messages.csv", index=False)
today_df.to_csv("today_tasks.csv", index=False)
tomorrow_df.to_csv("tomorrow_tasks.csv", index=False)

# ✅ Step 7: Download the files
files.download("important_messages.csv")
files.download("today_tasks.csv")
files.download("tomorrow_tasks.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>