# Feature Extraction

A quick notebook to test and determine the fastest ways to extract features from the data.

In [1]:
from typing import *
from messages import *
from helpers import *
from collections import Counter
import os
from datetime import datetime, timedelta
import calendar
import csv
import pandas as pd
import re

In [2]:
# Calendar year under analysis; the time-window filters in the cells below are built from it.
YEAR = 2024

In [3]:
# Build a Conversation from every entry under FILE_PATH and keep only the part
# of it that falls inside YEAR. messagesBetweenTime is given millisecond
# timestamps (epoch seconds * 1000).
yearStartMs = datetime(YEAR, 1, 1).timestamp() * 1000
# One millisecond before the next year starts. Derived from YEAR (the upper
# bound used to be hard-coded to 2025) so changing YEAR moves both ends.
yearEndMs = datetime(YEAR + 1, 1, 1).timestamp() * 1000 - 1

conversations: list[Conversation] = []

for user in os.listdir(FILE_PATH):
    conversation = Conversation(os.path.join(FILE_PATH, user))
    conversations.append(conversation.messagesBetweenTime(yearStartMs, yearEndMs))



In [4]:
# Direct messages only: exactly two participants, and the conversation title
# is itself one of those participants.
direct = list(
    filter(
        lambda chat: len(chat.participants) == 2 and chat.title in chat.participants,
        conversations,
    )
)

In [5]:
# The user whose sent/received statistics are computed in the cells below.
# NOTE(review): presumably predictUser picks the participant common to the
# direct conversations (the account owner) — confirm against helpers.
mainUser = predictUser(direct)

# Defining Categories

* Total Messages
* Total Messages Sent
* Messages Sent (top 5 people)
* Total Messages Received
* Messages Received (top 5 people)

#### Note: Stats should be limited to DMs only.

In [6]:
# DM statistics: overall totals plus per-contact sent / received counts.
totalMessages = 0
totalMessagesSent = 0
totalMessagesReceived = 0
sentToCounts = Counter()
receivedFromCounts = Counter()

for conv in direct:
    # Look up the main user's messages once per conversation (was done three times).
    sentCount = len(conv.messagesFrom(mainUser).messages)
    conversationSize = len(conv.messages)

    totalMessages += conversationSize
    totalMessagesSent += sentCount
    # Everything in the conversation not sent by the main user counts as received.
    totalMessagesReceived += conversationSize - sentCount

    otherUser = User(conv.title)
    # Accumulate instead of assigning so that two conversations resolving to
    # the same User add up rather than the later one overwriting the earlier.
    sentToCounts[otherUser] += sentCount
    receivedFromCounts[otherUser] += len(conv.messagesFrom(otherUser).messages)

# Only the last expression of a cell is rendered, so the totals have to be
# displayed explicitly or they are silently discarded.
display((totalMessages, totalMessagesSent, totalMessagesReceived))

receivedFromCounts.most_common(5)

[(User("Hooman"), 2259),
 (User("katelynð·"), 2146),
 (User("Neha Kasoju"), 1619),
 (User("Liam McNamara"), 768),
 (User("Josh Choong"), 482)]

## Other Categories

* Favorite Text 
* Favorite Emoji/Reaction
* Downbad Hours (12am–4am)
* King of Slurs (most slurs used)
* Most Ghosted

In [7]:
# Favourite text: how many times the main user sent each exact text message in DMs.
QUIET_MODE_NOTICE = "wasn't notified about this message because they're in quiet mode."
ATTACHMENT_PLACEHOLDER = 'You sent an attachment.'

allMessages = Counter()

for conv in direct:
    for message in conv.messagesFrom(mainUser).messages:
        if message.messageType != MessageType.TEXT:
            continue
        # Skip messages containing the quiet-mode notice text.
        if QUIET_MODE_NOTICE in message.content:
            continue
        allMessages[message.content] += 1

# Drop the attachment placeholder. The default avoids a KeyError when the
# main user never sent an attachment in the selected period.
allMessages.pop(ATTACHMENT_PLACEHOLDER, None)

allMessages.most_common(5) # Kinda boring

[('ð\x9f\x98\xadð\x9f\x98\xad', 127),
 ('ð\x9f\x92\x80ð\x9f\x92\x80', 81),
 ('Yea', 36),
 ('Yeah', 25),
 ('Ah okok', 20)]

In [8]:
# Downbad hours: per contact, the number of messages in the conversation that
# fall between midnight and 4am (local time) on any day of YEAR.
# NOTE(review): this counts messages from both participants and credits them
# to the other user — confirm that is the intended definition.
downbadMessages = Counter()

# The nightly windows are the same for every conversation, so build them once.
# Timestamps are in milliseconds (epoch seconds * 1000).
midnightJan1 = datetime(YEAR, 1, 1)
fourAmJan1 = datetime(YEAR, 1, 1, 4)
daysInYear = 366 if calendar.isleap(YEAR) else 365
lateNightWindowsMs = [
    (
        (midnightJan1 + timedelta(day)).timestamp() * 1000,
        (fourAmJan1 + timedelta(day)).timestamp() * 1000,
    )
    for day in range(daysInYear)
]

for conv in direct:
    otherUser = User(conv.title)

    for startMs, endMs in lateNightWindowsMs:
        downbadMessages[otherUser] += len(conv.messagesBetweenTime(startMs, endMs).messages)

downbadMessages.most_common(5)

[(User("Liam McNamara"), 809),
 (User("Hooman"), 645),
 (User("Neha Kasoju"), 529),
 (User("Yousuf Ahmed Khan"), 349),
 (User("katelynð·"), 159)]

In [9]:
# Read the swear-word list from the CSV, lower-cased for case-insensitive
# matching later. A row is kept only when its "Level of offensiveness" does
# not contain "Mild" and does contain "word".
with open("assets/swearwords.csv") as csvFile:
    swearwords: list[str] = [
        entry["Word"].lower()
        for entry in csv.DictReader(csvFile)
        if "Mild" not in entry["Level of offensiveness"]
        and "word" in entry["Level of offensiveness"]
    ]


In [17]:
# Find the slur king: for each contact, count how many (message, swear word)
# pairs occur in the text messages they sent.

# Compile one whole-word pattern per entry up front instead of rebuilding it
# for every message, and escape the entry so regex metacharacters in the word
# list are matched literally rather than interpreted.
swearPatterns = [re.compile(rf"\b{re.escape(word)}\b") for word in swearwords]

slurs = Counter()

for conv in direct:
    otherUser = User(conv.title)
    for message in conv.messagesFrom(otherUser).messages:
        if message.messageType != MessageType.TEXT:
            continue

        messageLower = message.content.lower()

        # One hit per list entry found in the message (repeats of the same
        # word inside one message still count once).
        hits = sum(1 for pattern in swearPatterns if pattern.search(messageLower))
        if hits:
            # Only touch the counter on a hit so users with none stay absent.
            slurs[otherUser] += hits

slurs.most_common(5)

[(User("Hooman"), 15),
 (User("Liam McNamara"), 15),
 (User("Yousuf Ahmed Khan"), 13),
 (User("Charles Rachwal"), 7),
 (User("kshithij.malebennur"), 6)]

## AI Categories

* Biggest Simp
* Most Freaky
* Most Favours (Fake Friends)
* Biggest Hater
* Most Desperate (could be non-AI)