# Feature Extraction

A quick notebook to test and determine the fastest ways to extract features from the data.

In [1]:
from typing import *
from messages import *
from helpers import *
from collections import Counter
import os
from datetime import datetime, timedelta
import calendar
import csv
import re

In [2]:
# Calendar year under analysis; the loading and time-window cells build their date ranges from it.
YEAR = 2024

In [3]:
# Load every export folder under FILE_PATH and keep only the messages sent during YEAR.
# Bounds are handed to messagesBetweenTime in milliseconds since the epoch.
yearStartMs = datetime(YEAR, 1, 1).timestamp() * 1000
# The upper bound is derived from YEAR (it used to be hard-coded to 2025, which
# selected the wrong range as soon as YEAR was changed). Subtracting 1 ms keeps
# the bound just before the start of the following year.
yearEndMs = datetime(YEAR + 1, 1, 1).timestamp() * 1000 - 1

conversations: list[Conversation] = []

# os.listdir returns entries in arbitrary order; sorting keeps reruns reproducible.
for user in sorted(os.listdir(FILE_PATH)):
    conversation = Conversation(os.path.join(FILE_PATH, user))

    conversations.append(conversation.messagesBetweenTime(yearStartMs, yearEndMs))



In [4]:
# Keep only one-to-one chats: exactly two participants, with the conversation
# title matching one of them (presumably this rules out named group chats — confirm).
direct = [
    chat
    for chat in conversations
    if len(chat.participants) == 2 and chat.title in chat.participants
]

In [5]:
# Identify the account owner from the direct conversations; later cells compare
# message senders against mainUser to split sent from received messages.
# NOTE(review): predictUser is not defined in this notebook (presumably it comes
# from the helpers star-import) — confirm how it picks the user.
mainUser = predictUser(direct)

# Defining Categories

* Total Messages
* Total Messages Sent
* Messages Sent (top 5 people)
* Total Messages Received
* Messages Received (top 5 people)
* Most active month

#### Note: Stats should be limited to DMs only.

In [None]:
# Aggregate DM volume: overall totals plus per-contact sent/received counts.
totalMessages = 0
totalMessagesSent = 0
totalMessagesReceived = 0
sentToCounts = Counter()
receivedFromCounts = Counter()

for conv in direct:
    otherUser = User(conv.title)

    # Filter once per conversation instead of re-running messagesFrom for every statistic.
    convTotal = len(conv.messages)
    sentCount = len(conv.messagesFrom(mainUser).messages)

    totalMessages += convTotal
    totalMessagesSent += sentCount
    # Everything in the conversation that mainUser did not send counts as received.
    totalMessagesReceived += convTotal - sentCount

    sentToCounts[otherUser] = sentCount
    receivedFromCounts[otherUser] = len(conv.messagesFrom(otherUser).messages)

# Only the final expression of a cell is rendered, so the other results are
# printed explicitly (the bare tuple that used to sit here was never displayed).
print("total / sent / received:", (totalMessages, totalMessagesSent, totalMessagesReceived))
print("top 5 sent to:", sentToCounts.most_common(5))

receivedFromCounts.most_common(5)

In [None]:
# Tally messages per calendar month number across all direct conversations.
# message.timestamp is divided by 1000 before being handed to fromtimestamp,
# which converts it using the machine's local time zone.
monthCounts = Counter(
    datetime.fromtimestamp(message.timestamp / 1000).month
    for conv in direct
    for message in conv.messages
)

monthCounts

## Other Categories

* Favorite Text 
* Favorite Emoji/Reaction
* Downbad Hours (12am–4am)
* King of Slurs (most slurs used)
* Most Ghosted

In [None]:
# Most frequently repeated text messages sent by mainUser.
QUIET_MODE_NOTICE = "wasn't notified about this message because they're in quiet mode."
ATTACHMENT_PLACEHOLDER = 'You sent an attachment.'

allMessages = Counter()

for conv in direct:
    for message in conv.messagesFrom(mainUser).messages:
        # Count text messages only, ignoring the quiet-mode notices that show up as message content.
        if message.messageType == MessageType.TEXT and QUIET_MODE_NOTICE not in message.content:
            allMessages[message.content] += 1

# Drop the attachment placeholder. The default keeps the cell from raising
# KeyError when no such entry exists in the selected data.
allMessages.pop(ATTACHMENT_PLACEHOLDER, None)

allMessages.most_common(5) # Kinda boring

In [None]:
# Late-night ("downbad") activity: messages exchanged between midnight and 4am, per contact.
downbadMessages = Counter()

# Build the (start, end) window for every day of YEAR once, in milliseconds,
# rather than recomputing identical timestamps for each conversation.
daysInYear = 366 if calendar.isleap(YEAR) else 365
nightWindows = [
    (
        (datetime(YEAR, 1, 1) + timedelta(day)).timestamp() * 1000,     # 00:00 that day
        (datetime(YEAR, 1, 1, 4) + timedelta(day)).timestamp() * 1000,  # 04:00 that day
    )
    for day in range(daysInYear)
]

for conv in direct:
    # The contact is the same for every window, so resolve it once per conversation.
    otherUser = User(conv.title)

    for startTimestamp, endTimeStamp in nightWindows:
        downbadMessages[otherUser] += len(conv.messagesBetweenTime(startTimestamp, endTimeStamp).messages)

downbadMessages.most_common(5)

In [9]:
# Load the swear-word list from the CSV asset, skipping entries rated "Mild".
swearwords: list[str] = []

# newline="" is what the csv module expects for file objects, so that quoted
# fields containing line breaks are parsed correctly on every platform.
with open("assets/swearwords.csv", newline="") as f:
    reader = csv.DictReader(f)

    for row in reader:
        level = row["Level of offensiveness"]

        # Keep rows whose level mentions "word" but is not a "Mild" rating;
        # words are lower-cased so they can be matched against lower-cased messages.
        if "Mild" not in level and "word" in level:
            swearwords.append(row["Word"].lower())


In [None]:
# Find the slur king: per contact, count swear-word hits in the text messages they sent.
slurs = Counter()

# Compile one whole-word pattern per swear word up front instead of rebuilding it
# for every message. re.escape keeps entries that contain regex metacharacters
# (e.g. "*" or "+") from raising re.error or matching unintended text.
swearPatterns = [re.compile(rf"\b{re.escape(word)}\b") for word in swearwords]

for conv in direct:
    otherUser = User(conv.title)
    for message in conv.messagesFrom(otherUser).messages:
        if message.messageType != MessageType.TEXT: continue

        messageLower = message.content.lower()

        # Each swear word found in a message adds one hit; repeats of the same
        # word within a single message are not counted again.
        for pattern in swearPatterns:
            if pattern.search(messageLower):
                slurs[otherUser] += 1

slurs.most_common(5)

In [None]:
# Times left on read
# A hit is recorded for a contact whenever a message from mainUser directly
# follows one of theirs and arrives more than 24 hours after it.
# NOTE(review): assumes conv.messages is ordered oldest-first — confirm.

leftOnRead = Counter()
dayInSeconds = 60 * 60 * 24

for conv in direct:
    otherUser = User(conv.title)
    thread = conv.messages

    # Walk over adjacent message pairs: (previous, reply).
    for position in range(1, len(thread)):
        previous = thread[position - 1]
        reply = thread[position]

        isReplyToContact = reply.sender == mainUser and previous.sender == otherUser
        gapSeconds = (reply.timestamp - previous.timestamp) / 1000

        if isReplyToContact and gapSeconds > dayInSeconds:
            leftOnRead[otherUser] += 1

leftOnRead.most_common(5)

## AI Categories

* Biggest Simp
* Most Freaky
* Most Favours (Fake Friends)
* Biggest Hater
* Most Desperate (could be non-AI)