# Feature Extraction

A quick notebook to test and determine the fastest ways to extract features from the data.

In [10]:
from typing import *
from messages import *
from helpers import *
from collections import Counter
import os
from datetime import datetime, timedelta
import calendar

In [11]:
# Calendar year under analysis: bounds the message-loading window and drives
# the per-day loop (including the leap-year check) in the cells below.
YEAR = 2024

In [12]:
# Load every entry under FILE_PATH as a Conversation and keep only the messages
# sent during YEAR.
#
# Bounds are epoch milliseconds derived from naive datetimes (i.e. the machine's
# local time zone). The end bound is the last millisecond of YEAR: the start of
# the following year minus 1 ms. It is derived from YEAR rather than hard-coded
# so that changing YEAR moves the whole window.
yearStartMs = datetime(YEAR, 1, 1).timestamp() * 1000
yearEndMs = datetime(YEAR + 1, 1, 1).timestamp() * 1000 - 1

conversations: list[Conversation] = []

for user in os.listdir(FILE_PATH):
    conversation = Conversation(os.path.join(FILE_PATH, user))
    conversations.append(conversation.messagesBetweenTime(yearStartMs, yearEndMs))



In [13]:
# Direct messages only: exactly two participants, and the conversation title
# is itself one of those participants.
direct = [
    chat
    for chat in conversations
    if len(chat.participants) == 2 and chat.title in chat.participants
]

In [14]:
# The user treated as "me" in the stats below: messages from mainUser count as
# sent, everything else as received.
# NOTE(review): predictUser comes from one of the star imports above and
# presumably infers the account owner from the DM list — confirm against its
# definition.
mainUser = predictUser(direct)

# Defining Categories

* Total Messages
* Total Messages Sent
* Messages Sent (top 5 people)
* Total Messages Received
* Messages Received (top 5 people)

#### Note: Stats should be limited to DMs only.

In [15]:
# DM totals plus per-contact sent/received tallies.
totalMessages = 0
totalMessagesSent = 0
totalMessagesReceived = 0
sentToCounts = Counter()
receivedFromCounts = Counter()

for conv in direct:
    # Filter the conversation once per sender and reuse the counts, instead of
    # re-running messagesFrom(mainUser) for every statistic.
    messageCount = len(conv.messages)
    sentCount = len(conv.messagesFrom(mainUser).messages)
    otherUser = User(conv.title)

    totalMessages += messageCount
    totalMessagesSent += sentCount
    # Received = everything in the conversation that mainUser did not send.
    totalMessagesReceived += messageCount - sentCount

    # Accumulate (rather than assign) so that two conversations resolving to
    # the same key add up instead of the later one overwriting the earlier.
    sentToCounts[otherUser] += sentCount
    receivedFromCounts[otherUser] += len(conv.messagesFrom(otherUser).messages)

# Only a cell's final expression is rendered, so a bare tuple here would be
# silently discarded; print the totals explicitly.
print(totalMessages, totalMessagesSent, totalMessagesReceived)

receivedFromCounts.most_common(5)

[(User("Hooman"), 2259),
 (User("katelynð·"), 2146),
 (User("Neha Kasoju"), 1619),
 (User("Liam McNamara"), 768),
 (User("Josh Choong"), 482)]

## Other Categories

* Favorite Text 
* Favorite Emoji/Reaction
* Downbad Hours (12-4am)
* King of Slurs (most slurs used)

In [16]:
# Frequency of each distinct text message sent by mainUser, across every loaded
# conversation (not just DMs), with auto-generated notices filtered out.
#
# NOTE(review): the recorded output for this cell shows keys such as
# 'ð\x9f\x98\xad', which look like UTF-8 bytes decoded as Latin-1; the message
# contents may need re-decoding at load time — confirm in the loader.

# Substring of the system notice inserted when the recipient is in quiet mode.
QUIET_MODE_NOTICE = "wasn't notified about this message because they're in quiet mode."
# Placeholder text that stands in for an attachment.
ATTACHMENT_PLACEHOLDER = 'You sent an attachment.'

allMessages = Counter()

for conv in conversations:
    for message in conv.messagesFrom(mainUser).messages:
        if message.messageType != MessageType.TEXT:
            continue

        text = message.content
        # Skip the placeholder while counting: popping it afterwards raises
        # KeyError whenever the data happens to contain no attachments.
        if QUIET_MODE_NOTICE in text or text == ATTACHMENT_PLACEHOLDER:
            continue

        allMessages[text] += 1

allMessages.most_common(5) # Kinda boring

[('ð\x9f\x98\xadð\x9f\x98\xad', 161),
 ('ð\x9f\x92\x80ð\x9f\x92\x80', 130),
 ('Yea', 55),
 ('Yeah', 38),
 ('Ah okok', 23)]

In [17]:
# "Downbad hours": per DM contact, how many messages the conversation contains
# between midnight and 4am (local time) over every day of YEAR.
downbadMessages = Counter()

# Loop-invariant values, computed once instead of once per (conversation, day).
daysInYear = 366 if calendar.isleap(YEAR) else 365
firstMidnight = datetime(YEAR, 1, 1)

for conv in direct:
    otherUser = User(conv.title)

    for day in range(daysInYear):
        # 00:00 to 04:00 on this day, as epoch milliseconds.
        windowStart = firstMidnight + timedelta(days=day)
        windowEnd = windowStart + timedelta(hours=4)

        startMs = windowStart.timestamp() * 1000
        # Stop 1 ms short of 04:00 so a message at exactly 4am is not counted.
        # This assumes messagesBetweenTime treats its end bound as inclusive,
        # as the "- 1" in the year-loading cell suggests — TODO confirm.
        endMs = windowEnd.timestamp() * 1000 - 1

        downbadMessages[otherUser] += len(conv.messagesBetweenTime(startMs, endMs).messages)

downbadMessages.most_common(5)

TypeError: unsupported operand type(s) for +=: 'int' and 'Conversation'

## AI Categories

* Biggest Simp
* Most Freaky
* Most Favours (Fake Friends)
* Biggest Hater
* Most Desperate (could be non-AI)