# Analyzing Discord Messages 

Connect to the SQLite database and load the `messages` table into a pandas DataFrame.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3

db_path = "../data/Database.db"

# Load the whole `messages` table. sqlite3's own context manager only manages
# the transaction and does not close the connection, so close it explicitly in
# `finally`; otherwise the handle leaks whenever the query raises (for example
# when the table is missing).
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql("SELECT * FROM messages", conn)
finally:
    conn.close()

# The `message` column is renamed to `Button`; later cells treat its values as
# key / mouse-button names.
df = df.rename(columns={'message': 'Button'})

# Globally silences pandas' chained-assignment (SettingWithCopy) warning for
# the rest of the notebook.
pd.options.mode.chained_assignment = None

Inspect every unique key/button value to decide which ones should be filtered out.

In [None]:
# Show the distinct values of the Button column (last expression -> cell output).
df["Button"].unique()

Replace certain raw key names with shorter labels for readability.

In [None]:
# Lookup table: raw key / mouse-button name -> short display label.
# Names that are not listed here are left unchanged by transform_button_name().
button_replacements = {
    # mouse
    "Button.left": "LC",
    "Button.right": "RC",
    # whitespace / editing keys
    "Key.space": " ",
    "Key.enter": "Enter",
    "Key.tab": "Tab",
    "Key.backspace": "<-",
    # modifiers
    "Key.ctrl_l": "CTRL",
    "Key.alt_l": "alt",
    "Key.shift": "SH",
    # remaining buttons / keys
    "Button.x1": "M1",
    "Button.middle": "mid",
    "Key.esc": "esc",
}


def transform_button_name(button):
    """Return the short label for ``button``, or ``button`` itself if it has none."""
    if button in button_replacements:
        return button_replacements[button]
    return button

# Relabel every entry of the Button column element-wise, then display the frame.
df['Button'] = df['Button'].map(transform_button_name)
df

## Plotting amount of presses per Button

Build a list of buttons we do not care about: essentially every non-letter/non-digit key that does not show up in actual messages, plus some symbols.
Rows whose button is in that list are removed from the DataFrame, and the remaining key counts are plotted.

In [None]:
# Keys excluded from the frequency plot: modifiers, mouse buttons,
# editing/whitespace keys and some punctuation.
notWanted = ["alt", "LC", "RC", "CTRL", "esc", "Tab", "SH", "Enter", "<-", " ", '.','?','!']

# Select the remaining keys and lower-case them in a single expression.
# Nothing is assigned onto a filtered slice, so no chained-assignment warning
# is triggered and `df` itself is left untouched.
wanted_buttons = df.loc[~df["Button"].isin(notWanted), "Button"].str.lower()

# Count presses per key, most frequent first. Sorting by index first and then
# using a stable sort keeps ties in alphabetical order, which makes the bar
# order reproducible between runs.
lettercount_df = (
    wanted_buttons
    .value_counts()
    .sort_index()
    .sort_values(ascending=False, kind="mergesort")
)

# Create a bar plot
plt.figure(figsize=(10, 5))
plt.bar(lettercount_df.index, lettercount_df.values, color="skyblue")

# Add labels and title
plt.xlabel("Letter")
plt.ylabel("Frequency")
plt.title("Letter Usage Frequency in Process")
plt.xticks(rotation=0)  # Keep letters horizontal

# Show the plot
plt.show()

## Read Discord messages

Reassemble the messages one keystroke at a time, treating each Enter press as the delimiter (message sent), and save them in a DataFrame for further use.
The reconstruction also accounts for backspaces and for Ctrl+A followed by backspace. It can still be thrown off by Ctrl+A followed by any other key, so it needs some fine-tuning.

In [None]:
import time
cleaned_buttons = []
notWantedReconstruct = ["alt", "LC", "RC", "esc", "CTRL","Tab", "SH", '.','!']
dfReconstruct = df[~df["Button"].isin(notWantedReconstruct)]
dfReconstruct = dfReconstruct.sort_values(by='press_time', ascending=True)

# Pass 1: apply backspaces.
# A backspace removes the previously kept keystroke, except in two cases:
#   * the previous entry is '\x01' (handled as a clear/reset marker in pass 2)
#   * the previous entry is Enter, i.e. the boundary of an already finished
#     message. Popping it would glue the previous message onto the next one.
key_events = dfReconstruct[['Button', 'press_time', 'release_time']].itertuples(index=False, name=None)
for button, press_time, release_time in key_events:
    if button == "<-":
        if cleaned_buttons:
            last_button = cleaned_buttons[-1][0]
            if last_button != '\x01' and last_button.lower() != "enter":
                cleaned_buttons.pop()
    else:
        cleaned_buttons.append((button, press_time, release_time))  # Save both button and timestamps

# Pass 2: cut the keystroke stream into messages and time each one.
messages = []
message_times = []
current_message = []
start_time = None  # press_time of the first keystroke of the current message

for button, press_time, release_time in cleaned_buttons:
    if button == '\x01':  # Indicates clear/reset: drop everything typed so far
        current_message = []
        start_time = None

    elif button.lower() == "enter":  # End of a message
        if current_message:
            messages.append(''.join(current_message))

            # Time from the first key press of the message to the release of Enter
            if start_time is not None:
                time_taken = release_time - start_time
            else:
                time_taken = None  # If no start time, fallback to None

            message_times.append(time_taken)
            current_message = []
            start_time = None  # Reset timer

    else:
        if not current_message:  # First character in a new message
            start_time = press_time
        current_message.append(button)

# Keystrokes typed after the last Enter never formed a sent message and are
# intentionally not added to `messages`.

# Create DataFrame
df_messages = pd.DataFrame({
    "Message": messages,
    "TimeTaken": message_times
})

df_messages

Autocorrect the messages in the new DataFrame. The spell checker does not always know modern abbreviations, so some corrections are off, but the step is still worthwhile.

In [None]:
from spellchecker import SpellChecker

spell = SpellChecker()


def correct_message(msg):
    """Spell-correct ``msg`` word by word.

    The message is split on whitespace, each word is passed to
    ``spell.correction``, and the results are joined with single spaces.
    A word is kept as typed whenever the checker returns a falsy value for it.
    """
    fixed_words = [spell.correction(token) or token for token in msg.split()]
    return ' '.join(fixed_words)


# Run correct_message() on every entry of the 'Message' column and store the
# result in a new 'Corrected_Message' column; 'Message' itself is not modified.
df_messages['Corrected_Message'] = df_messages['Message'].apply(correct_message)

In [None]:
# Display the messages frame (last expression -> cell output).
df_messages

In [None]:

def transform_df(df, user_id='Ben'):
    """Build one row of dwell-time features per message from raw key events.

    Parameters
    ----------
    df : pandas.DataFrame
        Key events with the columns ``Button``, ``press_time`` and
        ``release_time``. The frame is not modified.
    user_id : str, default 'Ben'
        Label written to the ``user_id`` column of every output row.

    Returns
    -------
    pandas.DataFrame
        One row per message with the columns ``message_id``, ``user_id``,
        ``message_length``, ``avg_dwell``, ``std_dwell``, ``key_diversity``,
        ``typing_duration`` and one ``dwell_<key>`` column per lower-cased
        letter seen anywhere in the input (0 when the letter does not occur
        in that message). Empty when no usable letter events remain.
    """
    # Work on a copy so that neither the caller's frame nor a filtered slice
    # is ever written to.
    events = df.copy()
    events['dwell_time'] = events['release_time'] - events['press_time']

    # Number the messages BEFORE any filtering: every Enter starts a new id.
    # Doing this first keeps the boundary even when the Enter row itself has a
    # missing or non-positive dwell time and would be filtered out below.
    events['message_id'] = (events['Button'].str.lower() == 'enter').cumsum()
    events['user_id'] = user_id
    events['key'] = events['Button'].str.lower()

    # Keep single alphabetic characters with a positive dwell time.
    # `.eq(True)` turns missing Button values (NaN result) into False, so the
    # mask is always a clean boolean mask.
    is_single_letter = (
        events['Button'].str.isalpha().eq(True)
        & events['Button'].str.len().eq(1)
    )
    events = events[(events['dwell_time'] > 0) & is_single_letter]

    all_keys = sorted(events['key'].unique())

    feature_rows = []
    for msg_id, group in events.groupby('message_id'):
        row = {
            'message_id': msg_id,
            'user_id': user_id,
            'message_length': len(group),
            'avg_dwell': group['dwell_time'].mean(),
            'std_dwell': group['dwell_time'].std(),
            # Counted on the original (case-sensitive) key names.
            'key_diversity': group['Button'].nunique(),
            # Overall message timing: first press to last release.
            'typing_duration': group['release_time'].max() - group['press_time'].min(),
        }

        # Per-key dwell averages (case-insensitive); 0 for keys not in this message.
        mean_dwell_by_key = group.groupby('key')['dwell_time'].mean()
        for key in all_keys:
            row[f'dwell_{key}'] = mean_dwell_by_key.get(key, 0)

        feature_rows.append(row)

    return pd.DataFrame(feature_rows)


## Create dwell time per key feature table


In [None]:
# Turn the filtered, time-ordered key events into the per-message feature table.
df_dt = transform_df(dfReconstruct)
df_dt

### Form messages with avg values using enter as delimiter again

In [None]:
# df_dt is the output of transform_df(): one row per message, with message_id
# already derived from the Enter presses. It has no 'Button' column any more,
# so recomputing 'is_enter' / 'message_id' here raised a KeyError on a fresh
# run. Nothing is left to compute; just show the per-message table.
df_dt

### Read in synthetic user data and concat it with my own

In [None]:
# Read the synthetic typing data from CSV (relative path) and display it.
# NOTE(review): presumably this file has the same feature columns as df_dt,
# including user_id and message_id -- confirm before concatenating.
df_synthetic = pd.read_csv('../data/synthetic_typing_data.csv')
df_synthetic

In [None]:
# Stack own and synthetic rows; columns missing on one side become NaN after
# the concat and are then replaced by 0.
df_combine = pd.concat([df_dt, df_synthetic]).fillna(0)
df_combine

## Now the model training starts

### Splitting the table

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Features: every column except the label and the message identifier.
X = df_combine.drop(["user_id", "message_id"], axis=1)
# Target: user_id encoded as integers by the label encoder.
y = le.fit_transform(df_combine['user_id'])

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Training the model

In [None]:
import xgboost as xgb
# Classifier settings collected in one place.
xgb_params = {
    "use_label_encoder": False,
    "eval_metric": 'mlogloss',
}
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report
# Predict on the held-out split and print the per-class metrics.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)