# Analyzing Discord Messages 

Connects to the SQLite database and loads the `messages` table into a pandas DataFrame.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3

db_path = "Database.db"

# Load the whole `messages` table and always release the connection, even if
# the query fails. try/finally is used on purpose: `with sqlite3.connect(...)`
# only manages the transaction (commit/rollback); it does not close the
# connection.
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql("SELECT * FROM messages", conn)
finally:
    conn.close()

# The rest of the notebook refers to the `message` column as `Button`.
df = df.rename(columns={'message': 'Button'})

df.head()

List all unique values to see which ones we want to filter out.

In [None]:
# Show the distinct values found in the "Button" column.
df["Button"].unique()

Replaces certain key and button names with shorter labels for readability.

In [None]:
# Lookup table: raw recorded name -> short label used in the rest of the notebook.
button_replacements = {
    # `Button.*` entries
    "Button.left": "LC",  # left click
    "Button.right": "RC",  # right click
    "Button.middle": "mid",
    "Button.x1": "M1",
    # `Key.*` entries
    "Key.space": " ",
    "Key.enter": "Enter",
    "Key.tab": "Tab",
    "Key.backspace": "<-",
    "Key.ctrl_l": "CTRL",
    "Key.alt_l": "alt",
    "Key.shift": "SH",
    "Key.esc": "esc",
}


def transform_button_name(button):
    """Return the short label for *button*, or *button* itself if none is defined."""
    if button in button_replacements:
        return button_replacements[button]
    return button

# Swap every raw name in the "Button" column for its short label; names
# without a replacement are left as they are.
df["Button"] = df["Button"].map(transform_button_name)
df

## Plotting amount of presses per Button

Create a list of buttons we do not care about: basically all non-letter/non-number keys that don't show up in actual messages, plus some symbols.
Then remove the entries in that list from our DataFrame and plot the updated counts.

In [None]:
# Labels excluded from the letter statistics: everything that is not a
# letter/number in this list of non-letter entries.
notWanted = ["alt", "LC", "RC", "CTRL", "esc", "Tab", "SH", "Enter", "<-", " ", '.', '?', '!']

# Select the remaining presses as a Series and lower-case them so upper- and
# lower-case variants are counted together. Working on the selected column
# (instead of assigning into a filtered slice of `df`) avoids pandas'
# SettingWithCopyWarning.
wanted_buttons = df.loc[~df["Button"].isin(notWanted), "Button"].str.lower()

# Count presses per button, most frequent first. Sorting by index and then
# using a *stable* sort by count keeps buttons with equal counts in
# alphabetical order; the default sort kind does not guarantee that.
lettercount_df = (
    wanted_buttons.value_counts()
    .sort_index()
    .sort_values(ascending=False, kind="stable")
)

# Create a bar plot
plt.figure(figsize=(10, 5))
plt.bar(lettercount_df.index, lettercount_df.values, color="skyblue")

# Add labels and title
plt.xlabel("Letter")
plt.ylabel("Frequency")
plt.title("Letter Usage Frequency in Process")
plt.xticks(rotation=0)  # Keep letters horizontal

# Show the plot
plt.show()

## Read Discord messages

Reconstructs the messages one by one, using each Enter press as the delimiter (Enter = message sent), and saves them in a DataFrame for further use.
Also accounts for backspaces and Ctrl+A followed by backspace. Technically this could still be thrown off by Ctrl+A followed by any other letter, so it still needs some fine-tuning.

In [None]:
# Labels that are dropped before the messages are rebuilt.
# NOTE(review): '.' and '!' are removed *before* the backspace replay, so a
# backspace that originally deleted one of them removes the preceding
# character here instead - confirm whether that is intended.
notWantedReconstruct = ["alt", "LC", "RC", "esc", "CTRL", "Tab", "SH", '.', '!']
dfReconstruct = df[~df["Button"].isin(notWantedReconstruct)]

# Replay the key presses in a single pass, building one message at a time.
# Handling backspace per message (rather than over the whole key stream first)
# means a backspace on an empty draft can no longer delete the preceding
# "Enter" and merge two already-sent messages.
messages = []
current_message = []

for button in dfReconstruct['Button']:
    if not isinstance(button, str):
        # Skip NULL/NaN rows instead of failing on `.lower()` below.
        continue

    if button == "<-":
        # Backspace: drop the last character of the current draft, if any.
        if current_message:
            current_message.pop()
    elif button == '\x01':
        # '\x01' is handled as "select all": the draft typed so far is
        # discarded, and following backspaces have nothing left to delete.
        current_message = []
    elif button.lower() == "enter":
        # Enter sends the message; empty drafts are not recorded.
        if current_message:
            messages.append(''.join(current_message))
            current_message = []
    else:
        current_message.append(button)

# Add last message if Enter wasn't pressed at the end
if current_message:
    messages.append(''.join(current_message))

# Convert to DataFrame where each row is one full message string
df_messages = pd.DataFrame(messages, columns=["Message"])
df_messages

# Print or save


Autocorrects the new messages DataFrame. Only semi-important, as the spell checker sometimes doesn't know modern abbreviations, but still worth it.

In [None]:
from spellchecker import SpellChecker

# Spell checker instance used by correct_message below.
spell = SpellChecker()


def correct_message(msg):
    """Spell-correct *msg* word by word.

    The message is split on whitespace, each word is passed to
    ``spell.correction``, and the results are joined with single spaces.
    A word is kept unchanged when the checker returns nothing usable
    (a falsy value) for it.
    """
    return ' '.join(spell.correction(token) or token for token in msg.split())


# Run the spell correction on every message and store the result in a new column
df_messages["Corrected_Message"] = df_messages["Message"].map(correct_message)

In [None]:
# Display the messages DataFrame as the cell output.
df_messages