# Discourse Markers
- Marker list: https://github.com/sileod/Discovery/blob/master/data/markers_list.txt
- Paper: https://arxiv.org/abs/1903.11850

In [3]:
import os
import pandas as pd
import sqlite3
import re
import matplotlib.pyplot as plt
import seaborn as sns


from helpers.occurences import count_occurrences, count_punctuation
from helpers.statistical_tests import run_t_test_on_gender

# Location of the SQLite database, relative to this notebook.
db_path = "../../../data/giicg.db"
if not os.path.exists(db_path):
    # Fail early with a clear message: sqlite3.connect() would otherwise
    # create a new, empty database file at this path.
    raise FileNotFoundError(f"Database file does not exist: {db_path}")

# NOTE(review): the connection is intentionally left open here — presumably
# later cells reuse `conn`; confirm, and close it once it is no longer needed.
conn = sqlite3.connect(db_path)
prompts = pd.read_sql("SELECT * FROM main.expanded_prompts", conn)

# Keep only rows whose `conversational` text is present and not blank.
# One dropna is enough: the whitespace filter below cannot introduce new NaN
# values, so the second dropna the cell used to run was a no-op.
prompts = prompts.dropna(subset=['conversational'])
prompts = prompts[prompts['conversational'].str.strip() != '']
prompts

Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language
0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en
1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en
3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en
4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en
...,...,...,...,...,...,...,...,...,...,...
756,1646,82,user,"def run_query(query, n_results):\n query_em...",this is my code. I want to: Get nodes and edge...,"def run_query(query, n_results):\n query_em...",,Man (cisgender),92,en
757,1845,37,user,\n nun möchte ich judgement balancing m...,Now I want to bring judgement balancing into t...,,,Woman (cisgender),29,de
758,1847,37,user,\n ich sehe keine veränderung im Plot. Was ...,I do not see any change in the plot.,,,Woman (cisgender),29,de
759,1849,2,user,\n I am working on the problem of reconstru...,\n I am working on the problem of reconstru...,,Classic CV - Drone navigation\nIf you ever tho...,Man (cisgender),8,en


## Filter discourse markers
Keep only the markers from the list that actually occur in the corpus; all other markers are removed.


In [10]:
import numpy as np

# Concatenate every prompt into a single lowercase string so that each marker
# only has to be searched for once.
full_text = ' '.join(prompts['conversational'].tolist()).lower()

# Load discourse markers: one per line, lowercased, trailing commas stripped.
# Entries that are empty after clean-up are skipped (an empty marker would
# produce the pattern r'\b\b', which matches any text), and duplicates that
# appear after comma stripping are collapsed while keeping file order.
with open("markers_list.txt", encoding="utf-8") as f:
    cleaned_markers = (line.strip().lower().rstrip(",").strip() for line in f)
    marker_list = list(dict.fromkeys(m for m in cleaned_markers if m))


present_markers = []
for marker in marker_list:
    # Match the marker as a whole word/phrase. Matching is effectively
    # case-insensitive only because both the corpus and the markers were
    # lowercased above; punctuation inside the text is NOT ignored.
    pattern = r'\b{}\b'.format(re.escape(marker))
    if re.search(pattern, full_text):
        present_markers.append(marker)

print(f"Markers present in dataset: {present_markers}")

# Markers that occur in the corpus but should not be kept in the filtered list.
# Previously this list was defined but never applied, so its entries were still
# written to the output file.
to_remove = ['locally', ]
filtered_markers = [marker for marker in present_markers if marker not in to_remove]

with open("filtered_markers_list.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{marker}\n" for marker in filtered_markers)

# Structuring/Sequencing markers (organize information, signal sequence)
structuring_markers = [
    "first", "second", "third", "lastly", "finally", "next", "then", "in the end", "overall", "altogether", "soon", "later", "previously", "now", "currently", "again", "further", "once", "separately"
]

# Contrast/Connection markers (signal contrast, addition, cause/effect, alternatives)
connection_markers = [
    "and", "but", "however", "yet", "although", "though", "instead", "or", "otherwise", "also", "besides", "additionally", "moreover", "plus", "thus",
    "therefore", "so", "accordingly", "rather", "still", "for example", "already", "recently", "sometimes", "suddenly", "together"
]

# Commentary/Interaction markers (speaker attitude, stance, engagement)
interaction_markers = [
    "well", "actually", "basically", "maybe", "only", "really", "sadly", "unfortunately", "especially", "normally", "usually", "here", 'great'
]

included = structuring_markers + connection_markers + interaction_markers

# Sanity check: markers found in the corpus that are not assigned to any of the
# three categories above (displayed as the cell output).
np.setdiff1d(present_markers, included)



Markers present in dataset: ['accordingly', 'actually', 'additionally', 'again', 'already', 'also', 'although', 'altogether', 'and', 'basically', 'besides', 'but', 'currently', 'especially', 'finally', 'first', 'for example', 'further', 'here', 'however', 'in the end', 'instead', 'lastly', 'later', 'locally', 'maybe', 'next', 'normally', 'now', 'once', 'only', 'or', 'overall', 'plus', 'previously', 'rather', 'really', 'recently', 'sadly', 'second', 'separately', 'so', 'sometimes', 'soon', 'still', 'suddenly', 'then', 'therefore', 'third', 'this', 'though', 'together', 'unfortunately', 'usually', 'well', 'yet']


array(['locally', 'this'], dtype='<U13')