In [None]:
# Define the Phoneme class
class Phoneme:
    """A phoneme symbol together with two boolean language flags.

    Attributes:
        p: The phoneme symbol as a string.
        es: First language flag, stored as given.
        hu: Second language flag, stored as given.

    NOTE(review): `es` / `hu` presumably mark whether the phoneme belongs to
    the Spanish / Hungarian inventory — confirm against the study design.
    """

    def __init__(self, p: str, es: bool, hu: bool):
        self.p, self.es, self.hu = p, es, hu

    def __repr__(self):
        # Flags are coerced to strict booleans so truthy/falsy values still
        # render as True/False.
        es_flag = bool(self.es)
        hu_flag = bool(self.hu)
        return f"(p={self.p}, es={es_flag}, hu={hu_flag})"


# Lookup table: phoneme symbol -> Phoneme object, one entry per unique symbol.
# Each key repeats the symbol stored in the object's `p` attribute.
# NOTE(review): `es` / `hu` presumably flag whether the phoneme belongs to the
# Spanish / Hungarian inventory — confirm against the study design.
phonemes = {
    "y": Phoneme(p="y", es=False, hu=True),
    "æ": Phoneme(p="æ", es=False, hu=False),
    "ø": Phoneme(p="ø", es=False, hu=True),
    "ʁ": Phoneme(p="ʁ", es=False, hu=False),
    "ɶ": Phoneme(p="ɶ", es=False, hu=False),
    "u": Phoneme(p="u", es=True, hu=True),
    "u:": Phoneme(p="u:", es=False, hu=True),
    "e": Phoneme(p="e", es=True, hu=True),
    "i": Phoneme(p="i", es=True, hu=True),
    "o": Phoneme(p="o", es=True, hu=True),
    "a": Phoneme(p="a", es=True, hu=True),
    "ɛ": Phoneme(p="ɛ", es=False, hu=True),
    "ɡ": Phoneme(p="ɡ", es=True, hu=True),
    "ɔ": Phoneme(p="ɔ", es=False, hu=False),
    "p": Phoneme(p="p", es=True, hu=True),
    "d": Phoneme(p="d", es=True, hu=True),
    "l": Phoneme(p="l", es=True, hu=True),
    "v": Phoneme(p="v", es=True, hu=True),
}
# Maps each (word1, word2) stimulus pair to the 2-tuple of phoneme symbols
# associated with it. Response.__init__ looks every trial up in this table,
# so a pair that is missing here raises KeyError there.
pairs_phonemes = {
    # Pairs made of the same word twice: both symbols are identical.
    ('ful', 'ful'): ('u', 'u'),
    ('lys', 'lys'): ('y', 'y'),
    ('mæt', 'mæt'): ('æ', 'æ'),
    ('kat', 'kat'): ('a', 'a'),
    ('ben', 'ben'): ('e', 'e'),
    ('grå', 'grå'): ('ɔ', 'ɔ'),
    ('nø', 'nø'): ('ø', 'ø'),
    ('rød', 'rød'): ('ʁ', 'ʁ'),
    ('møl', 'møl'): ('ø', 'ø'),
    ('syr', 'syr'): ('y', 'y'),
    ('dør', 'dør'): ('ɶ', 'ɶ'),
    ('skæl', 'skæl'): ('æ', 'æ'),
    ('røre', 'røre'): ('ʁ', 'ʁ'),
    ('fuld', 'fuld'): ('u', 'u'),
    # Pairs made of two different words.
    ('ful', 'fyl'): ('u', 'y'),
    ('syn', 'søn'): ('y', 'ø'),
    ('set', 'sæt'): ('e', 'æ'),
    ('kat', 'kæt'): ('a', 'æ'),
    ('fisk', 'fæsk'): ('i', 'æ'),
    ('lys', 'los'): ('y', 'o'),
    ('ben', 'bøn'): ('e', 'ø'),
    ('nø', 'nu'): ('ø', 'u'),
    ('rød', 'ryd'): ('ø', 'y'),
    ('mølle', 'mulle'): ('ø', 'u'),
    ('syd', 'sød'): ('y', 'ø'),
    ('tat', 'tæt'): ('a', 'æ'),
    ('sø', 'su'): ('ø', 'u'),
    ('væg', 'vægt'): ('ɡ', 'd'),
    ('løg', 'ly'): ('ø', 'y'),
    ('rør', 'rær'): ('ø', 'æ'),
    ('rød', 'lød'): ('ʁ', 'l'),
    ('fuld', 'fugl'): ('u', 'u:'),
    ('gul', 'guld'): ('u:', 'u:'),
    ('sinde', 'sende'): ('i', 'e'),
    # NOTE(review): this entry carries the same ('v', 'p') value as
    # ('ven', 'pen') on the next line, although neither "pil" nor "bil"
    # contains a "v" — looks like a copy/paste slip (perhaps a p/b contrast
    # was intended); confirm before relying on per-phoneme results.
    ('pil', 'bil'): ('v', 'p'),
    ('ven', 'pen'): ('v', 'p'),
    ('lidt', 'let'): ('i', 'ɛ'),
    ('før', 'fær'): ('ø', 'æ'),
}

# Debug helpers, left disabled: table size and a full dump of the mapping.
# print(f"Total pairs: {len(pairs_phonemes)}")

# for pair, p in pairs_phonemes.items():
#     print(f"{pair} : {p}")

# Sanity check: look up a single known pair.
print(pairs_phonemes[("løg", "ly")])
# pairs_phonemes

('ø', 'y')


In [57]:
from typing import List

class Response:
    """One trial: the presented word pair and whether the answer was correct.

    On construction the pair is looked up in the module-level
    ``pairs_phonemes`` table and the resulting tuple is stored in
    ``phoneme``; an unknown pair raises ``KeyError``.
    """

    def __init__(self, is_correct: bool, word1: str, word2: str):
        self.is_correct = is_correct
        self.word1 = word1
        self.word2 = word2
        pair_key = (word1, word2)
        self.phoneme = pairs_phonemes[pair_key]

    def __repr__(self):
        # `phoneme` is deliberately left out of the textual form.
        fields = (
            f"isCorrect={self.is_correct}",
            f"word1='{self.word1}'",
            f"word2='{self.word2}'",
        )
        return "Response(" + ", ".join(fields) + ")"


# Class for participant data
class ParticipantData:
    """A participant's id, language label and collected responses.

    ``responses`` starts out as a fresh empty list for every instance and is
    filled by the caller.
    """

    def __init__(self, participant_id: str, language: str):
        self.id, self.language = participant_id, language
        self.responses: List[Response] = []

    def __repr__(self):
        # Only the language is quoted; the id is rendered with plain str().
        head = f"ParticipantData(id={self.id}, language='{self.language}', "
        tail = f"responses={self.responses})"
        return head + tail

In [58]:
import json
import pandas as pd

# Read the spreadsheet; each row is turned into one ParticipantData below.
df = pd.read_excel("data.xlsx")

# All participants, in spreadsheet row order.
participants: List[ParticipantData] = []

for _, row in df.iterrows():
    # Participant-level columns of the current row.
    participant_id, language, responses_json = (
        row["Participant ID"],
        row["L1"],
        row["Data"],
    )

    # The "Data" cell is a JSON string decoding to a sequence of trial dicts.
    responses_data = json.loads(responses_json)
    participant = ParticipantData(participant_id, language)

    for response in responses_data:
        # Trials whose first word is "sinde" are left out of the analysis.
        if response["word1"] == "sinde":
            continue
        trial = Response(
            is_correct=response["isCorrect"],
            word1=response["word1"],
            word2=response["word2"],
        )
        participant.responses.append(trial)

    participants.append(participant)

# Sanity check: phoneme tuple and repr of the very first recorded response.
print(participants[0].responses[0].phoneme, participants[0].responses[0])

('ʁ', 'ʁ') Response(isCorrect=True, word1='rød', word2='rød')


In [59]:
from collections import defaultdict

# Three empty module-level lists, one per language code suffix.
# NOTE(review): none of these is appended to or read in the visible cells
# (the per-language aggregation uses a defaultdict instead) — presumably
# leftovers; confirm before removing. If kept for accuracy values, the
# element type would be float rather than int.
accuracy_es: List[int] = []
accuracy_hu: List[int] = []
accuracy_dk: List[int] = []

def calculate_participant_accuracy(
    participant: "ParticipantData", total_trials: int = 37
) -> float:
    """Return the fraction of trials the participant answered correctly.

    Args:
        participant: Object whose ``responses`` is an iterable of items
            exposing a truthy/falsy ``is_correct`` attribute.
        total_trials: Denominator of the accuracy. Defaults to 37, the
            value that used to be hard-coded here, so existing calls are
            unaffected. Note it is NOT derived from ``len(responses)``:
            trials missing from ``responses`` count as incorrect.

    Returns:
        ``correct / total_trials``, or ``0`` when ``total_trials`` is not
        positive (avoids a division by zero).
    """
    if total_trials <= 0:
        return 0
    correct_trials = sum(
        1 for response in participant.responses if response.is_correct
    )
    return correct_trials / total_trials

# Spot check: accuracy of the participant stored at position 61.
print(calculate_participant_accuracy(participants[61]))

0.6756756756756757


In [60]:
from collections import defaultdict
from typing import Dict, List

# Per-participant accuracies bucketed by the participant's language label.
language_accuracies: Dict[str, List[float]] = defaultdict(list)
for participant in participants:
    accuracy = calculate_participant_accuracy(participant)
    language_accuracies[participant.language].append(accuracy)

# Mean accuracy per language; an empty bucket would yield 0.
average_accuracies = {}
for language, accuracies in language_accuracies.items():
    if accuracies:
        average_accuracies[language] = sum(accuracies) / len(accuracies)
    else:
        average_accuracies[language] = 0

# Report: the means first, then the raw buckets, then the bucket sizes.
for language, avg_accuracy in average_accuracies.items():
    print(f"Language: {language}, Average Accuracy: {avg_accuracy:.2%}")

print(language_accuracies)

for language, accuracies in language_accuracies.items():
    print(f"Language: {language}, Number of Accuracies: {len(accuracies)}")

Language: Danish, Average Accuracy: 94.26%
Language: Hungarian, Average Accuracy: 75.38%
Language: Spanish, Average Accuracy: 74.87%
defaultdict(<class 'list'>, {'Danish': [0.9459459459459459, 0.8918918918918919, 0.972972972972973, 0.8918918918918919, 0.972972972972973, 0.918918918918919, 1.0, 0.9459459459459459], 'Hungarian': [0.7837837837837838, 0.8108108108108109, 0.7837837837837838, 0.8108108108108109, 0.7297297297297297, 0.7837837837837838, 0.7027027027027027, 0.7567567567567568, 0.8648648648648649, 0.7567567567567568, 0.7297297297297297, 0.7297297297297297, 0.7567567567567568, 0.7027027027027027, 0.7297297297297297, 0.7567567567567568, 0.7297297297297297, 0.8108108108108109, 0.7567567567567568, 0.6756756756756757, 0.7297297297297297, 0.7027027027027027, 0.6486486486486487, 0.8108108108108109, 0.8108108108108109, 0.7837837837837838, 0.7027027027027027], 'Spanish': [0.7297297297297297, 0.5945945945945946, 0.8378378378378378, 0.8378378378378378, 0.6756756756756757, 0.891891891891891

## Data visualizations

In [64]:
from collections import defaultdict
import pandas as pd

# phoneme_accuracy: phoneme tuple -> language -> list of per-trial is_correct
# values. phoneme_word_pairs: phoneme tuple -> every (word1, word2) seen for
# it (duplicates included; they are collapsed when counted further down).
phoneme_accuracy = defaultdict(lambda: defaultdict(list))
phoneme_word_pairs = defaultdict(list)

# Collect every response of every participant into the two structures.
for participant in participants:
    for response in participant.responses:
        phoneme = response.phoneme
        wordpair = (response.word1, response.word2)
        phoneme_accuracy[phoneme][participant.language].append(response.is_correct)
        phoneme_word_pairs[phoneme].append(wordpair)

# One row per phoneme tuple, holding "<language> Accuracy" (mean) and
# "<language> Std" (pandas Series.std) rounded to two decimals.
table_data = defaultdict(dict)

for phoneme, language_data in phoneme_accuracy.items():
    for language, accuracies in language_data.items():
        if accuracies:
            mean_value = sum(accuracies) / len(accuracies)
        else:
            mean_value = 0
        avg_accuracy = round(mean_value, 2)

        if accuracies:
            spread = pd.Series(accuracies).std()
        else:
            spread = 0
        std_dev = round(spread, 2)

        table_data[phoneme].update(
            {
                f"{language} Accuracy": avg_accuracy,
                f"{language} Std": std_dev,
            }
        )

# Build the table; any missing cell (NaN) is shown as 0.
# NOTE: this rebinds `df`, replacing the spreadsheet frame loaded earlier.
df = pd.DataFrame.from_dict(table_data, orient="index")
df = df.fillna(0)

# "Word Pair" holds the NUMBER of distinct word pairs behind each row,
# not the pairs themselves.
df["Word Pair"] = df.index.map(
    lambda key: len(set(phoneme_word_pairs[key]))
)

# Label the column axis and the index.
# NOTE(review): the row keys are 2-tuples, so pandas presumably builds a
# MultiIndex here; confirm that assigning `.name` (rather than `.names`)
# labels it as intended.
df.columns.name = "Metric"
df.index.name = "Phoneme"

# Put "Word Pair" first, then order the rest by language (first word of the
# column label); the stable sort keeps Accuracy ahead of Std per language.
metric_columns = [col for col in df.columns if col != "Word Pair"]
metric_columns = sorted(metric_columns, key=lambda label: label.split()[0])
columns = ["Word Pair"] + metric_columns
df = df[columns]

# Show the resulting table.
print(df)

Metric  Word Pair  Danish Accuracy  Danish Std  Hungarian Accuracy  \
ʁ  ʁ            2             1.00        0.00                0.85   
æ  æ            2             1.00        0.00                0.91   
ø  y            2             1.00        0.00                0.89   
ɶ  ɶ            1             1.00        0.00                0.89   
a  æ            2             1.00        0.00                0.54   
y  ø            2             0.81        0.40                0.28   
v  p            1             1.00        0.00                0.96   
e  e            1             1.00        0.00                0.89   
i  æ            1             0.62        0.52                0.07   
u  u            2             0.88        0.34                0.91   
ʁ  l            1             1.00        0.00                1.00   
a  a            1             1.00        0.00                0.96   
ø  u            3             1.00        0.00                0.94   
u  y            1   