In [81]:
import os
import pandas as pd
import pingouin as pg
import seaborn as sns
import glob
import numpy as np
from nltk.tokenize import word_tokenize
import nltk
from scipy.stats import ttest_ind
from scipy.stats import chi2_contingency
# Download NLTK data (tokenizers)
nltk.download('punkt')
#pyspellchecker
from spellchecker import SpellChecker

[nltk_data] Downloading package punkt to /Users/Nikita/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [56]:
os.getcwd()

'/Users/Nikita/Desktop/EPL/experimental_psycholinguistics_2023/Exam_experiment'

In [57]:
# the only o´columns we'll need
useful_col = ["sound", "response"]
#Make df for condition a responses 
# load in jsonfiles
json_files = glob.glob(os.path.join("..","in","a","*.json"))  # Adjust the path to your actual directory
dataframes = [pd.read_json(file) for file in json_files]
a_df = pd.concat(dataframes, ignore_index=True)
df_a = a_df[useful_col].copy()
df_a['sound'] = df_a['sound'].astype(str)
#Making acolumn that defines that states this as condition a
df_a.insert(1, 'condition', 'a')
df_a


Unnamed: 0,sound,condition,response
0,18,a,mary could not discuss the thing
1,17,a,the bomb exploded with a blast
2,n-1,a,the roses bushes quickly thorns
3,n-2,a,we should have considered the juice
4,10,a,You are considering the gang
...,...,...,...
175,12,a,nancy should consider the fist
176,n-22,a,the farmer harvested his crops
177,5,a,roof poured the water down the drain
178,15,a,i've spoken about the pile


In [58]:
#dataframe b
json_files = glob.glob(os.path.join("..","in","b","*.json"))  # Adjust the path to your actual directory
dataframes = [pd.read_json(file) for file in json_files]
b_df = pd.concat(dataframes, ignore_index=True)
df_b = b_df[useful_col].copy()
df_b['sound'] = df_a['sound'].astype(str)
df_b.insert(1, 'condition', 'b')

# ahh it seems that the test data accidentally was transferred and by that leaves a lot of columns with "NaN"
# let's just remove it
df_b = df_b.dropna(subset=['sound'])
df_b

Unnamed: 0,sound,condition,response
0,18,b,a rose bush has prickly thorns
1,17,b,we should have considered the juice
2,n-1,b,the bomb exploded with a blast
3,n-2,b,mary could not discuss the tax
4,10,b,they marched to the beat of the drum
...,...,...,...
175,12,b,the sailor swapped the deck
176,n-22,b,bob could have known about the spoon'
177,5,b,the food was shipped in wooden crates
178,15,b,Ann was interested in the breath


In [59]:
# for now, let's just merge the two dataframes into one big masterdf so we can do some t-testing! 
df = pd.concat([df_a, df_b])
df

Unnamed: 0,sound,condition,response
0,18,a,mary could not discuss the thing
1,17,a,the bomb exploded with a blast
2,n-1,a,the roses bushes quickly thorns
3,n-2,a,we should have considered the juice
4,10,a,You are considering the gang
...,...,...,...
175,12,b,the sailor swapped the deck
176,n-22,b,bob could have known about the spoon'
177,5,b,the food was shipped in wooden crates
178,15,b,Ann was interested in the breath


In [60]:
# loading in the correct sentence answers and merging them with the dataframe
correct_df = pd.read_csv(os.path.join("in","sentences_3.csv"), delimiter=";", decimal=",")
df = pd.merge(df, correct_df, on='sound')
df = df.sort_values(by='sound', ascending=True)
# Convert "response" and "sentences" columns to lowercase
df['response'] = df['response'].str.lower()
df['sentences'] = df['sentences'].str.lower()
df

Unnamed: 0,sound,condition,response,predictability,sentences
335,1,b,the boy gave the football a kick,(H),a rose bush has prickly thorns.
336,1,b,a lord pushes,(H),a rose bush has prickly thorns.
334,1,a,a rosebush have prickley thornes,(H),a rose bush has prickly thorns.
333,1,a,a rosebush has prickly thorns,(H),a rose bush has prickly thorns.
332,1,a,a rose soon,(H),a rose bush has prickly thorns.
...,...,...,...,...,...
114,n-32,a,tom is talking about the fee,(L),tom is talking about the fee.
113,n-32,a,thom is talking about the fee,(L),tom is talking about the fee.
112,n-32,a,tom is talking about the fee.,(L),tom is talking about the fee.
110,n-32,a,tom is talking about the fee,(L),tom is talking about the fee.


In [67]:
# now we are ready to tokenize the words and count how many of the words articipants got right for each sound file
# tokenize the words 
df['response_words'] = df['response'].apply(word_tokenize)
df['sentence_words'] = df['sentences'].apply(word_tokenize)

# Initialize spell checker
spell = SpellChecker()

# Trying to adjust for spelling mistakes. It's not perfect but better than nothing
df['response_words'] = df['response_words'].apply(lambda words: [spell.correction(word) for word in words])


# Comparing the lists of words and counting the correct ones
df['word_count'] = [sum(word in sentence_words for word in response_words)
                                   for response_words, sentence_words
                                   in zip(df['response_words'], df['sentence_words'])]

print(df[['sound', 'response', 'sentences', 'word_count']])

df.to_csv(os.path.join("out", "results_all.csv"))


    sound                          response                        sentences  \
335     1  the boy gave the football a kick  a rose bush has prickly thorns.   
336     1                    a lord pushes   a rose bush has prickly thorns.   
334     1  a rosebush have prickley thornes  a rose bush has prickly thorns.   
333     1     a rosebush has prickly thorns  a rose bush has prickly thorns.   
332     1              a rose          soon  a rose bush has prickly thorns.   
..    ...                               ...                              ...   
114  n-32      tom is talking about the fee    tom is talking about the fee.   
113  n-32     thom is talking about the fee    tom is talking about the fee.   
112  n-32     tom is talking about the fee.    tom is talking about the fee.   
110  n-32      tom is talking about the fee    tom is talking about the fee.   
118  n-32     the farmer harvested his crop    tom is talking about the fee.   

     word_count  
335           1  
336

In [68]:
# Separate the DataFrame into two based on the 'sound' column
df_with_n = df[df['sound'].str.startswith('n-')]
df_without_n = df[~df['sound'].str.startswith('n-')]
df_with_n.to_csv(os.path.join("out", "results_noisy.csv"))
df_without_n.to_csv(os.path.join("out", "results_clear.csv"))

In [65]:
# Perform t-test using pinguin
t_result = pg.ttest(df_with_n['word_count'], df_without_n['word_count'], paired=False)

# Display the results
print(t_result)

# Check if the difference is statistically significant
alpha = 0.05
if t_result['p-val'][0] < alpha:
    print("The difference in word counts of noisy and clear is statistically significant.")
else:
    print("The difference in word counts of noisy and clear is not statistically significant.")


               T  dof alternative     p-val          CI95%   cohen-d   BF10  \
T-test -0.870573  358   two-sided  0.384571  [-0.72, 0.28]  0.091766  0.168   

           power  
T-test  0.139817  
The difference in word counts of noisy and clear is not statistically significant.


  if t_result['p-val'][0] < alpha:
