In [3]:
import pandas as pd
import os

In [155]:
# Load surprisal data from CSV
def load_surprisal_data(language):
    file_path = f"results/surprisal_wav2vec_lm/{language}_surprisal_data_wav2vec_golos.csv"
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        df.rename(columns={'Surprisal Data': 'Surprisal Data Wav2vec'}, inplace=True)
        return df
    else:
        print(f"Error: File '{file_path}' not found.")
        return None

def load_surprisal_data_whisper(language):
    file_path = f"results/surprisal_whisper_medium_ru/{language}_surprisal_data_whisper_medium_ru.csv"
    if os.path.exists(file_path):
        df =  pd.read_csv(file_path)
        df.rename(columns={'Surprisal Data': 'Surprisal Data Whisper'}, inplace=True)
        return df
    else:
        print(f"Error: File '{file_path}' not found.")
        return None
        
def open_dataframe(language, folder_path="results/intelligibility/"):
    file_name = f"{folder_path}{language}_average_results.csv"
    file_name_correct = f"{folder_path}{language}_average_results_correct.csv"

    # Load original and corrected results
    df = pd.read_csv(file_name)
    #df_correct = pd.read_csv(file_name_correct)
    # Merge dataframes on 'source_text_to_be_translated'
    #df = pd.merge(df, df_correct[['source_text_to_be_translated', 'user_free_translation_time_taken']], on='source_text_to_be_translated', suffixes=('', '_correct'), how='left')

    return df

# Load distances
def open_variables(language):
    file_name = "data/all_variables.csv"
    df = pd.read_csv(file_name)
    df = df[df['language'] == language]
    df.rename(columns={'source_text_to_be_translated': 'Expression L2'}, inplace=True)
    df.drop(columns = ['model_gpt_small_avg_surprisal_phrase_ru', 'model_gpt_small_avg_surprisal_literal', 'model_gpt_small_avg_surprisal_phrase_l2','model_gpt_large_avg_surprisal_phrase_ru', 'model_gpt_large_avg_surprisal_literal', 'model_gpt_large_avg_surprisal_phrase_l2', 'correct_percentage_mcq','language',	'correct_percentage_free',	'average_time_free', 'average_time_mcq'], inplace=True)
    return df



In [157]:
languages = ["BE", "BG", "CS", "UK", "PL"]
surprisal_golos = dict()
surprisal_whisper = dict()
combined_data = dict()
distance_variables = dict()
experiment_results = dict()
for language in languages:
    surprisal_golos[language] = load_surprisal_data(language)
    surprisal_golos[language] = load_surprisal_data(language)
    surprisal_whisper[language] = load_surprisal_data_whisper(language)
    distance_variables[language] = open_variables(language)
    experiment_results[language] = open_dataframe(language)
    experiment_results[language].columns = ['Expression L2','user_free_translation_time_taken',	'user_mcq_translation_time_taken', 'accuracy_mcq', 'accuracy_free']#, 'free_translation_time_correct']
    combined_df = pd.merge(experiment_results[language], surprisal_golos[language], on='Expression L2', how='inner')
    combined_df = pd.merge(combined_df, surprisal_whisper[language], on='Expression L2', how='inner')
    combined_df = pd.merge(combined_df, distance_variables[language], on='Expression L2', how='inner')
    combined_data[language] = combined_df
    combined_data['all'] = pd.concat(list(combined_data.values()))


In [158]:
combined_data['all']

Unnamed: 0,Expression L2,user_free_translation_time_taken,user_mcq_translation_time_taken,accuracy_mcq,accuracy_free,audio_number_x,Expression RU_x,Sentence_x,Surprisal Data Wav2vec,audio_number_y,...,was_fixed,pwld_literal,pwld_fixed,RU,model_bert_small_avg_surprisal_phrase_ru,model_bert_small_avg_surprisal_literal,model_bert_small_avg_surprisal_phrase_l2,model_bert_large_avg_surprisal_phrase_ru,model_bert_large_avg_surprisal_literal,model_bert_large_avg_surprisal_phrase_l2
0,а то і,12.500000,4.055556,100.000000,55.555556,16,а то и,"яна не верыць мне і думае, што яе пакараюць не...",14.011767,16,...,3.205246,0.363158,0.326316,а то и,0.000000,0.000000,0.000000,1.995531,6.530054,0.848966
1,ад таго што,11.111111,7.277778,94.444444,72.222222,20,оттого что,"прачнуўшыся ад таго, што мяне тармасіла лі-лі,...",13.971160,20,...,2.886588,0.239234,0.194079,оттого что,15.579988,12.436823,15.687457,0.066178,3.524041,0.237431
2,адным словам,11.458333,5.375000,83.333333,79.166667,33,одним словом,"адным словам, пад мінскам есць такая спецыяльн...",14.645604,33,...,3.469444,0.064593,0.064593,одним словом,19.848414,20.621407,23.131944,0.964844,6.163736,7.806912
3,але не,10.277778,5.222222,94.444444,16.666667,19,хотя и,тыгр здольны лазіць (але не так добра).,14.394813,19,...,3.191823,0.284211,0.324561,хотя и,20.307293,16.877299,16.842264,0.132899,8.756669,4.283575
4,амаль што,11.500000,5.750000,75.000000,16.666667,45,почти что,"яна разумела і нават апраўдвала, калі на гэта ...",13.747244,45,...,2.962412,0.236842,0.279605,почти что,0.000000,0.000000,0.000000,4.144357,4.558953,2.045249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53,więc jednak,14.136364,7.090909,45.454545,0.000000,37,все же,"odróżnia nas... tak..., więc jednak",13.972869,37,...,3.485034,0.283626,0.652632,все же,18.476286,17.221178,34.307155,9.139466,11.092093,9.528961
54,wszystko jedno,22.705882,7.705882,64.705882,11.764706,2,все равно,"zamieszanie potęgował fakt, że niektórzy zacho...",11.729475,2,...,3.696618,0.406015,0.351974,все равно,19.194976,15.263934,38.355094,0.150975,4.155643,4.545707
55,wygląda na to,17.466667,5.000000,73.333333,13.333333,35,судя по всему,"wygląda na to, że lubimy takie pochwały.",12.576071,35,...,3.130294,0.260526,0.291866,судя по всему,18.350241,23.179404,23.558151,0.286702,4.019640,9.612821
56,z tego że,13.652174,5.347826,34.782609,4.347826,20,оттого что,pomiędzy nią a resztą pracowników nastąpiła kr...,9.950880,20,...,3.392154,0.075658,0.177632,оттого что,21.258415,13.903686,18.277955,1.012232,4.985864,3.013913


In [160]:
combined_data['all'].corr()

  combined_data['all'].corr()


Unnamed: 0,user_free_translation_time_taken,user_mcq_translation_time_taken,accuracy_mcq,accuracy_free,audio_number_x,Surprisal Data Wav2vec,audio_number_y,Surprisal Data Whisper,was_literal,was_fixed,pwld_literal,pwld_fixed,model_bert_small_avg_surprisal_phrase_ru,model_bert_small_avg_surprisal_literal,model_bert_small_avg_surprisal_phrase_l2,model_bert_large_avg_surprisal_phrase_ru,model_bert_large_avg_surprisal_literal,model_bert_large_avg_surprisal_phrase_l2
user_free_translation_time_taken,1.0,0.464914,-0.053324,-0.283384,0.03551,-0.326742,0.03551,-0.005482,0.077187,0.072446,0.070323,0.20805,-0.058121,0.01061,-0.055826,-0.010111,0.025385,0.097075
user_mcq_translation_time_taken,0.464914,1.0,-0.32887,-0.467291,0.002201,-0.308439,0.002201,-0.000748,-0.021579,0.118188,-0.013939,0.244972,0.004526,0.036619,0.096071,0.104092,0.091504,0.27821
accuracy_mcq,-0.053324,-0.32887,1.0,0.470072,-0.04459,0.399195,-0.04459,0.193176,-0.107377,-0.25919,0.158943,-0.4148,0.012436,0.006003,-0.150123,-0.034947,0.103829,-0.22556
accuracy_free,-0.283384,-0.467291,0.470072,1.0,-0.064515,0.404457,-0.064515,0.222628,-0.082167,-0.178237,-0.013643,-0.403207,-0.064425,-0.123663,-0.08008,-0.003925,-0.199954,-0.38567
audio_number_x,0.03551,0.002201,-0.04459,-0.064515,1.0,-0.070632,1.0,-0.03392,0.014985,0.123456,0.074562,0.074945,0.007207,-0.063493,-0.051453,-0.033262,-0.050743,-0.059808
Surprisal Data Wav2vec,-0.326742,-0.308439,0.399195,0.404457,-0.070632,1.0,-0.070632,0.029189,-0.064883,-0.052357,0.024212,-0.190831,0.015014,-0.037253,-0.103259,-0.039129,-0.057149,-0.363981
audio_number_y,0.03551,0.002201,-0.04459,-0.064515,1.0,-0.070632,1.0,-0.03392,0.014985,0.123456,0.074562,0.074945,0.007207,-0.063493,-0.051453,-0.033262,-0.050743,-0.059808
Surprisal Data Whisper,-0.005482,-0.000748,0.193176,0.222628,-0.03392,0.029189,-0.03392,1.0,-0.008907,-0.101063,0.052247,-0.162348,-0.021327,-0.114212,-0.086869,0.070268,-0.052651,-0.064881
was_literal,0.077187,-0.021579,-0.107377,-0.082167,0.014985,-0.064883,0.014985,-0.008907,1.0,0.285429,0.123856,0.083033,-0.005796,-0.031906,0.087457,0.030524,-0.038752,-0.028742
was_fixed,0.072446,0.118188,-0.25919,-0.178237,0.123456,-0.052357,0.123456,-0.101063,0.285429,1.0,0.031323,0.377947,0.013072,0.097139,0.075431,-0.111751,-0.032485,0.058116


## Below is correlation with distances

In [132]:
import pandas as pd
from scipy.stats import pearsonr


# Initialize empty lists to store correlations and p-values
corr_ortho_list, p_value_ortho_list = [], []
corr_phono_list, p_value_phono_list = [], []

# Iterate over languages
for language, df in combined_data.items():
    # Extracting relevant columns
    accuracy_mcq = df['accuracy_mcq']
    orthographic_distances = df['was_fixed']
    phonological_distances = df['pwld_fixed']

    # Calculating correlations
    corr_ortho, p_value_ortho = pearsonr(orthographic_distances, accuracy_mcq)
    corr_phono, p_value_phono = pearsonr(phonological_distances, accuracy_mcq)

    # Append results to lists
    corr_ortho_list.append(corr_ortho)
    p_value_ortho_list.append(p_value_ortho)
    corr_phono_list.append(corr_phono)
    p_value_phono_list.append(p_value_phono)

# Creating a DataFrame with results
correlation_results = pd.DataFrame({
    'Orthographic Distance': corr_ortho_list,
    'Phonological Distance': corr_phono_list,
    'p-value (Orthographic)': p_value_ortho_list,
    'p-value (Phonological)': p_value_phono_list
}, index=list(combined_data.keys()))

# Formatting for LaTeX table
latex_table = correlation_results.round(3).astype(str)
latex_table.to_latex('correlation_table_all_languages.tex', escape=False)


  latex_table.to_latex('correlation_table_all_languages.tex', escape=False)


In [133]:
import pandas as pd
from scipy.stats import pearsonr


# Initialize empty lists to store correlations and p-values
corr_ortho_list, p_value_ortho_list = [], []
corr_phono_list, p_value_phono_list = [], []

# Iterate over languages
for language, df in combined_data.items():
    # Extracting relevant columns
    accuracy_mcq = df['accuracy_free']
    orthographic_distances = df['was_fixed']
    phonological_distances = df['pwld_fixed']

    # Calculating correlations
    corr_ortho, p_value_ortho = pearsonr(orthographic_distances, accuracy_mcq)
    corr_phono, p_value_phono = pearsonr(phonological_distances, accuracy_mcq)

    # Append results to lists
    corr_ortho_list.append(corr_ortho)
    p_value_ortho_list.append(p_value_ortho)
    corr_phono_list.append(corr_phono)
    p_value_phono_list.append(p_value_phono)

# Creating a DataFrame with results
correlation_results = pd.DataFrame({
    'Orthographic Distance': corr_ortho_list,
    'Phonological Distance': corr_phono_list,
    'p-value (Orthographic)': p_value_ortho_list,
    'p-value (Phonological)': p_value_phono_list
}, index=list(combined_data.keys()))

# Formatting for LaTeX table
latex_table = correlation_results.round(3).astype(str)
latex_table.to_latex('correlation_free_table_all_languages.tex', escape=False)


  latex_table.to_latex('correlation_free_table_all_languages.tex', escape=False)


## Below is correlation with surprisal 

In [135]:
import pandas as pd
from scipy.stats import pearsonr

# Initialize empty lists to store correlations and p-values
corr_ortho_list, p_value_ortho_list = [], []
corr_phono_list, p_value_phono_list = [], []

# Iterate over languages
for language, df in combined_data.items():
    # Extracting relevant columns
    accuracy_mcq = df['accuracy_mcq']
    orthographic_distances = df['Surprisal Data Wav2vec']
    phonological_distances = df['Surprisal Data Whisper']

    # Calculating correlations
    corr_ortho, p_value_ortho = pearsonr(orthographic_distances, accuracy_mcq)
    corr_phono, p_value_phono = pearsonr(phonological_distances, accuracy_mcq)

    # Append results to lists
    corr_ortho_list.append(corr_ortho)
    p_value_ortho_list.append(p_value_ortho)
    corr_phono_list.append(corr_phono)
    p_value_phono_list.append(p_value_phono)

# Creating a DataFrame with results
correlation_results = pd.DataFrame({
    'Wav2Vec': corr_ortho_list,
    'Whisper': corr_phono_list,
    'p-value (Wav2Vec)': p_value_ortho_list,
    'p-value (Whisper)': p_value_phono_list
}, index=list(combined_data.keys()))

# Formatting for LaTeX table
latex_table = correlation_results.round(3).astype(str)
latex_table.to_latex('correlation_table_surprisal_mcq.tex', escape=False)

  latex_table.to_latex('correlation_table_surprisal_mcq.tex', escape=False)


In [None]:
import pandas as pd
from scipy.stats import pearsonr


# Initialize empty lists to store correlations and p-values
corr_ortho_list, p_value_ortho_list = [], []
corr_phono_list, p_value_phono_list = [], []

# Iterate over languages
for language, df in combined_data.items():
    # Extracting relevant columns
    accuracy_mcq = df['accuracy_free']
    surprisal_wav2vec = df['Surprisal Data Wav2vec']
    surprisal_whisper = df['Surprisal Data Whisper']

    # Calculating correlations
    corr_w2v, p_value_w2v = pearsonr(surprisal_wav2vec, accuracy_mcq)
    corr_whisper, p_value_whisper = pearsonr(surprisal_whisper, accuracy_mcq)

    # Append results to lists
    corr_ortho_list.append(corr_w2v)
    p_value_ortho_list.append(p_value_w2v)
    corr_phono_list.append(corr_whisper)
    p_value_phono_list.append(p_value_whisper)

# Creating a DataFrame with results
correlation_results = pd.DataFrame({
    'Wav2Vec': corr_ortho_list,
    'Whisper': corr_phono_list,
    'p-value (Wav2Vec)': p_value_ortho_list,
    'p-value (Whisper)': p_value_phono_list
}, index=list(combined_data.keys()))

# Formatting for LaTeX table
latex_table = correlation_results.round(3).astype(str)
latex_table.to_latex('correlation_table_surprisal_free.tex', escape=False)

In [137]:
import pandas as pd
from scipy.stats import pearsonr


# Initialize empty lists to store correlations and p-values
corr_ortho_list, p_value_ortho_list = [], []
corr_phono_list, p_value_phono_list = [], []

# Iterate over languages
for language, df in combined_data.items():
    # Extracting relevant columns
    accuracy_mcq = df['accuracy_free']
    orthographic_distances = df['Surprisal Data Wav2vec']
    phonological_distances = df['Surprisal Data Whisper']

    # Calculating correlations
    corr_ortho, p_value_ortho = pearsonr(orthographic_distances, accuracy_mcq)
    corr_phono, p_value_phono = pearsonr(phonological_distances, accuracy_mcq)

    # Append results to lists
    corr_ortho_list.append(corr_ortho)
    p_value_ortho_list.append(p_value_ortho)
    corr_phono_list.append(corr_phono)
    p_value_phono_list.append(p_value_phono)

# Creating a DataFrame with results
correlation_results = pd.DataFrame({
    'Wav2Vec': corr_ortho_list,
    'Whisper': corr_phono_list,
    'p-value (Wav2Vec)': p_value_ortho_list,
    'p-value (Whisper)': p_value_phono_list
}, index=list(combined_data.keys()))

# Formatting for LaTeX table
latex_table = correlation_results.round(3).astype(str)
latex_table.to_latex('correlation_table_surprisal_free.tex', escape=False)

  latex_table.to_latex('correlation_table_surprisal_free.tex', escape=False)


In [142]:
for language in combined_data:
    combined_data[language] = combined_data[language].dropna(subset=['free_translation_time_correct'])

In [143]:
import pandas as pd
from scipy.stats import pearsonr

# Initialize empty lists to store correlations and p-values
corr_ortho_list, p_value_ortho_list = [], []
corr_phono_list, p_value_phono_list = [], []

# Iterate over languages
for language, df in combined_data.items():
    print(language)
    # Extracting relevant columns
    accuracy_mcq = df['free_translation_time_correct']
    orthographic_distances = df['Surprisal Data Wav2vec']
    phonological_distances = df['Surprisal Data Whisper']

    # Calculating correlations
    corr_ortho, p_value_ortho = pearsonr(orthographic_distances, accuracy_mcq)
    corr_phono, p_value_phono = pearsonr(phonological_distances, accuracy_mcq)

    # Append results to lists
    corr_ortho_list.append(corr_ortho)
    p_value_ortho_list.append(p_value_ortho)
    corr_phono_list.append(corr_phono)
    p_value_phono_list.append(p_value_phono)

# Creating a DataFrame with results
correlation_results = pd.DataFrame({
    'Wav2Vec': corr_ortho_list,
    'Whisper': corr_phono_list,
    'p-value (Wav2Vec)': p_value_ortho_list,
    'p-value (Whisper)': p_value_phono_list
}, index=list(combined_data.keys()))

# Print the DataFrame instead of saving to LaTeX
print(correlation_results.round(3))


BE
BG
CS
UK
PL
    Wav2Vec  Whisper  p-value (Wav2Vec)  p-value (Whisper)
BE   -0.344   -0.129              0.009              0.342
BG   -0.373   -0.065              0.011              0.666
CS   -0.165    0.002              0.301              0.991
UK   -0.224   -0.091              0.097              0.506
PL   -0.208   -0.029              0.166              0.849


In [140]:
import pandas as pd
from scipy.stats import pearsonr


# Initialize empty lists to store correlations and p-values
corr_ortho_list, p_value_ortho_list = [], []
corr_phono_list, p_value_phono_list = [], []

# Iterate over languages
for language, df in combined_data.items():
    # Extracting relevant columns
    accuracy_mcq = df['user_mcq_translation_time_taken']
    orthographic_distances = df['Surprisal Data Wav2vec']
    phonological_distances = df['Surprisal Data Whisper']

    # Calculating correlations
    corr_ortho, p_value_ortho = pearsonr(orthographic_distances, accuracy_mcq)
    corr_phono, p_value_phono = pearsonr(phonological_distances, accuracy_mcq)

    # Append results to lists
    corr_ortho_list.append(corr_ortho)
    p_value_ortho_list.append(p_value_ortho)
    corr_phono_list.append(corr_phono)
    p_value_phono_list.append(p_value_phono)

# Creating a DataFrame with results
correlation_results = pd.DataFrame({
    'Wav2Vec': corr_ortho_list,
    'Whisper': corr_phono_list,
    'p-value (Wav2Vec)': p_value_ortho_list,
    'p-value (Whisper)': p_value_phono_list
}, index=list(combined_data.keys()))

# Print the DataFrame instead of saving to LaTeX
print(correlation_results.round(3))


    Wav2Vec  Whisper  p-value (Wav2Vec)  p-value (Whisper)
BE   -0.108   -0.015              0.422              0.910
BG   -0.215    0.039              0.106              0.772
CS    0.029   -0.154              0.828              0.239
UK   -0.187    0.137              0.152              0.296
PL   -0.085   -0.011              0.526              0.937


In [162]:
import pandas as pd
from scipy.stats import pearsonr


# Initialize empty lists to store correlations and p-values
corr_ortho_list, p_value_ortho_list = [], []
corr_phono_list, p_value_phono_list = [], []

corr_wav_list, p_value_wav_list = [], []
corr_whisper_list, p_value_whisper_list = [], []


# Iterate over languages
for language, df in combined_data.items():
    print(language)
    # Extracting relevant columns
    accuracy_mcq = df['accuracy_free']
    orthographic_distances = df['was_fixed']
    phonological_distances = df['pwld_fixed']
    wav_distances = df['Surprisal Data Wav2vec']
    whisper_distances = df['Surprisal Data Whisper']
    # Calculating correlations
    corr_ortho, p_value_ortho = pearsonr(orthographic_distances, accuracy_mcq)
    corr_phono, p_value_phono = pearsonr(phonological_distances, accuracy_mcq)
    corr_wav, p_value_wav = pearsonr(wav_distances, accuracy_mcq)
    corr_whisper, p_value_whisper = pearsonr(whisper_distances, accuracy_mcq)

    # Append results to lists
    corr_ortho_list.append(corr_ortho)
    p_value_ortho_list.append(p_value_ortho)
    corr_phono_list.append(corr_phono)
    p_value_phono_list.append(p_value_phono)
    corr_wav_list.append(corr_wav)
    p_value_wav_list.append(p_value_wav)
    corr_whisper_list.append(corr_whisper)
    p_value_whisper_list.append(p_value_whisper)

# Creating a DataFrame with results
correlation_results = pd.DataFrame({
    'WAS': corr_ortho_list,
    'PWLD': corr_phono_list,
    'wav2vec': corr_wav_list,
    'whisper': corr_whisper_list,
    'p-value (WAS)': p_value_ortho_list,
    'p-value (pWLD)': p_value_phono_list,
    'p-value (wav2vec)': p_value_wav_list,
    'p-value (whisper)': p_value_whisper_list
}, index=list(combined_data.keys()))

# Print the DataFrame instead of saving to LaTeX
print(correlation_results.round(3))


BE
all
BG
CS
UK
PL
       WAS   PWLD  wav2vec  whisper  p-value (WAS)  p-value (pWLD)  \
BE  -0.210 -0.300    0.054    0.427          0.114           0.022   
all -0.178 -0.403    0.404    0.223          0.000           0.000   
BG  -0.339 -0.445    0.243    0.099          0.009           0.000   
CS  -0.006 -0.093    0.009    0.232          0.966           0.479   
UK  -0.054 -0.558    0.355    0.120          0.681           0.000   
PL  -0.012 -0.284    0.030    0.039          0.929           0.031   

     p-value (wav2vec)  p-value (whisper)  
BE               0.688              0.001  
all              0.000              0.000  
BG               0.066              0.458  
CS               0.947              0.074  
UK               0.005              0.361  
PL               0.822              0.773  


In [175]:
import pandas as pd
import statsmodels.api as sm
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import numpy as np

# Assuming combined_data is a dictionary of DataFrames

# Iterate over languages
for language, df in combined_data.items():
    print(language)
    # Extracting relevant columns
    accuracy_mcq = df['accuracy_free']
    independent_variables = df[['was_fixed', 'pwld_fixed', 'Surprisal Data Wav2vec', 'Surprisal Data Whisper']]  # Exclude 'accuracy_free'

    # Perform multiple regression
    X = sm.add_constant(independent_variables)
    model = sm.OLS(accuracy_mcq, X).fit()

    # Print the regression summary
    print(f'Regression results for {language}:\n')
    print(model.summary())
    print('\n' + '='*80 + '\n')  # Separating results for different languages


BE
Regression results for BE:

                            OLS Regression Results                            
Dep. Variable:          accuracy_free   R-squared:                       0.239
Model:                            OLS   Adj. R-squared:                  0.182
Method:                 Least Squares   F-statistic:                     4.168
Date:                Wed, 06 Mar 2024   Prob (F-statistic):            0.00522
Time:                        14:40:53   Log-Likelihood:                -260.28
No. Observations:                  58   AIC:                             530.6
Df Residuals:                      53   BIC:                             540.9
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------

In [199]:
import pandas as pd
from tabulate import tabulate

# Assuming combined_data is a dictionary of DataFrames

# Initialize empty lists to store regression results
regression_results = []

# Iterate over languages
for language, df in combined_data.items():
    # Extracting relevant columns
    dependent_variable = 'accuracy_free'
    independent_variables = df[['was_fixed', 'pwld_fixed', 'Surprisal Data Wav2vec', 'Surprisal Data Whisper']]  # Exclude 'accuracy_free'

    # Perform multiple regression
    predictors = independent_variables.columns
    X = sm.add_constant(independent_variables)
    model = sm.OLS(df[dependent_variable], X).fit()

    # Extract relevant information from the regression results
    significant_predictors = model.pvalues[model.pvalues < 0.05].index
    for predictor in significant_predictors:
        t_value = model.tvalues[predictor]
        p_value = model.pvalues[predictor]
        r_squared = round(model.rsquared, 2)

        # Append results to the list
        regression_results.append({
            'Language': language,
            'Predictor': predictor,
            'R-squared': r_squared,
            't-value': round(t_value, 3),
            'p-value': round(p_value, 3),
        })

# Create a DataFrame from the list
results_df = pd.DataFrame(regression_results)

# Convert the DataFrame to LaTeX format and print
latex_table = tabulate(results_df, headers='keys', tablefmt='latex_raw')
print(latex_table)


\begin{tabular}{rllrrr}
\hline
    & Language   & Predictor              &   R-squared &   t-value &   p-value \\
\hline
  0 & BE         & Surprisal Data Whisper &        0.24 &     3.165 &     0.003 \\
  1 & all        & const                  &        0.3  &    -5.157 &     0     \\
  2 & all        & pwld_fixed             &        0.3  &    -9.583 &     0     \\
  3 & all        & Surprisal Data Wav2vec &        0.3  &    11.828 &     0     \\
  4 & all        & Surprisal Data Whisper &        0.3  &     5.606 &     0     \\
  5 & BG         & was_fixed              &        0.29 &    -2.015 &     0.049 \\
  6 & BG         & pwld_fixed             &        0.29 &    -2.888 &     0.006 \\
  7 & UK         & was_fixed              &        0.41 &     2.083 &     0.042 \\
  8 & UK         & pwld_fixed             &        0.41 &    -4.941 &     0     \\
  9 & UK         & Surprisal Data Wav2vec &        0.41 &     2.229 &     0.03  \\
 10 & PL         & pwld_fixed             &      

In [183]:
model.rsquared

0.08734735759072854