## Variable Acquisition

### Transcriptions, PWLD (phonological distance), Word Adaptation Surprisal (orthographic distance), Average time per Response, Surprisal from Language Models

### Transcriptions

In [5]:
from get_metrics.pwld import *
import pandas as pd
import matplotlib.pyplot as plt
import subprocess

In [11]:
# Read every sheet of the experiment workbook into a dict keyed by sheet name
xls = pd.ExcelFile('data/experiment_data.xlsx')
experiment_data = {
    sheet_name: pd.read_excel(xls, sheet_name=sheet_name)
    for sheet_name in xls.sheet_names
}
# Read the raw participant results and group them by the 'source_language' column
experiment_results = pd.read_csv('data/experiment_results_raw.csv')
experiment_results_lang = experiment_results.groupby('source_language')

In [10]:
# Report, per sheet, the mean number of whitespace-separated tokens in the 'sentence l2' column
for sheet in experiment_data:
    token_counts = experiment_data[sheet]['sentence l2'].map(lambda text: len(str(text).split()))
    mean_length = round(token_counts.mean(), 1)
    print(f"Average sentence length in tokens for {sheet}: {mean_length}")

Average sentence length in tokens for CS-RU: 11.3
Average sentence length in tokens for BG-RU: 14.9
Average sentence length in tokens for UK-RU: 14.8
Average sentence length in tokens for BE-RU: 15.3
Average sentence length in tokens for PL-RU: 13.6


In [None]:
# Get transcriptions of fixed expressions
# Each call hands the 'phrase l2' column of one sheet, as a list, to
# get_phonetic_transcription together with a language code (the function is
# presumably provided by the star import from get_metrics.pwld - TODO confirm).
# NOTE(review): ru_transcription and cs_transcription are both built from the
# 'phrase l2' column of the CS-RU sheet, only the language code differs - confirm
# that the Russian transcriptions are really meant to come from this column.
# NOTE(review): these names are singular (ru_transcription, ...); the PWLD cells
# use the plural *_transcriptions dicts loaded from disk instead.
ru_transcription = get_phonetic_transcription(experiment_data['CS-RU']['phrase l2'].to_list(), 'RU')
cs_transcription = get_phonetic_transcription(experiment_data['CS-RU']['phrase l2'].to_list(), 'CS')
pl_transcription = get_phonetic_transcription(experiment_data['PL-RU']['phrase l2'].to_list(), 'PL')
be_transcription = get_phonetic_transcription(experiment_data['BE-RU']['phrase l2'].to_list(), 'BE')
bg_transcription = get_phonetic_transcription(experiment_data['BG-RU']['phrase l2'].to_list(), 'BG')
uk_transcription = get_phonetic_transcription(experiment_data['UK-RU']['phrase l2'].to_list(), 'UK')

In [None]:
# Get transcriptions of literal options
# Every call transcribes with language code 'RU' and passes the lower-case L2 code
# as a third argument (its meaning is defined by get_phonetic_transcription).
# NOTE(review): the CS-RU sheet is read through the 'L1 literal' column while the
# other sheets use a 'Glosbe & Vasmer ...' column (note the trailing space in the name).
# NOTE(review): the PL-RU line reads the column labelled '(BE)', not '(PL)' - verify
# that this is the actual header in the PL-RU sheet and not a copy-paste slip.
cs_transcription_literal = get_phonetic_transcription(experiment_data['CS-RU']['L1 literal'].to_list(), 'RU', 'cs')
be_transcription_literal = get_phonetic_transcription(experiment_data['BE-RU']['Glosbe & Vasmer L2 (BE) - L1 (RU) '].to_list(), 'RU', 'be')
pl_transcription_literal = get_phonetic_transcription(experiment_data['PL-RU']['Glosbe & Vasmer L2 (BE) - L1 (RU) '].to_list(), 'RU', 'pl')
bg_transcription_literal = get_phonetic_transcription(experiment_data['BG-RU']['Glosbe & Vasmer L2 (BG) - L1 (RU) '].to_list(), 'RU', 'bg')
uk_transcription_literal = get_phonetic_transcription(experiment_data['UK-RU']['Glosbe & Vasmer L2 (UK) - L1 (RU) '].to_list(), 'RU', 'uk')

### Phonologically weighted Levenshtein distance (PWLD)

In [16]:
# Load the stored transcription dict of the fixed expressions for each language
(
    ru_transcriptions,
    cs_transcriptions,
    pl_transcriptions,
    uk_transcriptions,
    bg_transcriptions,
    be_transcriptions,
) = (
    load_transcription_dict(f'data/metrics/transcriptions/{code}_transcriptions')
    for code in ('ru', 'cs', 'pl', 'uk', 'bg', 'be')
)

In [17]:
# Load the stored transcription dict of the literal options for each L2
(
    cs_transcription_literal,
    be_transcription_literal,
    pl_transcription_literal,
    bg_transcription_literal,
    uk_transcription_literal,
) = (
    load_transcription_dict(f'data/metrics/transcriptions/ru_{code}_transcriptions_literal')
    for code in ('cs', 'be', 'pl', 'bg', 'uk')
)

In [18]:
# Get transcriptions of free responses
# Loads one stored transcription dict per L2; the mapping-filter cell below looks
# these dicts up by the normalised free-response text.
cs_transcription_responses = load_transcription_dict('data/metrics/transcriptions/ru_cs_transcriptions_responses')
be_transcription_responses = load_transcription_dict('data/metrics/transcriptions/ru_be_transcriptions_responses')
pl_transcription_responses = load_transcription_dict('data/metrics/transcriptions/ru_pl_transcriptions_responses')
bg_transcription_responses = load_transcription_dict('data/metrics/transcriptions/ru_bg_transcriptions_responses')
uk_transcription_responses = load_transcription_dict('data/metrics/transcriptions/ru_uk_transcriptions_responses')

In [19]:
# Get PWLD for fixed expression options
fixed_transcriptions_by_lang = {
    'cs': cs_transcriptions,
    'uk': uk_transcriptions,
    'be': be_transcriptions,
    'pl': pl_transcriptions,
    'bg': bg_transcriptions,
}
for lang, expression_transcriptions in fixed_transcriptions_by_lang.items():
    pwld_dict = get_pwld(ru_transcriptions, expression_transcriptions)
    # The tuple keys of the result become a two-level index named 'RU' / <L2 code>
    pair_index = pd.MultiIndex.from_tuples(pwld_dict.keys(), names=['RU', lang.upper()])
    df = pd.DataFrame(pwld_dict.values(), index=pair_index, columns=['Value'])
    # One CSV per language pair
    df.to_csv(f'data/results/pwld/pwld_dict_fixed_ru_{lang}.csv')

In [20]:
# Get PWLD for literal expression options
literal_and_fixed_by_lang = {
    'cs': (cs_transcription_literal, cs_transcriptions),
    'uk': (uk_transcription_literal, uk_transcriptions),
    'be': (be_transcription_literal, be_transcriptions),
    'pl': (pl_transcription_literal, pl_transcriptions),
    'bg': (bg_transcription_literal, bg_transcriptions),
}
for lang, (literal_transcriptions, fixed_transcriptions) in literal_and_fixed_by_lang.items():
    # The literal options take the first argument slot, the fixed expressions the second
    pwld_dict = get_pwld(literal_transcriptions, fixed_transcriptions)
    # The tuple keys of the result become a two-level index named 'RU' / <L2 code>
    pair_index = pd.MultiIndex.from_tuples(pwld_dict.keys(), names=['RU', lang.upper()])
    df = pd.DataFrame(pwld_dict.values(), index=pair_index, columns=['Value'])
    # One CSV per language pair
    df.to_csv(f'data/results/pwld/pwld_dict_literal_ru_{lang}.csv')

In [26]:
# Removing invalid entries
# Step 1: per source language, collect the unique (source phrase, free response)
# pairs, both stripped and lower-cased; missing ('nan') and empty responses are dropped.
mappings = dict()
for group_name, group_data in experiment_results_lang:
    sources = group_data['source_text_to_be_translated'].to_list()
    responses = group_data['user_free_translation'].to_list()
    mappings[group_name] = {
        (str(source).strip().lower(), str(response).strip().lower())
        for source, response in zip(sources, responses)
        if str(response).strip().lower() not in ('nan', '')
    }

# The Polish source 'w gruncie' is extended to 'w gruncie rzeczy'
# (presumably so that it matches the key used in pl_transcriptions - TODO confirm).
mappings['PL'] = {
    (pair[0] + ' rzeczy', pair[1]) if pair[0] == 'w gruncie' else pair
    for pair in mappings['PL']
}

# Step 2: look up the transcription of the source phrase (part 1) and of the
# response (part 2) for every pair. A missing key raises KeyError, as before.
source_transcriptions_by_lang = {
    'BE': be_transcriptions,
    'BG': bg_transcriptions,
    'UK': uk_transcriptions,
    'PL': pl_transcriptions,
    'CS': cs_transcriptions,
}
response_transcriptions_by_lang = {
    'BE': be_transcription_responses,
    'BG': bg_transcription_responses,
    'UK': uk_transcription_responses,
    'PL': pl_transcription_responses,
    'CS': cs_transcription_responses,
}
mappings_part1 = {
    code: {pair: lookup[pair[0]] for pair in mappings[code]}
    for code, lookup in source_transcriptions_by_lang.items()
}
mappings_part2 = {
    code: {pair: lookup[pair[1]] for pair in mappings[code]}
    for code, lookup in response_transcriptions_by_lang.items()
}


def _is_unusable_transcription(transcription):
    """Return True for a float (e.g. NaN), an empty value or the literal string 'nan'."""
    # isinstance also covers float subclasses, so len() is never called on a float
    if isinstance(transcription, float):
        return True
    return len(transcription) == 0 or transcription == 'nan'


# Step 3: drop a pair from BOTH dicts when either of its transcriptions is unusable.
# (The previous condition tested mappings_part1 for 'nan' twice and never mappings_part2.)
for code in mappings_part1:
    for pair in list(mappings_part1[code].keys()):
        if (_is_unusable_transcription(mappings_part1[code][pair])
                or _is_unusable_transcription(mappings_part2[code][pair])):
            mappings_part2[code].pop(pair)
            mappings_part1[code].pop(pair)

In [29]:
# Get PWLD for responses
mappings_pwld = dict()  # created here but not filled in this cell
for lang in mappings:
    # get_pwld receives every transcription as a string
    source_as_text = {pair: str(transcription) for pair, transcription in mappings_part1[lang].items()}
    response_as_text = {pair: str(transcription) for pair, transcription in mappings_part2[lang].items()}
    pwld_dict_responses = get_pwld(source_as_text, response_as_text)
    # Keep only the first element of each result key
    pwld_dict_responses = {key[0]: value for key, value in pwld_dict_responses.items()}
    # The (tuple) keys become a two-level index named 'RU' / <language code>
    pair_index = pd.MultiIndex.from_tuples(pwld_dict_responses.keys(), names=['RU', lang.upper()])
    df = pd.DataFrame(pwld_dict_responses.values(), index=pair_index, columns=['Value'])
    # One CSV per language
    df.to_csv(f'data/results/pwld/pwld_dict_responses_ru_{lang}.csv')

### Word Adaptation Surprisal

In [83]:


# Run the orthographic_equivalencies.py script in the notebook namespace
# NOTE(review): both paths start with '/', i.e. they are resolved from the filesystem
# root, unlike the relative 'get_metrics/get_surprisal.py' used further down - confirm.
%run '/get_metrics/was/Multilingual_Cloze_Tests/Distances/orthographic_equivalencies.py'

# Run the utils.py script
# The helpers called by get_was_ortho (orth_dist, levenshtein_distance, character_surprisals, ...)
# are presumably defined by these two scripts - TODO confirm.
%run '/get_metrics/was/Multilingual_Cloze_Tests/Distances/utils.py'

def get_was_ortho(dfs=None, l2_column='phrase L2', literal_column='literal translation'):
    """Compute word adaptation surprisal (WAS) tables for every L2-RU language pair.

    Parameters
    ----------
    dfs : dict or None
        Mapping from sheet name (e.g. 'BG-RU') to a DataFrame. When None, all sheets
        of 'data/experiment_data.xlsx' are read. The mapping is not modified.
    l2_column : str
        Column of each sheet that is renamed to the L2 code (e.g. 'BG').
    literal_column : str
        Column of each sheet that is renamed to 'RU'.
        NOTE(review): other cells of this notebook read a column called 'phrase l2'
        (lower-case) - confirm that the defaults match the workbook headers.

    Returns
    -------
    dict
        Maps each language pair to ``[was_foreign, was_native]``: the results of
        ``word_adaptation_surprisal`` for the direction foreign=L2/native=RU and for
        the reverse direction.

    Raises
    ------
    KeyError
        If a language-pair sheet or one of the two columns is missing.
    """
    # Honour the argument; fall back to the workbook only when nothing was passed
    if dfs is None:
        dfs = pd.read_excel('data/experiment_data.xlsx', sheet_name=None)

    # List of language pairs; any other sheet (e.g. 'stats') is simply not touched
    language_pairs = ['BG-RU', 'CS-RU', 'BE-RU', 'UK-RU', 'PL-RU']
    results_dict = dict()

    for lang_pair in language_pairs:
        l2 = lang_pair.split('-')[0]
        print(l2)

        # Keep only the two text columns, name them after the language codes and drop
        # incomplete rows. This restores the per-sheet preparation that was previously
        # stranded under the "if 'stats' in dfs" branch, where it raised
        # UnboundLocalError instead of running.
        df = (
            dfs[lang_pair][[l2_column, literal_column]]
            .rename(columns={l2_column: l2, literal_column: 'RU'})
            .dropna()
        )

        # Compute Levenshtein distance in both directions
        levensthein_foreign = levenshtein_distance(df, foreign=l2, native='RU', costs=orth_dist)
        levensthein_native = levenshtein_distance(df, foreign='RU', native=l2, costs=orth_dist)

        # Check assertions
        # NOTE(review): .all() reduces each column to a single truth value, so these
        # asserts compare two booleans, not the distances themselves - confirm intent.
        assert levensthein_foreign['LD'].all() == levensthein_native['LD'].all()
        assert levensthein_foreign['normalized LD'].all() == levensthein_native['normalized LD'].all()

        # Compute character surprisals
        probs_foreign, surprisals_foreign = character_surprisals(levensthein_foreign, foreign=l2, native='RU')
        probs_native, surprisals_native = character_surprisals(levensthein_native, foreign='RU', native=l2)

        # Compute character entropy and full conditional entropy.
        # These results are not used below; the calls are kept in case the helpers
        # have side effects - TODO confirm and drop them if they do not.
        char_entropy_foreign = character_entropy(surprisals_foreign, probs_foreign)
        char_entropy_native = character_entropy(surprisals_native, probs_native)
        H_foreign_native = full_conditional_entropy(l2, 'RU', levensthein_foreign, surprisals_native, probs_native)
        H_native_foreign = full_conditional_entropy('RU', l2, levensthein_native, surprisals_foreign, probs_foreign)

        # Compute word adaptation surprisal
        was_foreign = word_adaptation_surprisal(levensthein_foreign, surprisals_foreign, probs_foreign)
        was_native = word_adaptation_surprisal(levensthein_native, surprisals_native, probs_native)
        results_dict[lang_pair] = [was_foreign, was_native]
    return results_dict
        

In [84]:
# Save WAS metrics
result_dict = get_was_ortho(experiment_data)
for language_pair, (was_foreign, was_native) in result_dict.items():
    l2 = language_pair.split('-')[0]
    print(l2)
    # Both tables are reduced to the two text columns plus the normalised WAS score
    kept_columns = [l2, 'RU', 'normalized WAS']
    ru_l2 = was_foreign[kept_columns]
    l2_ru = was_native[kept_columns]
    ru_l2.to_csv(f'data/metrics/was/was_literal_ru_{l2.lower()}.csv', index=False)
    l2_ru.to_csv(f'data/metrics/was/was_literal_{l2.lower()}_ru.csv', index=False)

### Average time per response

In [13]:
def compute_average_times(data, time_field, averaged_time_field):
    """Average a timing column per item and add a per-word version of it.

    Parameters
    ----------
    data : iterable of (group name, DataFrame) pairs, e.g. a pandas GroupBy
        Every frame needs the columns 'correct_translation',
        'source_text_to_be_translated', 'source_text' and ``time_field``.
    time_field : str
        Column holding the times to average.
    averaged_time_field : str
        Name of the added column: mean time divided by the number of
        whitespace-separated words in 'source_text'.

    Returns
    -------
    dict
        Group name -> DataFrame indexed by the three text columns.
    """
    item_keys = ['correct_translation', 'source_text_to_be_translated', 'source_text']
    averaged_times = {}

    for group_name, group_data in data:
        # Mean time over all rows that share the same item
        item_means = group_data[item_keys + [time_field]].groupby(item_keys).mean()

        # Word count of the source text of each item, in index order
        source_texts = item_means.index.get_level_values('source_text')
        words_per_item = [len(text.split()) for text in source_texts]

        averaged_times[group_name] = item_means.assign(
            **{averaged_time_field: item_means[time_field] / words_per_item}
        )

    return averaged_times


def save_averaged_times_to_csv(data, prefix):
    """Write every table in ``data`` to 'data/results/time/<prefix>_<group>.csv'.

    ``data`` maps a group name to a DataFrame; the group name is lower-cased in
    the file name and the index is written along with the columns.
    """
    for group_name in data:
        target_path = f'data/results/time/{prefix}_{group_name.lower()}.csv'
        data[group_name].to_csv(target_path, index=True)


# Free translation: average the time taken per item and store one CSV per language
average_time_taken_free = compute_average_times(
    experiment_results_lang,
    time_field='user_free_translation_time_taken',
    averaged_time_field='time_averaged_free',
)
save_averaged_times_to_csv(average_time_taken_free, prefix='free')

# MCQ translation: same procedure on the MCQ timing column
# NOTE(review): the per-word column is named 'time_averaged_free_mcq' - confirm the name is intended
average_time_taken_mcq = compute_average_times(
    experiment_results_lang,
    time_field='user_mcq_translation_time_taken',
    averaged_time_field='time_averaged_free_mcq',
)
save_averaged_times_to_csv(average_time_taken_mcq, prefix='mcq')

### Surprisal from LMs

In [2]:
# Execute get_surprisal.py in the notebook namespace; the cells below use get_model_df
# and the model_* objects, which are presumably defined by this script - TODO confirm.
%run 'get_metrics/get_surprisal.py'

Using pad_token, but it is not set yet.
Using pad_token, but it is not set yet.


In [74]:
# model_gpt_small: only BE and BG are computed here; the CS, PL and UK calls are
# commented out, so cs_/pl_/uk_df_gpt_small are not defined by this cell.
# get_model_df reports the CSV it writes ("Data saved to ...", see the cell output).
#cs_df_gpt_small = get_model_df(model_gpt_small, 'CS', experiment_data, 'model_gpt_small')
#pl_df_gpt_small = get_model_df(model_gpt_small, 'PL', experiment_data, 'model_gpt_small')
#uk_df_gpt_small = get_model_df(model_gpt_small, 'UK', experiment_data, 'model_gpt_small')
be_df_gpt_small = get_model_df(model_gpt_small, 'BE', experiment_data, 'model_gpt_small')
bg_df_gpt_small = get_model_df(model_gpt_small, 'BG', experiment_data, 'model_gpt_small')

Data saved to data/metrics/surprisal/BE_model_gpt_small_data.csv
Data saved to data/metrics/surprisal/BG_model_gpt_small_data.csv


In [70]:
# model_bert_small: one get_model_df call per L2, in the order CS, PL, UK, BE, BG
(
    cs_df_bert_small,
    pl_df_bert_small,
    uk_df_bert_small,
    be_df_bert_small,
    bg_df_bert_small,
) = (
    get_model_df(model_bert_small, code, experiment_data, 'model_bert_small')
    for code in ('CS', 'PL', 'UK', 'BE', 'BG')
)


Data saved to data/metrics/surprisal/CS_model_bert_small_data.csv
Data saved to data/metrics/surprisal/PL_model_bert_small_data.csv
Data saved to data/metrics/surprisal/UK_model_bert_small_data.csv
Data saved to data/metrics/surprisal/BE_model_bert_small_data.csv
Data saved to data/metrics/surprisal/BG_model_bert_small_data.csv


In [5]:
# Re-runs BG for all four models; every variable assigned here is also assigned by
# one of the per-model cells of this section.
# NOTE(review): the stored output of this cell ends in a KeyboardInterrupt after three
# "Data saved" lines, i.e. the model_gpt_large call did not finish in that run.
bg_df_bert_small = get_model_df(model_bert_small, 'BG', experiment_data, 'model_bert_small')
bg_df_gpt_small = get_model_df(model_gpt_small, 'BG', experiment_data, 'model_gpt_small')
bg_df_bert_large = get_model_df(model_bert_large, 'BG', experiment_data, 'model_bert_large')
bg_df_gpt_large = get_model_df(model_gpt_large, 'BG', experiment_data, 'model_gpt_large')

Data saved to data/metrics/surprisal/BG_model_bert_small_data.csv
Data saved to data/metrics/surprisal/BG_model_gpt_small_data.csv
Data saved to data/metrics/surprisal/BG_model_bert_large_data.csv



KeyboardInterrupt



In [21]:
# model_bert_large: one get_model_df call per L2, in the order CS, PL, UK, BE, BG
(
    cs_df_bert_large,
    pl_df_bert_large,
    uk_df_bert_large,
    be_df_bert_large,
    bg_df_bert_large,
) = (
    get_model_df(model_bert_large, code, experiment_data, 'model_bert_large')
    for code in ('CS', 'PL', 'UK', 'BE', 'BG')
)


Data saved to data/metrics/surprisal/CS_model_bert_large_data.csv
Data saved to data/metrics/surprisal/PL_model_bert_large_data.csv
Data saved to data/metrics/surprisal/UK_model_bert_large_data.csv
Data saved to data/metrics/surprisal/BE_model_bert_large_data.csv
Data saved to data/metrics/surprisal/BG_model_bert_large_data.csv


In [63]:
# model_gpt_large: one get_model_df call per L2, in the order CS, PL, UK, BE, BG
(
    cs_df_gpt_large,
    pl_df_gpt_large,
    uk_df_gpt_large,
    be_df_gpt_large,
    bg_df_gpt_large,
) = (
    get_model_df(model_gpt_large, code, experiment_data, 'model_gpt_large')
    for code in ('CS', 'PL', 'UK', 'BE', 'BG')
)


Data saved to data/metrics/surprisal/UK_model_gpt_large_data.csv
Data saved to data/metrics/surprisal/BE_model_gpt_large_data.csv
Data saved to data/metrics/surprisal/BG_model_gpt_large_data.csv
