In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import statsmodels.formula.api as smf
from mlxtend.feature_selection import SequentialFeatureSelector

In [42]:
experiment_filtered = pd.read_csv('data/filtered_experiment_results.csv')

In [2]:
import pandas as pd
import numpy as np

languages = ['BG', 'BE', 'CS', 'PL', 'UK']

# Initialize dictionaries to store the dataframes
was_data, pwld_data = {}, {}
agg_was, agg_pwld = [], []

for l2 in languages:
    l2_lower = l2.lower()
    
    # Load and process WAS data
    was_df = pd.read_csv(f'data/metrics/was/was_literal_{l2_lower}_ru.csv')
    was_fixed_df = pd.read_csv(f'data/metrics/was/was_fixed_{l2_lower}_ru.csv')
    was_df['normalized WAS fixed'] = was_fixed_df['normalized WAS']
    was_data[f'{l2_lower}_ru'] = was_df
    agg_was.append({
        'Language': l2_lower,
        'Literal_was': np.mean(was_df['normalized WAS']),
        'Fixed_was': np.mean(was_df['normalized WAS fixed'])
    })
    
    # Load and process PWLD data
    pwld_df = pd.read_csv(f'data/metrics/pwld/pwld_dict_literal_ru_{l2_lower}.csv')
    pwld_fixed_df = pd.read_csv(f'data/metrics/pwld/pwld_dict_fixed_ru_{l2_lower}.csv')
    pwld_df['Value fixed'] = pwld_fixed_df['Value']
    pwld_data[f'ru_{l2_lower}'] = pwld_df
    agg_pwld.append({
        'Language': l2_lower,
        'Literal_pwld': np.mean(pwld_df['Value']),
        'Fixed_pwld': np.mean(pwld_df['Value fixed'])
    })

# Convert aggregated data to DataFrames and merge
was_df = pd.DataFrame(agg_was)
pwld_df = pd.DataFrame(agg_pwld)
merged_df = pd.merge(was_df, pwld_df, on='Language')
print(merged_df)

  Language  Literal_was  Fixed_was  Literal_pwld  Fixed_pwld
0       bg     3.175256   3.221264      0.204141    0.252588
1       be     3.235893   3.249172      0.212757    0.219716
2       cs     3.323028   3.382418      0.174754    0.291087
3       pl     3.332122   3.388665      0.207772    0.297607
4       uk     3.257236   3.298384      0.197653    0.210293


In [14]:
import pandas as pd

results = {}
models = ['model_gpt_small', 'model_bert_small', 'model_gpt_large', 'model_bert_large']
columns = ['phrase ru', 'surprisal_sentence ru', 'literal translation', 'surprisal_phrase ru with literal']


for lang in languages:
    for model_name in models:
        file_path = f"data/metrics/surprisal/{lang}_{model_name}_data.csv"
        
        # Load the data into a DataFrame
        df = pd.read_csv(file_path, usecols=columns)
        
        # Filter out rows where any of the desired columns are missing
        df = df.dropna(subset=columns)
        
        # Save to the results dictionary
        results[(lang, model_name)] = df

In [17]:
import string
import ast
# Extending the punctuation list
extended_punctuation = string.punctuation + '…'

# Updated function to remove punctuation from a given word
def remove_punctuation(word):
    return ''.join(ch for ch in word if ch not in extended_punctuation)

# Updated function to compute average surprisal
def compute_average_surprisal(phrase, surprisal_str):
    try:
        # Convert string to dictionary
        surprisal_dict = ast.literal_eval(surprisal_str)
        # Clean keys in surprisal_dict (removing punctuation)
        surprisal_dict = {remove_punctuation(key): value for key, value in surprisal_dict.items()}
        # Get the tokens from the phrase
        tokens = remove_punctuation(phrase).split()
        # Compute average surprisal for the tokens in the phrase
        total_surprisal = sum(surprisal_dict.get(token, float('nan')) for token in tokens)
        return total_surprisal / len(tokens)
    except (ValueError, SyntaxError):
        return float('nan')

for key, df in results.items():
    # Calculate average surprisal for 'phrase ru'
    df['avg_surprisal_phrase ru'] = df.apply(lambda row: compute_average_surprisal(row['phrase ru'], row['surprisal_sentence ru']), axis=1)
    # Calculate average surprisal for 'literal translation'
    df['avg_surprisal_literal'] = df.apply(lambda row: compute_average_surprisal(row['literal translation'], row['surprisal_phrase ru with literal']), axis=1)


In [21]:
# Prepare a combined dataframe
dfs = []
for model_name in models:
    for lang in languages:
        temp_df = results[(lang, model_name)].copy()
        temp_df['language'] = lang
        temp_df['model'] = model_name
        dfs.append(temp_df)
combined_df = pd.concat(dfs)

# Melt the dataframe to transform it for plotting
df_melted = combined_df.melt(id_vars=["language", "model"], 
                             value_vars=["avg_surprisal_phrase ru", "avg_surprisal_literal"],
                             var_name="Metric",
                             value_name="Surprisal")

In [30]:
# Group by 'language' and 'model', then calculate the mean of 'Surprisal' for each group
average_surprisal = df_melted.groupby(['language', 'model', 'Metric'])['Surprisal'].mean().reset_index()

In [36]:
df = pd.DataFrame(average_surprisal)

# Add prefixes to the 'Metric' values
df['Metric'] = df['Metric'].apply(lambda x: '_literal' if 'literal' in x else '_phrase')

# Create a new column combining 'model' and 'Metric'
df['model_metric'] = df['model'] + df['Metric']

# Pivot the data: 'model_metric' values become new columns, filled with 'Surprisal' values
pivoted_df = df.pivot(index='language', columns='model_metric', values='Surprisal').reset_index()

# Display the resulting DataFrame
print(pivoted_df)

model_metric language  model_bert_large_literal  model_bert_large_phrase  \
0                  BE                  6.161847                 0.815297   
1                  BG                  7.459920                 1.027850   
2                  CS                  7.003779                 1.139527   
3                  PL                  6.403018                 1.037337   
4                  UK                  6.512173                 0.787037   

model_metric  model_bert_small_literal  model_bert_small_phrase  \
0                            16.660070                15.648769   
1                            15.376191                15.444899   
2                            16.695863                15.992625   
3                            17.255480                16.992005   
4                            16.416540                16.380210   

model_metric  model_gpt_large_literal  model_gpt_large_phrase  \
0                            7.130234                3.332662   
1         

In [39]:
pivoted_df['language'] = pivoted_df['language'].str.lower()
merged_df['Language'] = merged_df['Language'].str.lower()

In [103]:
import pandas as pd

df = pd.DataFrame(experiment_filtered)

# Calculating the percentage of correct responses per 'source_text_to_be_translated'
correct_percentage = df.groupby(['source_text_to_be_translated', 'source_language'])['is_correct'].mean() * 100

# Converting to DataFrame for better visualization
correct_percentage_df = correct_percentage.reset_index()

# Renaming columns for clarity
correct_percentage_df.columns = ['source_text_to_be_translated', 'language', 'correct_percentage']


In [104]:
correct_percentage_df

Unnamed: 0,source_text_to_be_translated,language,correct_percentage
0,a navíc,CS,40.000000
1,a tu,PL,27.272727
2,a zasię,PL,82.608696
3,albo i,PL,56.666667
4,ale przecież,PL,30.000000
...,...,...,...
288,ў сілу,BE,100.000000
289,ўбок,BE,53.846154
290,ўсе жыцце,BE,88.461538
291,ўсе роўна,BE,92.592593


In [69]:
# Creating new columns for checking if the translations are correct
experiment_filtered['is_correct_mcq'] = experiment_filtered['user_mcq_translation'] == experiment_filtered['correct_translation']
experiment_filtered['is_correct_free'] = experiment_filtered['user_free_translation'] == experiment_filtered['correct_translation']

# Counting the number of correct translations by language for each type of translation
correct_counts_mcq = experiment_filtered.groupby('source_language')['is_correct_mcq'].sum().reset_index()
correct_counts_free = experiment_filtered.groupby('source_language')['is_correct_free'].sum().reset_index()

# Calculating the percentage of correct translations
total_counts = experiment_filtered['source_language'].value_counts().reset_index()
total_counts.columns = ['source_language', 'total']

# Merging and calculating the percentage for MCQ translations
correct_counts_mcq = pd.merge(correct_counts_mcq, total_counts, on='source_language')
correct_counts_mcq['correct_percentage_mcq'] = (correct_counts_mcq['is_correct_mcq'] / correct_counts_mcq['total']) * 100

# Merging and calculating the percentage for free translations
correct_counts_free = pd.merge(correct_counts_free, total_counts, on='source_language')
correct_counts_free['correct_percentage_free'] = (correct_counts_free['is_correct_free'] / correct_counts_free['total']) * 100

# Merging the two results into a single DataFrame
final_correct_counts = pd.merge(correct_counts_mcq, correct_counts_free, on=['source_language', 'total'])
# Rename the column
final_correct_counts = final_correct_counts.rename(columns={'source_language': 'language'})

In [67]:
# Merge while converting 'Language' to lowercase on the fly
final_merged_df = pd.merge(pivoted_df, merged_df, left_on='language', right_on=merged_df['Language'].str.lower(), how='inner')

# Optionally, drop the redundant column
final_merged_df = final_merged_df.drop(columns=['Language'])

In [75]:
# Merge final_merged_df with final_correct_counts on the language column
final_df = pd.merge(final_merged_df, final_correct_counts, left_on='language', right_on=merged_df['Language'].str.lower(), how='left')
final_df = final_df.drop(columns=['language_x', 'is_correct_mcq', 'total', 'language_y', 'is_correct_free'])

In [88]:
final_df

Unnamed: 0,language,model_bert_large_literal,model_bert_large_phrase,model_bert_small_literal,model_bert_small_phrase,model_gpt_large_literal,model_gpt_large_phrase,model_gpt_small_literal,model_gpt_small_phrase,Literal_was,Fixed_was,Literal_pwld,Fixed_pwld,correct_percentage_mcq,correct_percentage_free
0,be,6.161847,0.815297,16.66007,15.648769,7.130234,3.332662,7.184592,3.557769,3.235893,3.249172,0.212757,0.219716,74.546722,17.921897
1,bg,7.45992,1.02785,15.376191,15.444899,7.723927,3.707474,7.631683,3.983022,3.175256,3.221264,0.204141,0.252588,82.905983,29.273504
2,cs,7.003779,1.139527,16.695863,15.992625,7.328689,3.695524,7.258239,3.758246,3.323028,3.382418,0.174754,0.291087,59.795222,4.641638
3,pl,6.403018,1.037337,17.25548,16.992005,7.494197,3.519089,7.552764,3.725802,3.332122,3.388665,0.207772,0.297607,57.201087,11.005435
4,uk,6.512173,0.787037,16.41654,16.38021,7.332158,3.411002,7.307725,3.572677,3.257236,3.298384,0.197653,0.210293,80.999296,30.26038


In [87]:
import pandas as pd
import statsmodels.api as sm

# Create a DataFrame
df = final_df

# List of potential predictor variables
predictors = [
    'model_bert_large_literal', 'model_bert_large_phrase',
    'model_bert_small_literal', 'model_bert_small_phrase',
    'model_gpt_large_literal', 'model_gpt_large_phrase',
    'model_gpt_small_literal', 'model_gpt_small_phrase',
    'Literal_was', 'Fixed_was', 'Literal_pwld', 'Fixed_pwld'
]

# Dependent variable
dependent_var = 'correct_percentage_mcq'  # or 'correct_percentage_free'

# Forward stepwise regression
def forward_stepwise_regression(predictors, dependent_var, df):
    remaining_predictors = set(predictors)
    selected_predictors = []
    current_score, best_new_score = float('inf'), float('inf')
    
    while remaining_predictors and current_score == best_new_score:
        scores_with_predictors = []
        for predictor in remaining_predictors:
            formula = "{} ~ {}".format(dependent_var, ' + '.join(selected_predictors + [predictor]))
            score = smf.ols(formula, df).fit().aic
            scores_with_predictors.append((score, predictor))
        scores_with_predictors.sort()
        best_new_score, best_predictor = scores_with_predictors.pop(0)
        if current_score > best_new_score:
            remaining_predictors.remove(best_predictor)
            selected_predictors.append(best_predictor)
            current_score = best_new_score
    formula = "{} ~ {}".format(dependent_var, ' + '.join(selected_predictors))
    model = smf.ols(formula, df).fit()
    return model

# Run the stepwise regression
model = forward_stepwise_regression(predictors, dependent_var, df)

# Display the model summary
print(model.summary())

                              OLS Regression Results                              
Dep. Variable:     correct_percentage_mcq   R-squared:                       1.000
Model:                                OLS   Adj. R-squared:                    nan
Method:                     Least Squares   F-statistic:                       nan
Date:                    Sat, 14 Oct 2023   Prob (F-statistic):                nan
Time:                            16:16:09   Log-Likelihood:                 145.41
No. Observations:                       5   AIC:                            -280.8
Df Residuals:                           0   BIC:                            -282.8
Df Model:                               4                                         
Covariance Type:                nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------

  warn("omni_normtest is not valid with less than 8 observations; %i "
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid


In [89]:
model.summary()

  warn("omni_normtest is not valid with less than 8 observations; %i "


0,1,2,3
Dep. Variable:,correct_percentage_mcq,R-squared:,1.0
Model:,OLS,Adj. R-squared:,
Method:,Least Squares,F-statistic:,
Date:,"Sat, 14 Oct 2023",Prob (F-statistic):,
Time:,16:17:59,Log-Likelihood:,145.41
No. Observations:,5,AIC:,-280.8
Df Residuals:,0,BIC:,-282.8
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,24.7784,inf,0,,,
Literal_was,-91.0205,inf,-0,,,
Fixed_pwld,-352.6533,inf,-0,,,
Fixed_was,82.5449,inf,0,,,
model_gpt_small_literal,3.9380,inf,0,,,
model_gpt_small_phrase,35.2157,inf,0,,,

0,1,2,3
Omnibus:,,Durbin-Watson:,1.277
Prob(Omnibus):,,Jarque-Bera (JB):,1.173
Skew:,1.185,Prob(JB):,0.556
Kurtosis:,2.894,Cond. No.,1220.0
