# Imports

In [4]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr, kendalltau
from sklearn.preprocessing import MinMaxScaler

# Data Importing and Sorting

In [5]:
# Load one score table per entry of `corpus` and publish each as a notebook global.
# Entries are presumably language-pair folders such as `cs-en` (-> `csen`) — TODO confirm.
skipped_entries = {'.DS_Store', 'scores_ru-en.csv'}
# Filtering instead of list.remove(): remove() raises ValueError when the entry is
# absent (e.g. no `.DS_Store` in the folder), which made this cell machine-dependent.
files = sorted(entry for entry in os.listdir('corpus') if entry not in skipped_entries)
scaler = MinMaxScaler()
for file_ in files:
    # The variable name is the first two hyphen-separated parts glued together.
    first_part, second_part = file_.split('-')[:2]
    name = first_part + second_part
    scores = pd.read_csv(os.path.join('corpus', file_, 'scores.csv'))
    scores = scores.drop(columns=['source', 'annotators', 'z-score'])
    # Min-max scale the averaged scores to [0, 1]; the scaler is refit for every table.
    scores['avg-score'] = scaler.fit_transform(scores['avg-score'].values.reshape(-1, 1)).ravel()
    # Later cells refer to the frames by these generated names (csen, deen, enfi, ...).
    globals()[name] = scores

In [6]:
# Stack all into-English language pairs into one frame.
# pd.concat replaces the DataFrame.append loop: append was removed in pandas 2.0 and
# re-copied the growing frame on every iteration. Row indices are kept as they were
# (no reset), and the source frames are left untouched.
english = pd.concat([csen, deen, ruen, zhen])

In [7]:
# Out-of-English pairs, copied so the loaded frames stay independent of these two.
finnish, chinese = enfi.copy(), enzh.copy()

# Train, Dev & Test Split

In [8]:
def _train_dev_test_split(frame, holdout=0.2, seed=7):
    """Shuffle `frame` and cut it into train, dev and test parts.

    A `holdout` share of the rows is set aside first and then halved into dev and
    test, so the defaults produce an 80/10/10 split. The same `seed` drives both
    steps, which keeps the split reproducible.
    """
    train, held_out = train_test_split(frame, shuffle=True, test_size=holdout, random_state=seed)
    dev, test = train_test_split(held_out, shuffle=True, test_size=0.5, random_state=seed)
    return train, dev, test


en_train, en_dev, en_test = _train_dev_test_split(english)
fin_train, fin_dev, fin_test = _train_dev_test_split(finnish)
ch_train, ch_dev, ch_test = _train_dev_test_split(chinese)

# Cleaning Function

# Baseline encoding

In [None]:
# Bag-of-words baseline; the vocabulary size is capped for computational reasons.
BASELINE_MAX_FEATURES = 5000
baseline_encoder = CountVectorizer(max_features=BASELINE_MAX_FEATURES)

# Correlation Function

In [None]:
def correlation(y_train, pred_train, y_dev, pred_dev, y_test, pred_test):
    """Print and return Pearson and Kendall Tau correlations for three splits.

    Every ``y_*`` / ``pred_*`` pair is handed to ``scipy.stats.pearsonr`` and
    ``scipy.stats.kendalltau``. The printed report labels each pair as
    "cosine similarity and score" and shows, next to every coefficient,
    whether its p-value is below 0.001.

    Returns:
        ``(pearson_train, pearson_dev, pearson_test,
        kendall_train, kendall_dev, kendall_test)``.
    """
    splits = ((y_train, pred_train), (y_dev, pred_dev), (y_test, pred_test))

    # Each call yields (coefficient, p-value). All Pearson values are computed
    # before any Kendall Tau value, and nothing is printed until both are done.
    (r_train, p_train), (r_dev, p_dev), (r_test, p_test) = [
        pearsonr(gold, guess) for gold, guess in splits
    ]
    (tau_train, q_train), (tau_dev, q_dev), (tau_test, q_test) = [
        kendalltau(gold, guess) for gold, guess in splits
    ]

    print(f'Pearson correlation between cosine similarity and score on training set: {r_train} (p-value < 0.001: {p_train < 0.001}); and Kendall Tau: {tau_train} (p-value < 0.001: {q_train < 0.001})')
    print(f'Pearson correlation between cosine similarity and score on development set: {r_dev} (p-value < 0.001: {p_dev < 0.001}); and Kendall Tau: {tau_dev} (p-value < 0.001: {q_dev < 0.001})')
    print(f'Pearson correlation between cosine similarity and score on test set: {r_test} (p-value < 0.001: {p_test < 0.001}); and Kendall Tau: {tau_test} (p-value < 0.001: {q_test < 0.001})')

    return r_train, r_dev, r_test, tau_train, tau_dev, tau_test

# Grid search over cleaning methods

In [15]:
# Spot-check the scores of the English training split (min-max scaled per table at load time).
en_train['avg-score']

5954     0.656667
1045     0.855000
14106    1.000000
8575     0.770000
6739     0.790000
           ...   
2190     0.475000
10742    1.000000
16400    0.270000
7295     0.620000
10346    0.405000
Name: avg-score, Length: 62150, dtype: float64

In [None]:
names = ['en_train', 'en_dev', 'en_test']
splits = dict(zip(names, [en_train, en_dev, en_test]))
results = {}


def _rowwise_cosine(left, right):
    """Cosine similarity between matching rows of two sparse count matrices.

    Works on the sparse output of the vectorizer directly, so no dense
    (rows x vocabulary) matrix is ever materialised.
    """
    dot = np.asarray(left.multiply(right).sum(axis=1)).ravel()
    norms = np.sqrt(
        np.asarray(left.multiply(left).sum(axis=1)).ravel()
        * np.asarray(right.multiply(right).sum(axis=1)).ravel()
    )
    # A sentence with no in-vocabulary token has a zero vector; report 0 instead of NaN.
    return np.divide(dot, norms, out=np.zeros(len(dot), dtype=float), where=norms > 0)


# NOTE(review): `clean` and `cleaning_configurations` are not defined in the visible
# part of the notebook. This cell assumes `clean(texts, configuration)` returns the
# cleaned texts in the same order and that each configuration is hashable — TODO confirm.
for cleaning in cleaning_configurations:
    # Clean the split being processed (previously `english[column]` was cleaned and
    # the result was overwritten before use).
    cleaned = {
        name: {column: clean(df[column], cleaning) for column in ['reference', 'translation']}
        for name, df in splits.items()
    }

    # One vocabulary, learnt from the training text only, shared by both columns:
    # fitting the encoder per column gave reference and translation vectors that
    # lived in different spaces and could not be compared.
    baseline_encoder.fit(
        list(cleaned['en_train']['reference']) + list(cleaned['en_train']['translation'])
    )

    similarity = {
        name: _rowwise_cosine(
            baseline_encoder.transform(columns['reference']),
            baseline_encoder.transform(columns['translation']),
        )
        for name, columns in cleaned.items()
    }

    # `correlation()` was previously called without arguments, which raises TypeError.
    results[cleaning] = correlation(
        en_train['avg-score'], similarity['en_train'],
        en_dev['avg-score'], similarity['en_dev'],
        en_test['avg-score'], similarity['en_test'],
    )

# Graphical Visualization of results