# Packages Import

In [20]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr, kendalltau

# Data Importing and Sorting

In [21]:
files = os.listdir('corpus')
files.remove('.DS_Store')
files.remove('.ipynb_checkpoints')
scaler = MinMaxScaler()
for file_ in files:
  name = file_.split('-')[0] + file_.split('-')[1]
  vars()[name] = pd.read_csv(os.path.join('corpus', file_, 'scores.csv'))
  vars()[name].drop(columns = ['source', 'annotators', 'z-score'], inplace = True)
  vars()[name]['avg-score'] = scaler.fit_transform(vars()[name]['avg-score'].values.reshape(-1,1)) #normalizing values betwewen 0 and 1

In [22]:
english = csen.copy()
for df in [deen, ruen, zhen]:
  english = english.append(df)

In [23]:
finnish = enfi.copy()
chinese = enzh.copy()

# Train, Dev & Test Split

In [24]:
en_train, en_dev = train_test_split(english, shuffle = True, test_size = 0.2, random_state = 7)
en_dev, en_test = train_test_split(en_dev, shuffle = True, test_size = 0.5, random_state = 7)

fin_train, fin_dev = train_test_split(finnish, shuffle = True, test_size = 0.2, random_state = 7)
fin_dev, fin_test = train_test_split(fin_dev, shuffle = True, test_size = 0.5, random_state = 7)

ch_train, ch_dev = train_test_split(chinese, shuffle = True, test_size = 0.2, random_state = 7)
ch_dev, ch_test = train_test_split(ch_dev, shuffle = True, test_size = 0.5, random_state = 7)

# BaseLine models

## English corpus 

In [25]:
baseline_encoder = CountVectorizer(max_features = 5000)#settting limit for computational reasons
names = ['en_train', 'en_dev', 'en_test']
for i,df in enumerate([en_train, en_dev, en_test]):
    for column in ['reference', 'translation']:
        encoded_df = names[i] + '_bl_encoded_' + column
        if i == 0:
            vars()[encoded_df] = baseline_encoder.fit_transform(df[column]).todense()
        else:
            vars()[encoded_df] = baseline_encoder.transform(df[column]).todense()
            
    y_name = 'y_' + names[i].split('_')[1]
    vars()[y_name] = np.array(df['avg-score'])

In [27]:
baseline_encoder_reference = CountVectorizer(max_features = 5000)#settting limit for computational reasons
baseline_encoder_translation = CountVectorizer(max_features = 5000)
names = ['en_train', 'en_dev', 'en_test']
for i,df in enumerate([en_train, en_dev, en_test]):

    for column in ['reference', 'translation']:

        encoded = names[i] + '_bl_encoded_' + column

        if i == 0 and column == 'reference':
            vars()[encoded] = baseline_encoder_reference.fit_transform(df[column]).todense()
        elif i == 0 and column == 'translation':
            vars()[encoded] = baseline_encoder_translation.fit_transform(df[column]).todense()

        if i != 0 and column == 'reference':
            vars()[encoded] = baseline_encoder_reference.transform(df[column]).todense()
        elif i != 0 and column == 'translation':
            vars()[encoded] = baseline_encoder_translation.transform(df[column]).todense()

y_name = 'y_' + names[i].split('_')[1]
vars()[y_name] = np.array(df['avg-score'])

In [28]:
en_train_bl_encoded_reference.shape

(62150, 5000)

## Regression

In [14]:
en_train_bl_encoded = np.hstack((en_train_bl_encoded_reference, en_train_bl_encoded_translation))
en_dev_bl_encoded = np.hstack((en_dev_bl_encoded_reference, en_dev_bl_encoded_translation))
en_test_bl_encoded = np.hstack((en_test_bl_encoded_reference, en_test_bl_encoded_translation))

baseline_regressor = LinearRegression()
baseline_regressor.fit(en_train_bl_encoded, y_train)

The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse)

In [15]:
baseline_r2_dev = baseline_regressor.score(en_dev_bl_encoded, y_dev)
baseline_r2_test = baseline_regressor.score(en_test_bl_encoded, y_test)

print(f'Baseline R^2 score on development set : {baseline_r2_dev}')
print(f'Baseline R^2 score on test set : {baseline_r2_test}')

In [16]:
pred_y_dev = baseline_regressor.predict(en_dev_bl_encoded)
pred_y_test = baseline_regressor.predict(en_test_bl_encoded)

baseline_mae_dev = mean_absolute_error(y_dev, pred_y_dev)
baseline_mse_dev = mean_squared_error(y_dev, pred_y_dev)

baseline_mae_test = mean_absolute_error(y_test, pred_y_test)
baseline_mse_test = mean_squared_error(y_test, pred_y_test)

print(f'Baseline mae score on development set : {baseline_mae_dev} and mse: {baseline_mse_dev}')
print(f'Baseline mae score on test set : {baseline_mae_test} and mse: {baseline_mse_test}')

## Distance

In [29]:
en_dev_bl_encoded_reference

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [31]:
cos_train = []
for i in range(en_dev_bl_encoded_reference.shape[0]):
    cos_train.append(cosine_similarity(en_train_bl_encoded_reference[i], en_train_bl_encoded_translation[i])[0])
cos_train = np.array(cos_train)
cos_train.shape = (cos_train.shape[0],)

cos_dev = []
for i in range(en_dev_bl_encoded_reference.shape[0]):
    cos_dev.append(cosine_similarity(en_dev_bl_encoded_reference[i], en_dev_bl_encoded_translation[i])[0])
cos_dev = np.array(cos_dev)
cos_dev.shape = (cos_dev.shape[0],)

cos_test = []
for i in range(en_test_bl_encoded_reference.shape[0]):
    cos_test.append(cosine_similarity(en_test_bl_encoded_reference[i], en_test_bl_encoded_translation[i])[0])
cos_test = np.array(cos_test)
cos_test.shape = (cos_test.shape[0],)

In [36]:
y_train.shape

(62150,)

In [34]:
cos_train.s

array([0.        , 0.        , 0.14433757, ..., 0.        , 0.        ,
       0.        ])

In [32]:
baseline_corr_train, baseline_corr_train_pvalue = pearsonr(y_train, cos_train)
baseline_corr_dev, baseline_corr_dev_pvalue = pearsonr(y_dev, cos_dev)
baseline_corr_test, baseline_corr_test_pvalue = pearsonr(y_test,cos_test)

baseline_corr_ktau_train, baseline_corr_ktau_train_pvalue = kendalltau(y_train, cos_train)
baseline_corr_ktau_dev, baseline_corr_ktau_dev_pvalue = kendalltau(y_dev, cos_dev)
baseline_corr_ktau_test, baseline_corr_ktau_test_pvalue = kendalltau(y_test,cos_test)

print(f'Pearson correlation between cosine similarity and score on train set: {baseline_corr_train} (p-value < 0.001: {baseline_corr_train_pvalue < 0.001}); and Kendall Tau: {baseline_corr_ktau_train} (p-value < 0.001: {baseline_corr_ktau_train_pvalue < 0.001})')
print(f'Pearson correlation between cosine similarity and score on development set: {baseline_corr_dev} (p-value < 0.001: {baseline_corr_dev_pvalue < 0.001}); and Kendall Tau: {baseline_corr_ktau_dev} (p-value < 0.001: {baseline_corr_ktau_dev_pvalue < 0.001})')
print(f'Pearson correlation between cosine similarity and score on development set: {baseline_corr_test} (p-value < 0.001: {baseline_corr_test_pvalue < 0.001}); and Kendall Tau: {baseline_corr_ktau_test} (p-value < 0.001: {baseline_corr_ktau_test_pvalue < 0.001})')

ValueError: x and y must have the same length.