In [1]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import re
import string
import nltk
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr, kendalltau
import gensim.downloader as api

In [2]:
# Shared preprocessing resources; `stop` and `lemma` are read by `clean`.
lemma = WordNetLemmatizer()                 # WordNet-based lemmatizer instance
exclude = set(string.punctuation)           # ASCII punctuation characters
stop = set(stopwords.words('english'))      # English stop words as a set

# Data Importing and Sorting

In [3]:
# Load one `scores.csv` per entry of `corpus` into a module-level DataFrame.
# The variable name is the first two '-'-separated parts of the entry name
# glued together (presumably "cs-en" -> `csen`; confirm against the corpus
# layout).
#
# Entries that are not language-pair folders are filtered out rather than
# removed with `list.remove`, which raises ValueError when the entry is absent
# (e.g. no `.DS_Store` outside macOS).
NON_CORPUS_ENTRIES = ('.DS_Store', 'scores_ru-en.csv')
files = [entry for entry in os.listdir('corpus') if entry not in NON_CORPUS_ENTRIES]

scaler = MinMaxScaler()
for file_ in files:
  parts = file_.split('-')
  name = parts[0] + parts[1]
  scores = pd.read_csv(os.path.join('corpus', file_, 'scores.csv'))
  scores = scores.drop(columns = ['source', 'annotators', 'z-score'])
  # Normalizing values between 0 and 1; the scaler is re-fitted for every
  # language pair, so each frame is scaled independently of the others.
  scores['avg-score'] = scaler.fit_transform(scores['avg-score'].values.reshape(-1,1)).ravel()
  globals()[name] = scores

In [4]:
# Stack the four *-en language pairs into one frame. `DataFrame.append` was
# removed in pandas 2.0 and re-copied the growing frame on every iteration;
# a single `pd.concat` yields the same rows with the same (non-reset) index.
english = pd.concat([csen, deen, ruen, zhen])

In [5]:
# Independent copies of the en-fi and en-zh frames, so later edits do not
# touch the frames they were copied from.
finnish, chinese = enfi.copy(), enzh.copy()

In [6]:
# Replace the index of the combined frame with a fresh 0..n-1 range,
# discarding the old index values.
english = english.reset_index(drop = True)

# Cleaning the corpus

In [7]:
def clean(text_list,
          lower = False,
          remove_char = False,
          remove_stop = False,
          lemmatize = False,
          stemmer = False,
          split_list = False):
    """
    Preprocess a sequence of strings and return the results as a new list.

    The enabled steps always run in this order: lowercase, replace
    non-letters, remove stop words, lemmatize, stem, split. Stop words are
    matched exactly against the module-level ``stop`` set, so capitalised
    words are only removed when ``lower`` is also enabled.

    :param text_list: Sequence of strings (list, pandas Series, ...). Items are
        read in iteration order, so a Series with a non-default index works.
    :param lower: Lowercase the text if True.
    :param remove_char: Replace every character outside a-z / A-Z with a space if True.
    :param remove_stop: Drop the words found in the module-level ``stop`` set if True.
    :param lemmatize: Tag to apply lemmatization (module-level ``lemma``) if True.
    :param stemmer: Tag to apply the English Snowball stemmer if True.
    :param split_list: Return each text as a list of words instead of a string if True.
    :return: List with one processed entry per input string.
    """

    # Build the stemmer on demand: the notebook imports SnowballStemmer but
    # never creates an instance, so the previous global lookup raised NameError.
    snowball = SnowballStemmer('english') if stemmer else None

    updates = []
    # Iterate over the values themselves; indexing with text_list[j] is a
    # label lookup on a Series and fails when the index is not 0..n-1.
    for text in tqdm(text_list):

        #LOWERCASE TEXT
        if lower:
            text = text.lower()

        #REPLACE EVERYTHING THAT IS NOT A LETTER WITH A SPACE
        if remove_char:
            text = re.sub("[^a-zA-Z]", ' ', text)

        #REMOVE STOP WORDS
        if remove_stop:
            text = ' '.join([word for word in text.split(' ') if word not in stop])

        #LEMMATIZATION
        if lemmatize:
            text = " ".join(lemma.lemmatize(word) for word in text.split())

        #STEMMER
        if stemmer:
            text = " ".join(snowball.stem(word) for word in text.split())

        #SPLITTING THE TEXT INTO A LIST OF WORDS
        if split_list:
            text = text.split()

        updates.append(text)

    return updates

def update_df(dataframe, list_updated):
    """
    Overwrite the "Text" column of ``dataframe`` in place.

    :param dataframe: DataFrame to modify; values are aligned on the index.
    :param list_updated: New values, wrapped in a one-column "Text" frame
        with a default 0..n-1 index before being applied.
    """
    replacement = pd.DataFrame({"Text": list_updated})
    dataframe.update(replacement)

In [8]:
# Preprocessing applied to both text columns: lowercase, letters only,
# no stop words, lemmatized, returned as token lists (no stemming).
preprocessing = dict(lower = True,
                     remove_char = True,
                     remove_stop = True,
                     lemmatize = True,
                     stemmer = False,
                     split_list = True)

# Keep the scores as they are and replace each sentence by its token list.
english_cleaned = pd.DataFrame()
english_cleaned['avg-score'] = english['avg-score']
for column in ['reference', 'translation']:
    english_cleaned[column] = clean(english[column], **preprocessing)

  0%|          | 0/77688 [00:00<?, ?it/s]

  0%|          | 0/77688 [00:00<?, ?it/s]

In [9]:
# Preview the first rows of the cleaned frame (score plus tokenized columns).
english_cleaned.head()

Unnamed: 0,avg-score,reference,translation
0,0.6,"[grab, weapon, forearm, shoulder, hit, face, f...","[grasp, gun, forearm, shoulder, hitting, face,..."
1,0.44,"[new, york, changed, also, rediscovery]","[new, york, change, also, reinvention]"
2,0.965,"[thinking, summer, improve, give, depth, need,...","[thought, long, hard, course, summer, might, i..."
3,0.905,"[find, another, way, cheat, somewhere]","[find, another, way, defraud, others]"
4,0.746667,"[report, replacement, president, administratio...","[news, replacement, top, president, office, co..."


# Train, Dev & Test Split

In [10]:
# First cut: 80% training data, 20% held out (temporarily named en_dev).
en_train, en_dev = train_test_split(
    english_cleaned,
    test_size = 0.2,
    shuffle = True,
    random_state = 7,
)
# Second cut: halve the held-out part into development and test sets.
en_dev, en_test = train_test_split(
    en_dev,
    test_size = 0.5,
    shuffle = True,
    random_state = 7,
)

# TODO: apply the same train/dev/test split to the Finnish and Chinese corpora.
# fin_train, fin_dev = train_test_split(finnish, shuffle = True, test_size = 0.2, random_state = 7)
# fin_dev, fin_test = train_test_split(fin_dev, shuffle = True, test_size = 0.5, random_state = 7)

# ch_train, ch_dev = train_test_split(chinese, shuffle = True, test_size = 0.2, random_state = 7)
# ch_dev, ch_test = train_test_split(ch_dev, shuffle = True, test_size = 0.5, random_state = 7)

# Encoding (Word2Vec + Word Mover's Distance)

In [21]:
# Load the pretrained word2vec vectors only when the kernel does not already
# hold them: the load was commented out, so a fresh "Restart & Run All" failed
# with NameError on `model`. The guard avoids reloading on re-runs of this cell.
if 'model' not in globals():
    model = api.load('word2vec-google-news-300')

# For every split create two module-level variables:
#   distances_<split>: list with model.wmdistance(reference, translation) per row
#   score_<split>:     numpy array with the matching 'avg-score' values
names = ['en_train', 'en_dev', 'en_test']
for j,df in enumerate([en_train, en_dev, en_test]):
    # Row positions must line up between the distance list and the scores.
    df.reset_index(drop = True, inplace = True)

    name = 'distances_' + names[j]
    pairs = zip(df['reference'], df['translation'])
    globals()[name] = [model.wmdistance(reference, translation)
                       for reference, translation in tqdm(pairs, total = len(df))]

    name2 = 'score_' + names[j]
    globals()[name2] = np.array(df['avg-score'])

  0%|          | 0/62150 [00:00<?, ?it/s]

  0%|          | 0/7769 [00:00<?, ?it/s]

  0%|          | 0/7769 [00:00<?, ?it/s]

# Calculating correlation

In [67]:
def corr(y_dev_true, y_dev_pred, y_test_true = None, y_test_pred = None, return_corr = False):
    """
    Print Pearson and Kendall Tau correlations between scores and predictions.

    The development-set line is always printed. The test-set line is printed
    only when both test sequences are supplied.

    :param y_dev_true: Reference scores of the development set.
    :param y_dev_pred: Predicted values for the development set.
    :param y_test_true: Optional reference scores of the test set.
    :param y_test_pred: Optional predicted values for the test set.
    :param return_corr: If True, return the development-set Pearson coefficient.
    :return: Development-set Pearson coefficient when ``return_corr`` is True, else None.
    """
    # Both development statistics are computed unconditionally; previously the
    # dev Kendall Tau was only computed inside the test branch.
    cleaned_corr_dev, cleaned_corr_dev_pvalue = pearsonr(y_dev_true, y_dev_pred)
    cleaned_corr_ktau_dev, cleaned_corr_ktau_dev_pvalue = kendalltau(y_dev_true, y_dev_pred)

    print(f'Pearson correlation between cosine similarity and score on development set: {cleaned_corr_dev} (p-value < 0.001: {cleaned_corr_dev_pvalue < 0.001}); and Kendall Tau: {cleaned_corr_ktau_dev} (p-value < 0.001: {cleaned_corr_ktau_dev_pvalue < 0.001})')

    # Identity check on purpose: `array != None` compares element by element,
    # which makes the condition ambiguous for NumPy inputs.
    if y_test_true is not None and y_test_pred is not None:
        cleaned_corr_test, cleaned_corr_test_pvalue = pearsonr(y_test_true, y_test_pred)
        cleaned_corr_ktau_test, cleaned_corr_ktau_test_pvalue = kendalltau(y_test_true, y_test_pred)

        print(f'Pearson correlation between cosine similarity and score on test set: {cleaned_corr_test} (p-value < 0.001: {cleaned_corr_test_pvalue < 0.001}); and Kendall Tau: {cleaned_corr_ktau_test} (p-value < 0.001: {cleaned_corr_ktau_test_pvalue < 0.001})')

    if return_corr:
        return cleaned_corr_dev

In [63]:
# Swap every infinite distance for the fixed value 10 so the correlations
# below are computed on finite numbers. The lists are rewritten in place.
# NOTE(review): the infinities presumably come from pairs with no
# in-vocabulary tokens, and 10 looks like an arbitrary cap; confirm both.
distances_en_dev[:] = [10 if distance == np.inf else distance
                       for distance in distances_en_dev]

distances_en_test[:] = [10 if distance == np.inf else distance
                        for distance in distances_en_test]

In [68]:
# Correlate the Word Mover's Distances with the min-max scaled 'avg-score'
# values on the development and test splits.
corr(score_en_dev, distances_en_dev, score_en_test, distances_en_test)

Pearson correlation between cosine similarity and score on development set: -0.19745565618100078 (p-value < 0.001: True); and Kendall Tau: -0.21976558377750754 (p-value < 0.001: True)
Pearson correlation between cosine similarity and score on test set: -0.217635667687527 (p-value < 0.001: True); and Kendall Tau: -0.20764484095340793 (p-value < 0.001: True)
