In [110]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import re
import string
import nltk
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [97]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

# Data Importing and Sorting

In [98]:
files = os.listdir('corpus')
files.remove('.DS_Store')
files.remove('.ipynb_checkpoints')
scaler = MinMaxScaler()
for file_ in files:
  name = file_.split('-')[0] + file_.split('-')[1]
  vars()[name] = pd.read_csv(os.path.join('corpus', file_, 'scores.csv'))
  vars()[name].drop(columns = ['source', 'annotators', 'z-score'], inplace = True)
  vars()[name]['avg-score'] = scaler.fit_transform(vars()[name]['avg-score'].values.reshape(-1,1)) #normalizing values betwewen 0 and 1

In [99]:
english = csen.copy()
for df in [deen, ruen, zhen]:
  english = english.append(df)

In [100]:
finnish = enfi.copy()
chinese = enzh.copy()

In [101]:
english.reset_index(drop = True, inplace = True)

# Cleaning the corpus

In [102]:
def clean(text_list,
          lower = False,
          remove_char = False,
          remove_stop = False,
          lemmatize = False,
          stemmer = False):
    """
    Function that a receives a list of strings and preprocesses it.
    
    :param text_list: List of strings.
    :param lemmatize: Tag to apply lemmatization if True.
    :param stemmer: Tag to apply the stemmer if True.
    """
    
    updates = []
    for j in tqdm(range(len(text_list))):
        
        text = text_list[j]
        
        #LOWERCASE TEXT
        if lower:
            text = text.lower()
        
        #REMOVE THAT IS NOT TEXT
        if remove_char:
            text = re.sub("[^a-zA-Z]", ' ', text)
        
        #REMOVE STOP WORDS
        if remove_stop:
            text = ' '.join([word for word in text.split(' ') if word not in stop])
        
        #LEMMATIZATION
        if lemmatize:
            text = " ".join(lemma.lemmatize(word) for word in text.split())
        
        #STEMMER
        if stemmer:
            text = " ".join(snowball_stemmer.stem(word) for word in text.split())
        
        updates.append(text)
        
    return updates

def update_df(dataframe, list_updated):
    dataframe.update(pd.DataFrame({"Text": list_updated}))

In [103]:
english_cleaned = pd.DataFrame()
english_cleaned['avg-score'] = english['avg-score']
for column in ['reference', 'translation']:
    english_cleaned[column] = clean(english[column], lower = True, 
                                                    remove_char = True,
                                                    remove_stop = True,
                                                    lemmatize = True,
                                                    stemmer = False)

  0%|          | 0/77688 [00:00<?, ?it/s]

  0%|          | 0/77688 [00:00<?, ?it/s]

In [104]:
english_cleaned.head()

Unnamed: 0,avg-score,reference,translation
0,0.6,grab weapon forearm shoulder hit face free elbow,grasp gun forearm shoulder hitting face free e...
1,0.44,new york changed also rediscovery,new york change also reinvention
2,0.965,thinking summer improve give depth need get hi...,thought long hard course summer might improve ...
3,0.905,find another way cheat somewhere,find another way defraud others
4,0.746667,report replacement president administration he...,news replacement top president office come sur...


# Train, Dev & Test Split

In [105]:
en_train, en_dev = train_test_split(english_cleaned, shuffle = True, test_size = 0.2, random_state = 7)
en_dev, en_test = train_test_split(en_dev, shuffle = True, test_size = 0.5, random_state = 7)

##NEED TO IMPLEMENT
# fin_train, fin_dev = train_test_split(finnish, shuffle = True, test_size = 0.2, random_state = 7)
# fin_dev, fin_test = train_test_split(fin_dev, shuffle = True, test_size = 0.5, random_state = 7)

# ch_train, ch_dev = train_test_split(chinese, shuffle = True, test_size = 0.2, random_state = 7)
# ch_dev, ch_test = train_test_split(ch_dev, shuffle = True, test_size = 0.5, random_state = 7)

# Encoding

In [106]:
baseline_encoder = CountVectorizer()
names = ['en_train', 'en_dev', 'en_test']
for i,df in enumerate([en_train, en_dev, en_test]):
    for column in ['reference', 'translation']:
        encoded_df = names[i] + '_encoded_' + column
        if i == 0:
            vars()[encoded_df] = baseline_encoder.fit_transform(df[column]).todense()
        else:
            vars()[encoded_df] = baseline_encoder.transform(df[column]).todense()
            
    y_name = 'y_' + names[i].split('_')[1]
    vars()[y_name] = np.array(df['avg-score'])

# Distance

In [112]:
cos_dev = []
for i in range(en_dev_encoded_reference.shape[0]):
    cos_dev.append(cosine_similarity(en_dev_encoded_reference[i], en_dev_encoded_translation[i])[0])
cos_dev = np.array(cos_dev)
cos_dev.shape = (cos_dev.shape[0],)

cos_test = []
for i in range(en_test_encoded_reference.shape[0]):
    cos_test.append(cosine_similarity(en_test_encoded_reference[i], en_test_encoded_translation[i])[0])
cos_test = np.array(cos_test)
cos_test.shape = (cos_test.shape[0],)

In [113]:
baseline_corr_dev, baseline_corr_dev_pvalue = pearsonr(y_dev, cos_dev)
baseline_corr_test, baseline_corr_test_pvalue = pearsonr(y_test,cos_test)

baseline_corr_ktau_dev, baseline_corr_ktau_dev_pvalue = kendalltau(y_dev, cos_dev)
baseline_corr_ktau_test, baseline_corr_ktau_test_pvalue = kendalltau(y_test,cos_test)

print(f'Pearson correlation between cosine similarity and score on development set: {baseline_corr_dev} (p-value < 0.001: {baseline_corr_dev_pvalue < 0.001}); and Kendall Tau: {baseline_corr_ktau_dev} (p-value < 0.001: {baseline_corr_ktau_dev_pvalue < 0.001})')
print(f'Pearson correlation between cosine similarity and score on test set: {baseline_corr_test} (p-value < 0.001: {baseline_corr_test_pvalue < 0.001}); and Kendall Tau: {baseline_corr_ktau_test} (p-value < 0.001: {baseline_corr_ktau_test_pvalue < 0.001})')

NameError: name 'pearsonr' is not defined