In [2]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import re
import string
import nltk
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr, kendalltau
# import libvoikko

In [3]:
exclude = set(string.punctuation)

In [4]:
# !pip install libvoikko
# !pip install voikko

# Data Importing and Sorting

In [5]:
files = os.listdir('corpus')
files.remove('.DS_Store')
files.remove('scores_ru-en.csv')
scaler = MinMaxScaler()
for file_ in files:
  name = file_.split('-')[0] + file_.split('-')[1]
  vars()[name] = pd.read_csv(os.path.join('corpus', file_, 'scores.csv'))
  vars()[name].drop(columns = ['source', 'annotators', 'z-score'], inplace = True)
  vars()[name]['avg-score'] = scaler.fit_transform(vars()[name]['avg-score'].values.reshape(-1,1)) #normalizing values betwewen 0 and 1

In [6]:
english = csen.copy()
for df in [deen, ruen, zhen]:
  english = english.append(df)

In [7]:
finnish = enfi.copy()
chinese = enzh.copy()

In [8]:
english.reset_index(drop = True, inplace = True)

# Cleaning the corpus

In [9]:
def clean(text_list,
          lower = False,
          keep_numbers = False,
          keep_expression = False,
          remove_char = False,
          remove_stop = False,
          remove_tag = False,
          lemmatize = False,
          stemmer = False,
          english = True
          ):
    """
    Function that a receives a list of strings and preprocesses it.
    
    :param text_list: List of strings.
    :param lemmatize: Tag to apply lemmatization if True.
    :param stemmer: Tag to apply the stemmer if True.
    """
    if english:
        lang = 'english'
    else:
        lang = 'finnish'
    
    stop = set(stopwords.words(lang))
    stem = SnowballStemmer(lang)
    
    updates = []
    for j in tqdm(range(len(text_list))):
        
        text = text_list[j]
        
        #LOWERCASE TEXT
        if lower:
            text = text.lower()
            
        #KEEP NUMBERS AS TOKENS
        if keep_numbers:
            text = re.sub("[\d+]", 'NUMBER', text)
        
        #KEEP '?' and '!' AS TOKENS
        if keep_expression:
            text = re.sub("[\?|\!]", 'EXPRESSION', text)
            
        #REMOVE THAT IS NOT TEXT
        if remove_char:
            text = re.sub("[^a-zA-Z]", ' ', text)
            
        #REMOVE TAGS
        if remove_tag:
            text = BeautifulSoup(text).get_text()
        
        #REMOVE STOP WORDS
        if remove_stop:
            text = ' '.join([word for word in text.split(' ') if word not in stop])
        
        #LEMMATIZATION
        if lemmatize:
            if english:
                lemma = WordNetLemmatizer()
                text = " ".join(lemma.lemmatize(word) for word in text.split())
#             else:
#                 lemma = libvoikko.Voikko(u"fi")
#                 text = " ".join(lemma.analyze(word)[0]['BASEFORM'] for word in text.split())
        
        #STEMMER
        if stemmer:
            text = " ".join(stem.stem(word) for word in text.split())
        
        updates.append(text)
        
    return updates

def clean_zh_stopwords(text_list, stopwords_set='merged'):
    """
    Function that removes chinese stopwords
    
    :param stopwords_set: remove words of both sets (merged), just the 1st (fst) or just the second (snd) 
    """
    updates = []
    
    zh_stopwords1 = [line.strip() for line in open('chinese_stopwords/chinese_stopwords1.txt', 'r', encoding='utf-8').readlines()]
    zh_stopwords2 = [line.strip() for line in open('chinese_stopwords/chinese_stopwords2.txt', 'r', encoding='utf-8').readlines()]
    
    if stopwords_set == 'merged':
        stop = list(set(zh_stopwords1 + zh_stopwords2))
    elif stopwords_set == 'fst':
        stop = zh_stopwords1
    elif stopwords_set == 'snd':
        stop = zh_stopwords2
        

    for j in tqdm(range(len(text_list))):
        text = text_list[j]
        text = ' '.join([word for word in text.split(' ') if word not in stop])
        updates.append(text)
        
    return updates
        
    
def update_df(dataframe, list_updated):
    dataframe.update(pd.DataFrame({"Text": list_updated}))

In [10]:
english_cleaned = pd.DataFrame()
english_cleaned['avg-score'] = english['avg-score']
for column in ['reference', 'translation']:
    english_cleaned[column] = clean(english[column], lower = True, 
                                                    remove_char = True,
                                                    remove_stop = True,
                                                    lemmatize = True,
                                                    stemmer = False)

  0%|          | 0/77688 [00:00<?, ?it/s]

  0%|          | 0/77688 [00:00<?, ?it/s]

In [11]:
english_cleaned.head()

Unnamed: 0,avg-score,reference,translation
0,0.6,grab weapon forearm shoulder hit face free elbow,grasp gun forearm shoulder hitting face free e...
1,0.44,new york changed also rediscovery,new york change also reinvention
2,0.965,thinking summer improve give depth need get hi...,thought long hard course summer might improve ...
3,0.905,find another way cheat somewhere,find another way defraud others
4,0.746667,report replacement president administration he...,news replacement top president office come sur...


In [10]:
finnish_cleaned = pd.DataFrame()
finnish_cleaned['avg-score'] = finnish['avg-score']
for column in ['reference', 'translation']:
    finnish_cleaned[column] = clean(finnish[column], lower = True, 
                                                    remove_char = True,
                                                    lemmatize = True,
                                                    stemmer = False,
                                                    english=False)
finnish_cleaned.head()

  0%|          | 0/6748 [00:00<?, ?it/s]

  0%|          | 0/6748 [00:00<?, ?it/s]

Unnamed: 0,avg-score,reference,translation
0,0.342,voit muuttaa itsesi ananasta koirasta tai roy...,voit muuttaa itsesi ananakseksi koiraksi tai ...
1,0.584,my s ammuttiin kolme miest kaksi vuotiait...,my s kolmea miest ammuttiin kahta vuotias...
2,0.746,tiedot tallennetaan kassakoneisiin joka tapauk...,tiedot kuitenkin tallentuvat kassoilla joka ta...
3,0.536,xinhua kertoo ett xinyin n ytteest oli sunn...,xinhua kertoo ett xinyin sunnuntaina antamas...
4,0.3225,voitaisiin kuulla cbd n kommenttitiimin toimi...,macdonaldin joka tuli cbc n selostajatiimiin ...


In [11]:
chinese_cleaned = pd.DataFrame()
chinese_cleaned['avg-score'] = chinese['avg-score']
for column in ['reference', 'translation']:
    chinese_cleaned[column] = clean_zh_stopwords(chinese[column])
chinese_cleaned.head()

FileNotFoundError: [Errno 2] No such file or directory: 'chinese_stopwords/chinese_stopwords1.txt'

# Train, Dev & Test Split

In [18]:
en_train, en_dev = train_test_split(english_cleaned, shuffle = True, test_size = 0.2, random_state = 7)
en_dev, en_test = train_test_split(en_dev, shuffle = True, test_size = 0.5, random_state = 7)

##NEED TO IMPLEMENT
# fin_train, fin_dev = train_test_split(finnish, shuffle = True, test_size = 0.2, random_state = 7)
# fin_dev, fin_test = train_test_split(fin_dev, shuffle = True, test_size = 0.5, random_state = 7)

# ch_train, ch_dev = train_test_split(chinese, shuffle = True, test_size = 0.2, random_state = 7)
# ch_dev, ch_test = train_test_split(ch_dev, shuffle = True, test_size = 0.5, random_state = 7)

# Encoding (for now only bag of words)

In [25]:
encoder = CountVectorizer()
names = ['en_train', 'en_dev', 'en_test']
for i,df in enumerate([en_train, en_dev, en_test]):
    for column in ['reference', 'translation']:
        encoded_df = names[i] + '_encoded_' + column
        if i == 0:
            vars()[encoded_df] = encoder.fit_transform(df[column]).todense()
        else:
            vars()[encoded_df] = encoder.transform(df[column]).todense()
            
    y_name = 'y_' + names[i].split('_')[1]
    vars()[y_name] = np.array(df['avg-score'])

# Distance

In [20]:
cos_dev = []
for i in range(en_dev_encoded_reference.shape[0]):
    cos_dev.append(cosine_similarity(en_dev_encoded_reference[i], en_dev_encoded_translation[i])[0])
cos_dev = np.array(cos_dev)
cos_dev.shape = (cos_dev.shape[0],)

cos_test = []
for i in range(en_test_encoded_reference.shape[0]):
    cos_test.append(cosine_similarity(en_test_encoded_reference[i], en_test_encoded_translation[i])[0])
cos_test = np.array(cos_test)
cos_test.shape = (cos_test.shape[0],)

In [27]:
cleaned_corr_dev, cleaned_corr_dev_pvalue = pearsonr(y_dev, cos_dev)
cleaned_corr_test, cleaned_corr_test_pvalue = pearsonr(y_test,cos_test)

cleaned_corr_ktau_dev, cleaned_corr_ktau_dev_pvalue = kendalltau(y_dev, cos_dev)
cleaned_corr_ktau_test, cleaned_corr_ktau_test_pvalue = kendalltau(y_test,cos_test)

print(f'Pearson correlation between cosine similarity and score on development set: {cleaned_corr_dev} (p-value < 0.001: {cleaned_corr_dev_pvalue < 0.001}); and Kendall Tau: {cleaned_corr_ktau_dev} (p-value < 0.001: {cleaned_corr_ktau_dev_pvalue < 0.001})')
print(f'Pearson correlation between cosine similarity and score on test set: {cleaned_corr_test} (p-value < 0.001: {cleaned_corr_test_pvalue < 0.001}); and Kendall Tau: {cleaned_corr_ktau_test} (p-value < 0.001: {cleaned_corr_ktau_test_pvalue < 0.001})')

Pearson correlation between cosine similarity and score on development set: 0.27125392707842255 (p-value < 0.001: True); and Kendall Tau: 0.1941738794482738 (p-value < 0.001: True)
Pearson correlation between cosine similarity and score on test set: 0.26633356530702623 (p-value < 0.001: True); and Kendall Tau: 0.18264304507499624 (p-value < 0.001: True)
