In [11]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import re
import string
import nltk
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr, kendalltau
import gensim.downloader as api
import jieba

In [2]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

# Data Importing and Sorting

In [3]:
files = os.listdir('corpus')
files.remove('.DS_Store')
files.remove('scores_ru-en.csv')
scaler = MinMaxScaler()
for file_ in files:
  name = file_.split('-')[0] + file_.split('-')[1]
  vars()[name] = pd.read_csv(os.path.join('corpus', file_, 'scores.csv'))
  vars()[name].drop(columns = ['source', 'annotators', 'avg-score'], inplace = True)
  vars()[name]['z-score'] = scaler.fit_transform(vars()[name]['z-score'].values.reshape(-1,1)) #normalizing values betwewen 0 and 1

In [4]:
english = csen.copy()
for df in [deen, ruen, zhen]:
  english = english.append(df)

In [5]:
finnish = enfi.copy()
chinese = enzh.copy()

In [6]:
english.reset_index(drop = True, inplace = True)

# Cleaning the corpus (updated  cleaning function)

In [7]:
def clean(text_list,
          lower = False,
          keep_numbers = False,
          keep_expression = False,
          remove_char = False,
          remove_stop = False,
          remove_tag = False,
          lemmatize = False,
          stemmer = False,
          english = True
          ):
    """
    Function that a receives a list of strings and preprocesses it.
    
    :param text_list: List of strings.
    :param lemmatize: Tag to apply lemmatization if True.
    :param stemmer: Tag to apply the stemmer if True.
    """
    if english:
        lang = 'english'
    else:
        lang = 'finnish'
    
    stop = set(stopwords.words(lang))
    stem = SnowballStemmer(lang)
    
    updates = []
    for j in range(len(text_list)):
        
        text = text_list[j]
        
        #LOWERCASE TEXT
        if lower:
            text = text.lower()
            
        #KEEP NUMBERS AS TOKENS
        if not keep_numbers:
            text = re.sub("[\d+]", 'X', text)
        
        #KEEP '?' and '!' AS TOKENS
        if not keep_expression:
            text = re.sub("[\?|\!]", 'EXPRESSION', text)
            
        #REMOVE TAGS
        if remove_tag:
            text = BeautifulSoup(text).get_text()
            
        #REMOVE THAT IS NOT TEXT
        if remove_char:
            text = re.sub("[^a-zA-Z]", ' ', text)
        
        #REMOVE STOP WORDS
        if remove_stop:
            text = ' '.join([word for word in text.split(' ') if word not in stop])
        
        #LEMMATIZATION
        if lemmatize:
            if english:
                lemma = WordNetLemmatizer()
                text = " ".join(lemma.lemmatize(word) for word in text.split())
        
        #STEMMER
        if stemmer:
            text = " ".join(stem.stem(word) for word in text.split())
        
        updates.append(text)
        
    return updates

def clean_ch(text_list, keep_numbers=False, remove_punctuation=False, remove_stop = False, stopwords_set='merged'):
    """
    Function that removes chinese stopwords
    
    :param stopwords_set: remove words of both sets (merged), just the 1st (fst) or just the second (snd) 
    """
    updates = []
    
    zh_stopwords1 = [line.strip() for line in open('chinese_stopwords/chinese_stopwords1.txt', 'r', encoding='utf-8').readlines()]
    zh_stopwords2 = [line.strip() for line in open('chinese_stopwords/chinese_stopwords2.txt', 'r', encoding='utf-8').readlines()]
    
    if stopwords_set == 'merged':
        stop = list(set(zh_stopwords1 + zh_stopwords2))
    elif stopwords_set == 'fst':
        stop = zh_stopwords1
    elif stopwords_set == 'snd':
        stop = zh_stopwords2

    for j in range(len(text_list)):
        
        text = text_list[j]
        
        #KEEP NUMBERS AS TOKENS
        if keep_numbers:
            text = re.sub("[\d+]", 'X', text)
        
        # REMOVE PUNCTUATION
        if remove_punctuation:
            # https://stackoverflow.com/questions/36640587/how-to-remove-chinese-punctuation-in-python
            punc = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
            text = re.sub(r"[%s]+" %punc, "", text)
        
        # REMOVE STOP WORDS
        if remove_stop:
            pretext = text
            text = ' '.join([word for word in jieba.cut(text) if word not in stop])
            
        updates.append(text)
        
    return updates

In [8]:
finnish_cleaned = pd.DataFrame()
finnish_cleaned['z-score'] = finnish['z-score']
for column in ['reference', 'translation']:
    finnish_cleaned[column] = clean(finnish[column],
                                    lower = True,
                                    keep_numbers = False,
                                    keep_expression = True,
                                    remove_char = True,
                                    remove_stop = False,
                                    remove_tag = True,
                                    lemmatize = False,
                                    stemmer = True,
                                    english = False)

In [12]:
chinese_cleaned = pd.DataFrame()
chinese_cleaned['z-score'] = chinese['z-score']
for column in ['reference', 'translation']:
    chinese_cleaned[column] = clean_ch(chinese[column],
                                    keep_numbers = True,
                                    remove_punctuation = False,
                                    remove_stop = True,
                                    stopwords_set = 'merged')

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\Public\Documents\Wondershare\CreatorTemp\jieba.cache
Loading model cost 1.321 seconds.
Prefix dict has been built successfully.


# Train, Dev & Test Split

In [13]:
# en_train, en_dev = train_test_split(english, shuffle = True, test_size = 0.2, random_state = 7)
# en_dev, en_test = train_test_split(en_dev, shuffle = True, test_size = 0.5, random_state = 7)

# en_train_cleaned, en_dev_cleaned = train_test_split(english_cleaned, shuffle = True, test_size = 0.2, random_state = 7)
# en_dev_cleaned, en_test_cleaned = train_test_split(en_dev_cleaned, shuffle = True, test_size = 0.5, random_state = 7)

##NEED TO IMPLEMENT
fin_train, fin_dev = train_test_split(finnish, shuffle = True, test_size = 0.2, random_state = 7)
fin_dev, fin_test = train_test_split(fin_dev, shuffle = True, test_size = 0.5, random_state = 7)

fin_train_cleaned, fin_dev_cleaned = train_test_split(finnish_cleaned, shuffle = True, test_size = 0.2, random_state = 7)
fin_dev_cleaned, fin_test_cleaned = train_test_split(fin_dev_cleaned, shuffle = True, test_size = 0.5, random_state = 7)

ch_train, ch_dev = train_test_split(chinese, shuffle = True, test_size = 0.2, random_state = 7)
ch_dev, ch_test = train_test_split(ch_dev, shuffle = True, test_size = 0.5, random_state = 7)

ch_train_cleaned, ch_dev_cleaned = train_test_split(chinese_cleaned, shuffle = True, test_size = 0.2, random_state = 7)
ch_dev_cleaned, ch_test_cleaned = train_test_split(ch_dev_cleaned, shuffle = True, test_size = 0.5, random_state = 7)

# Not cleaned corpus

## Encoding (Word2Vec + Word Mover Distance)

In [14]:
model = api.load('word2vec-google-news-300')

In [15]:
names = ['fin_train', 'fin_dev', 'fin_test']
for j,df in enumerate([fin_train, fin_dev, fin_test]):
    name = 'distances_' + names[j]
    vars()[name] = []
    df.reset_index(drop = True, inplace = True)
    for i in tqdm(range(len(df))):
                  
        vars()[name].append(model.wmdistance(df['reference'][i], df['translation'][i]))
    
    name2 = 'score_' + names[j]
    vars()[name2] = np.array(df['z-score'])

  0%|          | 0/5398 [00:00<?, ?it/s]

  0%|          | 0/675 [00:00<?, ?it/s]

  0%|          | 0/675 [00:00<?, ?it/s]

In [16]:
names = ['ch_train', 'ch_dev', 'ch_test']
for j,df in enumerate([ch_train, ch_dev, ch_test]):
    name = 'distances_' + names[j]
    vars()[name] = []
    df.reset_index(drop = True, inplace = True)
    for i in tqdm(range(len(df))):
                  
        vars()[name].append(model.wmdistance(df['reference'][i], df['translation'][i]))
    
    name2 = 'score_' + names[j]
    vars()[name2] = np.array(df['z-score'])

  0%|          | 0/8176 [00:00<?, ?it/s]

  0%|          | 0/1022 [00:00<?, ?it/s]

  0%|          | 0/1023 [00:00<?, ?it/s]

## Calculating correlation

In [17]:
def corr(y_train_true, y_train_pred, y_dev_true, y_dev_pred, y_test_true, y_test_pred, return_corr = False):
    
    cleaned_corr_train, cleaned_corr_train_pvalue = pearsonr(y_train_true, y_train_pred)
    cleaned_corr_ktau_train, cleaned_corr_ktau_train_pvalue = kendalltau(y_train_true, y_train_pred)
    
    cleaned_corr_dev, cleaned_corr_dev_pvalue = pearsonr(y_dev_true, y_dev_pred)
    cleaned_corr_ktau_dev, cleaned_corr_ktau_dev_pvalue = kendalltau(y_dev_true, y_dev_pred)


    cleaned_corr_ktau_test, cleaned_corr_ktau_test_pvalue = kendalltau(y_test_true, y_test_pred)
    cleaned_corr_test, cleaned_corr_test_pvalue = pearsonr(y_test_true, y_test_pred)
        
    print(f'Pearson correlation between cosine similarity and score on train set: {cleaned_corr_train} (p-value < 0.001: {cleaned_corr_train_pvalue < 0.001}); and Kendall Tau: {cleaned_corr_ktau_train} (p-value < 0.001: {cleaned_corr_ktau_train_pvalue < 0.001})')
    print(f'Pearson correlation between cosine similarity and score on development set: {cleaned_corr_dev} (p-value < 0.001: {cleaned_corr_dev_pvalue < 0.001}); and Kendall Tau: {cleaned_corr_ktau_dev} (p-value < 0.001: {cleaned_corr_ktau_dev_pvalue < 0.001})')
    print(f'Pearson correlation between cosine similarity and score on test set: {cleaned_corr_test} (p-value < 0.001: {cleaned_corr_test_pvalue < 0.001}); and Kendall Tau: {cleaned_corr_ktau_test} (p-value < 0.001: {cleaned_corr_ktau_test_pvalue < 0.001})')
    
    if return_corr:
        return cleaned_corr_dev

How to treat np.inf???

In [20]:
while np.inf in distances_fin_train:
    distances_fin_train[distances_fin_train.index(np.inf)] = 0

while np.inf in distances_fin_dev:
    distances_fin_dev[distances_fin_dev.index(np.inf)] = 0
    
while np.inf in distances_fin_test:
    distances_fin_test[distances_fin_test.index(np.inf)] = 0

In [21]:
while np.inf in distances_ch_train:
    distances_ch_train[distances_ch_train.index(np.inf)] = 0

while np.inf in distances_ch_dev:
    distances_ch_dev[distances_ch_dev.index(np.inf)] = 0
    
while np.inf in distances_ch_test:
    distances_ch_test[distances_ch_test.index(np.inf)] = 0

In [22]:
corr(score_fin_train, distances_fin_train, score_fin_dev, distances_fin_dev, score_fin_test, distances_fin_test)

Pearson correlation between cosine similarity and score on train set: -0.4423001770601065 (p-value < 0.001: True); and Kendall Tau: -0.304006891633581 (p-value < 0.001: True)
Pearson correlation between cosine similarity and score on development set: -0.40423504312735 (p-value < 0.001: True); and Kendall Tau: -0.2822384195577719 (p-value < 0.001: True)
Pearson correlation between cosine similarity and score on test set: -0.35667416940720503 (p-value < 0.001: True); and Kendall Tau: -0.25342281135122874 (p-value < 0.001: True)


In [23]:
corr(score_ch_train, distances_ch_train, score_ch_dev, distances_ch_dev, score_ch_test, distances_ch_test)

Pearson correlation between cosine similarity and score on train set: -0.3213889382134847 (p-value < 0.001: True); and Kendall Tau: -0.23975835475012616 (p-value < 0.001: True)
Pearson correlation between cosine similarity and score on development set: -0.3497285570880814 (p-value < 0.001: True); and Kendall Tau: -0.24686863006174922 (p-value < 0.001: True)
Pearson correlation between cosine similarity and score on test set: -0.32907919454326545 (p-value < 0.001: True); and Kendall Tau: -0.25310354525228784 (p-value < 0.001: True)


# Cleaned corpus

## Encoding (Word2Vec + Word Mover Distance)

In [24]:
# model = api.load('word2vec-google-news-300')
names = ['fin_train_cleaned', 'fin_dev_cleaned', 'fin_test_cleaned']
for j,df in enumerate([fin_train_cleaned, fin_dev_cleaned, fin_test_cleaned]):
    name = 'distances_' + names[j]
    vars()[name] = []
    df.reset_index(drop = True, inplace = True)
    for i in tqdm(range(len(df))):
                  
        vars()[name].append(model.wmdistance(df['reference'][i], df['translation'][i]))
    
    name2 = 'score_' + names[j]
    vars()[name2] = np.array(df['z-score'])

  0%|          | 0/5398 [00:00<?, ?it/s]

  0%|          | 0/675 [00:00<?, ?it/s]

  0%|          | 0/675 [00:00<?, ?it/s]

In [25]:
# model = api.load('word2vec-google-news-300')
names = ['ch_train_cleaned', 'ch_dev_cleaned', 'ch_test_cleaned']
for j,df in enumerate([ch_train_cleaned, ch_dev_cleaned, ch_test_cleaned]):
    name = 'distances_' + names[j]
    vars()[name] = []
    df.reset_index(drop = True, inplace = True)
    for i in tqdm(range(len(df))):
                  
        vars()[name].append(model.wmdistance(df['reference'][i], df['translation'][i]))
    
    name2 = 'score_' + names[j]
    vars()[name2] = np.array(df['z-score'])

  0%|          | 0/8176 [00:00<?, ?it/s]

  0%|          | 0/1022 [00:00<?, ?it/s]

  0%|          | 0/1023 [00:00<?, ?it/s]

## Calculating correlation

In [26]:
while np.inf in distances_fin_train_cleaned:
    distances_fin_train_cleaned[distances_fin_train_cleaned.index(np.inf)] = 0

while np.inf in distances_fin_dev_cleaned:
    distances_fin_dev_cleaned[distances_fin_dev_cleaned.index(np.inf)] = 0
    
while np.inf in distances_fin_test_cleaned:
    distances_fin_test_cleaned[distances_fin_test_cleaned.index(np.inf)] = 0

In [27]:
while np.inf in distances_ch_train_cleaned:
    distances_ch_train_cleaned[distances_ch_train_cleaned.index(np.inf)] = 0

while np.inf in distances_ch_dev_cleaned:
    distances_ch_dev_cleaned[distances_ch_dev_cleaned.index(np.inf)] = 0
    
while np.inf in distances_ch_test_cleaned:
    distances_ch_test_cleaned[distances_ch_test_cleaned.index(np.inf)] = 0

In [28]:
corr(score_fin_train_cleaned, distances_fin_train_cleaned, score_fin_dev_cleaned, distances_fin_dev_cleaned, score_fin_test_cleaned, distances_fin_test_cleaned)

Pearson correlation between cosine similarity and score on train set: -0.43161416179470985 (p-value < 0.001: True); and Kendall Tau: -0.29760017477818435 (p-value < 0.001: True)
Pearson correlation between cosine similarity and score on development set: -0.3815949530749775 (p-value < 0.001: True); and Kendall Tau: -0.27013049060618927 (p-value < 0.001: True)
Pearson correlation between cosine similarity and score on test set: -0.3367278955650935 (p-value < 0.001: True); and Kendall Tau: -0.24948405560974188 (p-value < 0.001: True)


In [29]:
corr(score_ch_train_cleaned, distances_ch_train_cleaned, score_ch_dev_cleaned, distances_ch_dev_cleaned, score_ch_test_cleaned, distances_ch_test_cleaned)

Pearson correlation between cosine similarity and score on train set: -0.2956892364499134 (p-value < 0.001: True); and Kendall Tau: -0.2100104363215054 (p-value < 0.001: True)
Pearson correlation between cosine similarity and score on development set: -0.30223590648625553 (p-value < 0.001: True); and Kendall Tau: -0.21777037176121378 (p-value < 0.001: True)
Pearson correlation between cosine similarity and score on test set: -0.32710221458107114 (p-value < 0.001: True); and Kendall Tau: -0.23194633772384848 (p-value < 0.001: True)
