# Text Mining Project

### NOVA IMS MT Metrics Shared Task

**Group members:**
- Lorenzo Pigozzi	--- m20200745
- Davide Farinati
- Antonio
- Luis Reis

## 1. Importing libraries and corpora <a class="anchor" id="1"></a>

In [1]:
# general libraries
import pandas as pd
import numpy as np
import re 
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm

# word's preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer
from nltk.translate.bleu_score import sentence_bleu
from bs4 import BeautifulSoup
import string

# sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

# warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# importing the corpora
# cs_en = pd.read_csv('corpus\cs-en\scores.csv')
de_en = pd.read_csv('corpus\de-en\scores.csv')
# ru_en = pd.read_csv('corpus\scores_ru-en.csv')
# zh_en = pd.read_csv('corpus\zh-en\scores.csv')
# en_fi = pd.read_csv('corpus\en-fi\scores.csv')
# en_zh = pd.read_csv('corpus\en-zh\scores.csv')

In [100]:
de_en.head()

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators
0,"Ihr Zeitlupentempo maßen sie, als sie vor Spit...",Her timeless pace measures them when they equi...,Their slow speed was measured by researchers o...,-0.345024,76.0,1
1,"Er sagte, dass die Bereiche ruhige Treffpunkte...",He said the areas offer quiet meeting points b...,He said the spaces provided calm meeting point...,0.9038,97.5,2
2,Für die Geschäftsleute an der B 27 ist es nur ...,"For businessmen at the B 27, it's only a small...",This is only a small consolation for businesse...,0.700503,94.0,1
3,Diese Fähigkeit sei möglicherweise angeboren o...,This ability may be born or developed with gen...,"This ability may be innate, or may develop as ...",-1.256572,51.5,2
4,Weil sie Wassertemperaturen um die sechs Grad ...,Because they prefer water temperatures around ...,They generally only come to the surface in win...,0.293909,87.0,2


#  3. Pre-processing <a class="anchor" id="3."></a>

In [101]:
# selecting the necessary variables for the baseline
deen_reference = de_en[['source','reference']]
deen_translation = de_en[['source','translation']]

In [102]:
deen_reference.head()

Unnamed: 0,source,reference
0,"Ihr Zeitlupentempo maßen sie, als sie vor Spit...",Her timeless pace measures them when they equi...
1,"Er sagte, dass die Bereiche ruhige Treffpunkte...",He said the areas offer quiet meeting points b...
2,Für die Geschäftsleute an der B 27 ist es nur ...,"For businessmen at the B 27, it's only a small..."
3,Diese Fähigkeit sei möglicherweise angeboren o...,This ability may be born or developed with gen...
4,Weil sie Wassertemperaturen um die sechs Grad ...,Because they prefer water temperatures around ...


## 3.1. Cleaning <a class="anchor" id="3.1."></a>

In [103]:
def clean(text_list,
          lower = False,
          keep_numbers = False,
          keep_expression = False,
          remove_char = False,
          remove_stop = False,
          remove_tag = False,
          lemmatize = False,
          stemmer = False,
          english = True
          ):
    """
    Function that a receives a list of strings and preprocesses it.
    
    :param text_list: List of strings.
    :param lemmatize: Tag to apply lemmatization if True.
    :param stemmer: Tag to apply the stemmer if True.
    """
    if english:
        lang = 'english'
    else:
        lang = 'german'
    
    stop = set(stopwords.words(lang))
    stem = SnowballStemmer(lang)
    
    updates = []
    for j in tqdm(range(len(text_list))):
        
        text = text_list[j]
        
        #LOWERCASE TEXT
        if lower:
            text = text.lower()
            
        #KEEP NUMBERS AS TOKENS
        if keep_numbers:
            text = re.sub("[\d+]", 'NUMBER', text)
        
        #KEEP '?' and '!' AS TOKENS
        if keep_expression:
            text = re.sub("[\?|\!]", 'EXPRESSION', text)
            
        #REMOVE THAT IS NOT TEXT
        if remove_char:
            text = re.sub("[^a-zA-Z]", ' ', text)
            
        #REMOVE TAGS
        if remove_tag:
            text = BeautifulSoup(text).get_text()
        
        #REMOVE STOP WORDS
        if remove_stop:
            text = ' '.join([word for word in text.split(' ') if word not in stop])
        
        #LEMMATIZATION
        if lemmatize:
            if english:
                lemma = WordNetLemmatizer()
                text = " ".join(lemma.lemmatize(word) for word in text.split())
#             else:
#                 lemma = libvoikko.Voikko(u"fi")
#                 text = " ".join(lemma.analyze(word)[0]['BASEFORM'] for word in text.split())
        
        #STEMMER
        if stemmer:
            text = " ".join(stem.stem(word) for word in text.split())
        
        updates.append(text)
        
    return updates

def clean_zh_stopwords(text_list, stopwords_set='merged'):
    """
    Function that removes chinese stopwords
    
    :param stopwords_set: remove words of both sets (merged), just the 1st (fst) or just the second (snd) 
    """
    updates = []
    
    zh_stopwords1 = [line.strip() for line in open('chinese_stopwords/chinese_stopwords1.txt', 'r', encoding='utf-8').readlines()]
    zh_stopwords2 = [line.strip() for line in open('chinese_stopwords/chinese_stopwords2.txt', 'r', encoding='utf-8').readlines()]
    
    if stopwords_set == 'merged':
        stop = list(set(zh_stopwords1 + zh_stopwords2))
    elif stopwords_set == 'fst':
        stop = zh_stopwords1
    elif stopwords_set == 'snd':
        stop = zh_stopwords2
        

    for j in tqdm(range(len(text_list))):
        text = text_list[j]
        text = ' '.join([word for word in text.split(' ') if word not in stop])
        updates.append(text)
        
    return updates
        
    
def update_df(dataframe, list_updated):
    dataframe.update(pd.DataFrame({"Text": list_updated}))

In [117]:
deen_reference['source'] = clean(deen_reference['source'], 
      lower = True, 
      remove_char = True,
      remove_stop = True,
      lemmatize = True,
      stemmer = False,
     english = False)

deen_reference['reference'] = clean(deen_reference['reference'], 
      lower = True, 
      remove_char = True,
      remove_stop = True,
      lemmatize = True,
      stemmer = False,
     english = True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=21704.0), HTML(value='')))




In [118]:
deen_translation['source'] = clean(deen_translation['source'], 
      lower = True, 
      remove_char = True,
      remove_stop = True,
      lemmatize = True,
      stemmer = False,
     english = False)

deen_translation['translation'] = clean(deen_translation['translation'], 
      lower = True, 
      remove_char = True,
      remove_stop = True,
      lemmatize = True,
      stemmer = False,
     english = True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=21704.0), HTML(value='')))




In [119]:
deen_reference.head()

Unnamed: 0,source,reference
0,zeitlupentempo ma en spitzbergen sechs tiere ...,timeless pace measure equipped six animal broa...
1,sagte bereiche ruhige treffpunkte fl chtlinge...,said area offer quiet meeting point refugee vo...
2,f r gesch ftsleute b kleiner trost f r kun...,businessmen b small consolation customer rotte...
3,f higkeit sei m glicherweise angeboren entwick...,ability may born developed gender maturity
4,wassertemperaturen sechs grad celsius bevorzug...,prefer water temperature around six degree cel...


In [120]:
deen_translation.head()

Unnamed: 0,source,translation
0,zeitlupentempo ma en spitzbergen sechs tiere ...,slow speed measured researcher svalbard fitted...
1,sagte bereiche ruhige treffpunkte fl chtlinge...,said space provided calm meeting point refugee...
2,f r gesch ftsleute b kleiner trost f r kun...,small consolation business located along b roa...
3,f higkeit sei m glicherweise angeboren entwick...,ability may innate may develop animal reach se...
4,wassertemperaturen sechs grad celsius bevorzug...,generally come surface winter prefer water tem...


# LaBSE

https://ai.googleblog.com/2020/08/language-agnostic-bert-sentence.html \
https://arxiv.org/abs/2007.01852 \
https://tfhub.dev/google/LaBSE/1 \
https://pytorch.org/docs/stable/generated/torch.matmul.html

\
Pre-trained Model: https://huggingface.co/sentence-transformers/LaBSE

In [13]:
import tensorflow as tf
import torch
# from transformers import BertModel, BertTokenizerFast
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F

OSError: Can't load tokenizer for 'setu4993/LaBSE'. Make sure that:

- 'setu4993/LaBSE' is a correct model identifier listed on 'https://huggingface.co/models'

- or 'setu4993/LaBSE' is the correct path to a directory containing relevant tokenizer files



In [None]:
# For similarity between sentences, an L2-norm is recommended before calculating the similarity
def similarity(embeddings_1, embeddings_2):
    normalized_embeddings_1 = F.normalize(embeddings_1, p=2)
    normalized_embeddings_2 = F.normalize(embeddings_2, p=2)
    return torch.matmul(
        normalized_embeddings_1, normalized_embeddings_2.transpose(0, 1)
    )

In [14]:
# tokenizer = BertTokenizerFast.from_pretrained("setu4993/LaBSE")
# model = BertModel.from_pretrained("setu4993/LaBSE")
# model = model.eval()

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")
model = AutoModel.from_pretrained("sentence-transformers/LaBSE")

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=654.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=5220781.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=112.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=49.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1888175414.0), HTML(value='')))




Some weights of the model checkpoint at sentence-transformers/LaBSE were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [134]:
de_en.head()

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators
0,"Ihr Zeitlupentempo maßen sie, als sie vor Spit...",Her timeless pace measures them when they equi...,Their slow speed was measured by researchers o...,-0.345024,76.0,1
1,"Er sagte, dass die Bereiche ruhige Treffpunkte...",He said the areas offer quiet meeting points b...,He said the spaces provided calm meeting point...,0.9038,97.5,2
2,Für die Geschäftsleute an der B 27 ist es nur ...,"For businessmen at the B 27, it's only a small...",This is only a small consolation for businesse...,0.700503,94.0,1
3,Diese Fähigkeit sei möglicherweise angeboren o...,This ability may be born or developed with gen...,"This ability may be innate, or may develop as ...",-1.256572,51.5,2
4,Weil sie Wassertemperaturen um die sechs Grad ...,Because they prefer water temperatures around ...,They generally only come to the surface in win...,0.293909,87.0,2


In [121]:
deen_reference.head()

Unnamed: 0,source,reference
0,zeitlupentempo ma en spitzbergen sechs tiere ...,timeless pace measure equipped six animal broa...
1,sagte bereiche ruhige treffpunkte fl chtlinge...,said area offer quiet meeting point refugee vo...
2,f r gesch ftsleute b kleiner trost f r kun...,businessmen b small consolation customer rotte...
3,f higkeit sei m glicherweise angeboren entwick...,ability may born developed gender maturity
4,wassertemperaturen sechs grad celsius bevorzug...,prefer water temperature around six degree cel...


In [126]:
deen_translation.head()

Unnamed: 0,source,translation
0,zeitlupentempo ma en spitzbergen sechs tiere ...,slow speed measured researcher svalbard fitted...
1,sagte bereiche ruhige treffpunkte fl chtlinge...,said space provided calm meeting point refugee...
2,f r gesch ftsleute b kleiner trost f r kun...,small consolation business located along b roa...
3,f higkeit sei m glicherweise angeboren entwick...,ability may innate may develop animal reach se...
4,wassertemperaturen sechs grad celsius bevorzug...,generally come surface winter prefer water tem...


In [127]:
german_source = list(deen_reference['source'].head(100))
english_reference = list(deen_reference['reference'].head(100))
english_translation = list(deen_translation['translation'].head(100))

# german_source = list(deen_reference['source'])
# english_reference = list(deen_reference['reference'])

In [128]:
german_inputs = tokenizer(german_source, return_tensors="pt", padding=True)
english_reference_inputs = tokenizer(english_reference, return_tensors="pt", padding=True)
english_translation_inputs = tokenizer(english_translation, return_tensors="pt", padding=True)

with torch.no_grad():
    german_outputs = model(**german_inputs)
    english_reference_outputs = model(**english_reference_inputs)
    english_translation_outputs = model(**english_translation_inputs)

In [129]:
# To get the sentence embeddings, use the pooler output
german_embeddings = german_outputs.pooler_output
english_reference_embeddings = english_reference_outputs.pooler_output
english_translation_embeddings = english_translation_outputs.pooler_output

In [130]:
matrix_reference = similarity(german_embeddings, english_reference_embeddings)
diagonal_reference = pd.Series(tf.linalg.tensor_diag_part(matrix_reference))

matrix_translation = similarity(german_embeddings, english_translation_embeddings)
diagonal_translation = pd.Series(tf.linalg.tensor_diag_part(matrix_translation))

In [131]:
result = pd.concat([diagonal_reference, diagonal_translation, de_en['z-score'].head(100), 
                    de_en['avg-score'].head(100)], axis = 1)
result.columns = ['source_reference_similarity', 'source_translation_similarity', 'z-score', 'avg-score']
result['difference_similarity'] = result['source_reference_similarity'] - result['source_translation_similarity']

In [137]:
print('Pearson correlation difference_similarity and z-score: ')
result[['z-score', 'difference_similarity']].corr(method='pearson').iloc[1:2,:1].values[0][0]

Pearson correlation difference_similarity and z-score: 


-0.046691809705358835

In [141]:
print('Pearson correlation source_translation_similarity and z-score: ')
print(result[['z-score', 'source_translation_similarity']].corr(method='pearson').iloc[1:2,:1].values[0][0])
print('---------------------------------------------------------------------')
print('Kendall correlation source_translation_similarity and z-score: ')
print(result[['z-score', 'source_translation_similarity']].corr(method='kendall').iloc[1:2,:1].values[0][0])

Pearson correlation source_translation_similarity and z-score: 
0.10183868388061088
---------------------------------------------------------------------
Kendall correlation source_translation_similarity and z-score: 
0.0386131704867555
