# Text Mining Project

### NOVA IMS MT Metrics Shared Task

**Group members:**
- Lorenzo Pigozzi	--- m20200745
- Davide Farinati
- Antonio
- Luis Reis

## 1. Importing libraries and corpora <a class="anchor" id="1"></a>

In [93]:
# general libraries
import pandas as pd
import numpy as np
import re 
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm

# word's preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer
from nltk.translate.bleu_score import sentence_bleu
from bs4 import BeautifulSoup
import string

# sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

import jieba

# warnings
import warnings
warnings.filterwarnings("ignore")

In [119]:
# importing the corpora
# Forward slashes are used on purpose: in a normal string literal '\z', '\s', '\d'
# and '\e' are invalid escape sequences, and backslash separators only resolve on
# Windows. Forward slashes work on every platform pandas runs on.
# cs_en = pd.read_csv('corpus/cs-en/scores.csv')
# de_en = pd.read_csv('corpus/de-en/scores.csv')
# ru_en = pd.read_csv('corpus/scores_ru-en.csv')
zh_en = pd.read_csv('corpus/zh-en/scores.csv')
# en_fi = pd.read_csv('corpus/en-fi/scores.csv')
# en_zh = pd.read_csv('corpus/en-zh/scores.csv')

In [120]:
zh_en.head()

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators
0,他性格活泼，这对英国赛马来说是好事，但是除此之外，他还是一位不可思议的骑师。,"His character is good for the British horse, b...",He's a lively character which is good for Brit...,0.625559,92.75,4
1,近日刚搬至旧金山的一位28岁厨师本周被发现死于当地一家商场的楼梯间。,"A 28 chef, who has just moved to San Francisco...",A 28-year-old chef who had recently moved to S...,0.550952,92.0,4
2,去年，有官员表示，胡克先生的团队所得出的结论是针对伊斯兰国炼油厂的空袭并未大幅削减恐怖组织的...,"Last year, officials said Mr. Hooker's team ha...","Last year, officials said, Mr. Hooker's team c...",0.540814,89.0,5
3,尤其值得玩味的是政府对于饥饿民众们的回应，比如总统市民赫伯特·胡佛“别人的事我可管不了”的态度。,It is particularly interesting to note the gov...,Especially savory are the accounts of the gove...,-0.793944,49.5,4
4,不过，从20世纪90年代至今，人类共进行了18次火星探测，而月球探测只进行了14次。,"However, from the 1990s to the present, human ...","However, ever since the 1990s, a total of 18 h...",0.046532,77.5,4


In [121]:
# work on a copy so the frame loaded from disk (zh_en) is left untouched
corpus = zh_en.copy()

# 3. Pre-processing <a class="anchor" id="3."></a>

In [122]:
# selecting the necessary variables for the baseline
# .copy() gives each frame its own data: the cleaning cells assign new columns to
# these frames, and assigning into a plain slice of `corpus` is chained assignment
# (pandas SettingWithCopyWarning, hidden here by warnings.filterwarnings).
source_reference = corpus[['source', 'reference']].copy()
source_translation = corpus[['source', 'translation']].copy()

In [123]:
source_reference.head()

Unnamed: 0,source,reference
0,他性格活泼，这对英国赛马来说是好事，但是除此之外，他还是一位不可思议的骑师。,"His character is good for the British horse, b..."
1,近日刚搬至旧金山的一位28岁厨师本周被发现死于当地一家商场的楼梯间。,"A 28 chef, who has just moved to San Francisco..."
2,去年，有官员表示，胡克先生的团队所得出的结论是针对伊斯兰国炼油厂的空袭并未大幅削减恐怖组织的...,"Last year, officials said Mr. Hooker's team ha..."
3,尤其值得玩味的是政府对于饥饿民众们的回应，比如总统市民赫伯特·胡佛“别人的事我可管不了”的态度。,It is particularly interesting to note the gov...
4,不过，从20世纪90年代至今，人类共进行了18次火星探测，而月球探测只进行了14次。,"However, from the 1990s to the present, human ..."


## 3.1. Cleaning <a class="anchor" id="3.1."></a>

In [124]:
def clean(text_list,
          lower = False,
          keep_numbers = False,
          keep_expression = False,
          remove_char = False,
          remove_stop = False,
          remove_tag = False,
          lemmatize = False,
          stemmer = False,
          english = True
          ):
    """
    Function that receives a list of strings and preprocesses it.

    The enabled steps are applied to every string in this order: tag removal,
    lowercasing, number placeholder, expression placeholder, character
    filtering, stop-word removal, lemmatization, stemming.

    :param text_list: List (or other iterable, e.g. a pandas Series) of strings.
    :param lower: Lowercase the text if True.
    :param keep_numbers: Replace every run of digits with the token 'NUMBER' if True.
    :param keep_expression: Replace every '?' and '!' with the token 'EXPRESSION' if True.
    :param remove_char: Replace every character that is not an ASCII letter with a space if True.
    :param remove_stop: Drop the NLTK stop words of the selected language if True.
    :param remove_tag: Strip markup with BeautifulSoup if True.
    :param lemmatize: Tag to apply lemmatization if True (only applied when english is True).
    :param stemmer: Tag to apply the stemmer if True.
    :param english: Use the English stop words/stemmer if True, the German ones otherwise.
    :return: List with the preprocessed strings, in the same order as the input.
    """
    lang = 'english' if english else 'german'

    stop = set(stopwords.words(lang))
    stem = SnowballStemmer(lang)
    # Built once instead of once per sentence; only the English branch lemmatizes.
    lemma = WordNetLemmatizer() if (lemmatize and english) else None

    updates = []
    # Iterate over the values themselves: text_list[j] is a *label* lookup on a
    # pandas Series and fails (or picks the wrong row) when the index is not 0..n-1.
    for text in tqdm(text_list):

        #REMOVE TAGS
        # Must run before any other step: remove_char deletes '<' and '>', and the
        # placeholder substitutions would mangle tags and character entities.
        if remove_tag:
            text = BeautifulSoup(text, 'html.parser').get_text()

        #LOWERCASE TEXT
        if lower:
            text = text.lower()

        #KEEP NUMBERS AS TOKENS
        # r"\d+" matches a whole run of digits; the former "[\d+]" was a character
        # class that replaced each digit separately and also every literal '+'.
        if keep_numbers:
            text = re.sub(r"\d+", 'NUMBER', text)

        #KEEP '?' and '!' AS TOKENS
        # Inside a character class '|' is a literal, so it is left out on purpose.
        if keep_expression:
            text = re.sub(r"[?!]", 'EXPRESSION', text)

        #REMOVE EVERYTHING THAT IS NOT A LETTER
        if remove_char:
            text = re.sub(r"[^a-zA-Z]", ' ', text)

        #REMOVE STOP WORDS
        if remove_stop:
            text = ' '.join([word for word in text.split(' ') if word not in stop])

        #LEMMATIZATION
        if lemma is not None:
            text = " ".join(lemma.lemmatize(word) for word in text.split())
#             else:
#                 lemma = libvoikko.Voikko(u"fi")
#                 text = " ".join(lemma.analyze(word)[0]['BASEFORM'] for word in text.split())

        #STEMMER
        if stemmer:
            text = " ".join(stem.stem(word) for word in text.split())

        updates.append(text)

    return updates

def clean_ch(text_list, keep_numbers=False, remove_punctuation=False, remove_stop = False, stopwords_set='merged'):
    """
    Function that preprocesses a list of chinese strings.

    The enabled steps are applied to every string in this order: number
    placeholder, punctuation removal, stop-word removal.

    :param text_list: List (or other iterable, e.g. a pandas Series) of strings.
    :param keep_numbers: Replace every run of digits with the token 'X' if True.
    :param remove_punctuation: Delete the (mostly full-width) punctuation listed below if True.
    :param remove_stop: Segment the text with jieba, drop the stop words and join the
        remaining tokens with single spaces if True.
    :param stopwords_set: remove words of both sets (merged), just the 1st (fst) or just the second (snd)
    :return: List with the preprocessed strings, in the same order as the input.
    :raises ValueError: if remove_stop is True and stopwords_set is not one of the three options.
    """
    # The stop-word files are only needed (and only read) when remove_stop is set.
    stop = set()
    if remove_stop:
        first = 'chinese_stopwords/chinese_stopwords1.txt'
        second = 'chinese_stopwords/chinese_stopwords2.txt'
        files_by_option = {'merged': (first, second), 'fst': (first,), 'snd': (second,)}
        if stopwords_set not in files_by_option:
            raise ValueError(
                "stopwords_set must be 'merged', 'fst' or 'snd', got %r" % (stopwords_set,)
            )
        for path in files_by_option[stopwords_set]:
            # 'with' closes the file; a set gives O(1) membership tests below.
            with open(path, 'r', encoding='utf-8') as stop_file:
                stop.update(line.strip() for line in stop_file)

    # r"\d+" matches a whole run of digits; the former "[\d+]" was a character
    # class that replaced each digit separately and also every literal '+'.
    number_pattern = re.compile(r"\d+")

    # https://stackoverflow.com/questions/36640587/how-to-remove-chinese-punctuation-in-python
    punc = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
    # re.escape keeps the pattern valid even if a regex metacharacter is added to punc.
    punctuation_pattern = re.compile("[%s]+" % re.escape(punc))

    updates = []
    # Iterate over the values themselves: text_list[j] is a *label* lookup on a
    # pandas Series and fails (or picks the wrong row) when the index is not 0..n-1.
    for text in text_list:

        #KEEP NUMBERS AS TOKENS
        if keep_numbers:
            text = number_pattern.sub('X', text)

        # REMOVE PUNCTUATION
        if remove_punctuation:
            text = punctuation_pattern.sub("", text)

        # REMOVE STOP WORDS
        if remove_stop:
            text = ' '.join([word for word in jieba.cut(text) if word not in stop])

        updates.append(text)

    return updates

def update_df(dataframe, list_updated, column="Text"):
    """
    Overwrite one column of a DataFrame in place with new values.

    The values are wrapped in a one-column DataFrame (default 0..n-1 index) and
    passed to ``DataFrame.update``, which matches rows by index label.

    :param dataframe: DataFrame modified in place.
    :param list_updated: New values for the column.
    :param column: Name of the column to overwrite; defaults to "Text", the name
        that used to be hard-coded.
    :return: None.
    """
    dataframe.update(pd.DataFrame({column: list_updated}))

In [125]:
source_reference.head()

Unnamed: 0,source,reference
0,他性格活泼，这对英国赛马来说是好事，但是除此之外，他还是一位不可思议的骑师。,"His character is good for the British horse, b..."
1,近日刚搬至旧金山的一位28岁厨师本周被发现死于当地一家商场的楼梯间。,"A 28 chef, who has just moved to San Francisco..."
2,去年，有官员表示，胡克先生的团队所得出的结论是针对伊斯兰国炼油厂的空袭并未大幅削减恐怖组织的...,"Last year, officials said Mr. Hooker's team ha..."
3,尤其值得玩味的是政府对于饥饿民众们的回应，比如总统市民赫伯特·胡佛“别人的事我可管不了”的态度。,It is particularly interesting to note the gov...
4,不过，从20世纪90年代至今，人类共进行了18次火星探测，而月球探测只进行了14次。,"However, from the 1990s to the present, human ..."


In [126]:
source_translation.head()

Unnamed: 0,source,translation
0,他性格活泼，这对英国赛马来说是好事，但是除此之外，他还是一位不可思议的骑师。,He's a lively character which is good for Brit...
1,近日刚搬至旧金山的一位28岁厨师本周被发现死于当地一家商场的楼梯间。,A 28-year-old chef who had recently moved to S...
2,去年，有官员表示，胡克先生的团队所得出的结论是针对伊斯兰国炼油厂的空袭并未大幅削减恐怖组织的...,"Last year, officials said, Mr. Hooker's team c..."
3,尤其值得玩味的是政府对于饥饿民众们的回应，比如总统市民赫伯特·胡佛“别人的事我可管不了”的态度。,Especially savory are the accounts of the gove...
4,不过，从20世纪90年代至今，人类共进行了18次火星探测，而月球探测只进行了14次。,"However, ever since the 1990s, a total of 18 h..."


In [127]:
# Chinese source: jieba segmentation + removal of the second stop-word list.
# Options left out keep their defaults (keep_numbers=False, remove_punctuation=False).
source_reference['source'] = clean_ch(
    source_reference['source'],
    remove_stop=True,
    stopwords_set='snd',
)

# English reference: lowercase, letters only, no stop words, lemmatized.
# Options left out keep their defaults (stemmer=False, english=True).
source_reference['reference'] = clean(
    source_reference['reference'],
    lower=True,
    remove_char=True,
    remove_stop=True,
    lemmatize=True,
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26419.0), HTML(value='')))




In [128]:
# Chinese source: jieba segmentation + removal of the second stop-word list.
# Options left out keep their defaults (keep_numbers=False, remove_punctuation=False).
source_translation['source'] = clean_ch(
    source_translation['source'],
    remove_stop=True,
    stopwords_set='snd',
)

# English translation: lowercase, letters only, no stop words, lemmatized.
# Options left out keep their defaults (stemmer=False, english=True).
source_translation['translation'] = clean(
    source_translation['translation'],
    lower=True,
    remove_char=True,
    remove_stop=True,
    lemmatize=True,
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26419.0), HTML(value='')))




In [129]:
source_translation.head()

Unnamed: 0,source,translation
0,性格 活泼 英国 赛马 好事 一位 不可思议 骑师,lively character good british racing incredibl...
1,近日 刚 搬 旧金山 一位 28 岁 厨师 本周 发现 死 一家 商场 楼梯间,year old chef recently moved san francisco fou...
2,去年 官员 表示 胡克 先生 团队 得出 结论 伊斯兰 国 炼油厂 空袭 并未 大幅 削减 ...,last year official said mr hooker team conclud...
3,尤其 值得 玩味 政府 饥饿 民众 回应 总统 市民 赫伯特 · 胡佛 “ 事 管不了 ” 态度,especially savory account government response ...
4,20 世纪 90 年代 人类 进行 18 次 火星 探测 月球 探测 进行 14 次,however ever since total human exploration car...


In [130]:
source_reference.head()

Unnamed: 0,source,reference
0,性格 活泼 英国 赛马 好事 一位 不可思议 骑师,character good british horse addition incredib...
1,近日 刚 搬 旧金山 一位 28 岁 厨师 本周 发现 死 一家 商场 楼梯间,chef moved san francisco found dead stair loca...
2,去年 官员 表示 胡克 先生 团队 得出 结论 伊斯兰 国 炼油厂 空袭 并未 大幅 削减 ...,last year official said mr hooker team conclud...
3,尤其 值得 玩味 政府 饥饿 民众 回应 总统 市民 赫伯特 · 胡佛 “ 事 管不了 ” 态度,particularly interesting note government respo...
4,20 世纪 90 年代 人类 进行 18 次 火星 探测 月球 探测 进行 14 次,however present human being carried mar probe ...


# LaBSE

https://ai.googleblog.com/2020/08/language-agnostic-bert-sentence.html \
https://arxiv.org/abs/2007.01852 \
https://tfhub.dev/google/LaBSE/1 \
https://pytorch.org/docs/stable/generated/torch.matmul.html

\
Pre-trained Model: https://huggingface.co/sentence-transformers/LaBSE

In [73]:
import tensorflow as tf
import torch
# from transformers import BertModel, BertTokenizerFast
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F

In [74]:
# For similarity between sentences, an L2-norm is recommended before calculating the similarity
def similarity(embeddings_1, embeddings_2):
    """
    Return the pairwise similarity matrix of two batches of embeddings.

    Both batches are L2-normalised with ``F.normalize`` and the first one is
    multiplied by the second one with its first two dimensions swapped, so
    entry [i, j] is the dot product of normalised row i of ``embeddings_1``
    and normalised row j of ``embeddings_2``.

    :param embeddings_1: Tensor of embeddings (assumed 2-D, one row per sentence).
    :param embeddings_2: Tensor of embeddings (assumed 2-D, one row per sentence).
    :return: Tensor with the similarity of every row pair.
    """
    unit_1, unit_2 = (
        F.normalize(batch, p=2) for batch in (embeddings_1, embeddings_2)
    )
    return unit_1 @ unit_2.transpose(0, 1)

In [131]:
# Earlier variant that loaded the "setu4993/LaBSE" checkpoint, kept for reference:
# tokenizer = BertTokenizerFast.from_pretrained("setu4993/LaBSE")
# model = BertModel.from_pretrained("setu4993/LaBSE")
# model = model.eval()

# Load the tokenizer and the encoder of the "sentence-transformers/LaBSE" checkpoint
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")
model = AutoModel.from_pretrained("sentence-transformers/LaBSE")

Some weights of the model checkpoint at sentence-transformers/LaBSE were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [132]:
# Only the first 500 sentence pairs are embedded
sources = source_reference['source'].head(500).tolist()
references = source_reference['reference'].head(500).tolist()
translations = source_translation['translation'].head(500).tolist()

# german_source = list(deen_reference['source'])
# english_reference = list(deen_reference['reference'])

In [133]:
# Tokenize each list as one padded batch of PyTorch tensors.
# truncation=True caps every sentence at the tokenizer's maximum length, so a
# single over-long sentence cannot exceed what the encoder accepts and abort the
# whole batch; sentences within the limit are tokenized exactly as before.
source_inputs = tokenizer(sources, return_tensors="pt", padding=True, truncation=True)
reference_inputs = tokenizer(references, return_tensors="pt", padding=True, truncation=True)
translation_inputs = tokenizer(translations, return_tensors="pt", padding=True, truncation=True)

# Inference only: no_grad skips gradient tracking for the three forward passes
with torch.no_grad():
    source_outputs = model(**source_inputs)
    reference_outputs = model(**reference_inputs)
    translation_outputs = model(**translation_inputs)

In [134]:
# The sentence embedding of each batch is the pooler output of its forward pass
source_embeddings, reference_embeddings, translation_embeddings = (
    outputs.pooler_output
    for outputs in (source_outputs, reference_outputs, translation_outputs)
)

In [135]:
# Entry [i, i] of each similarity matrix is the similarity of sentence pair i.
# The diagonal is extracted with torch.diagonal so the whole computation stays in
# PyTorch; the previous tf.linalg.tensor_diag_part call passed a torch tensor to a
# TensorFlow op and wrapped a TensorFlow tensor in the Series.
matrix_reference = similarity(source_embeddings, reference_embeddings)
diagonal_reference = pd.Series(torch.diagonal(matrix_reference).detach().cpu().numpy())

matrix_translation = similarity(source_embeddings, translation_embeddings)
diagonal_translation = pd.Series(torch.diagonal(matrix_translation).detach().cpu().numpy())

In [136]:
# One row per sentence pair: both similarities next to the human scores
# (columns are aligned on the index by pd.concat)
result = pd.concat(
    [
        diagonal_reference.rename('source_reference_similarity'),
        diagonal_translation.rename('source_translation_similarity'),
        corpus[['z-score', 'avg-score']].head(500),
    ],
    axis=1,
)
# how much closer the source is to the reference than to the translation
result['difference_similarity'] = result['source_reference_similarity'].sub(
    result['source_translation_similarity']
)

In [112]:
# Correlation report between the similarity columns of `result` and the z-score.
# NOTE(review): the header says the source is English, but the only corpus loaded
# in this notebook is zh-en; this cell looks left over from an earlier run — confirm.
# In every line below, .iloc[1:2,:1].values[0][0] reads the (second row, first
# column) cell of the 2x2 correlation matrix, i.e. the correlation of the two columns.
print('Source : ENGLISH   |  Reference and Translation : CHINESE')
print('Pearson correlation difference_similarity and z-score: ')
print(result[['z-score', 'difference_similarity']].corr(method='pearson').iloc[1:2,:1].values[0][0])
print('---------------------------------------------------------------------')
print('---------------------------------------------------------------------')
print('Pearson correlation source_translation_similarity and z-score: ')
print(result[['z-score', 'source_translation_similarity']].corr(method='pearson').iloc[1:2,:1].values[0][0])
print('---------------------------------------------------------------------')
print('Kendall correlation source_translation_similarity and z-score: ')
print(result[['z-score', 'source_translation_similarity']].corr(method='kendall').iloc[1:2,:1].values[0][0])

Source : ENGLISH   |  Reference and Translation : CHINESE
Pearson correlation difference_similarity and z-score: 
0.020196960018330712
---------------------------------------------------------------------
---------------------------------------------------------------------
Pearson correlation source_translation_similarity and z-score: 
0.22479069539029461
---------------------------------------------------------------------
Kendall correlation source_translation_similarity and z-score: 
0.098239157536945


In [137]:
# Correlation report between the similarity columns of `result` and the z-score.
# Each value is read from the 2x2 correlation matrix by label
# (row = similarity column, column = 'z-score'); one print call with a newline
# separator emits the same lines as ten separate prints.
print(
    'Source : CHINESE   |  Reference and Translation : ENGLISH',
    'Pearson correlation difference_similarity and z-score: ',
    result[['z-score', 'difference_similarity']]
    .corr(method='pearson')
    .loc['difference_similarity', 'z-score'],
    '---------------------------------------------------------------------',
    '---------------------------------------------------------------------',
    'Pearson correlation source_translation_similarity and z-score: ',
    result[['z-score', 'source_translation_similarity']]
    .corr(method='pearson')
    .loc['source_translation_similarity', 'z-score'],
    '---------------------------------------------------------------------',
    'Kendall correlation source_translation_similarity and z-score: ',
    result[['z-score', 'source_translation_similarity']]
    .corr(method='kendall')
    .loc['source_translation_similarity', 'z-score'],
    sep='\n',
)

Source : CHINESE   |  Reference and Translation : ENGLISH
Pearson correlation difference_similarity and z-score: 
0.09269746386076858
---------------------------------------------------------------------
---------------------------------------------------------------------
Pearson correlation source_translation_similarity and z-score: 
0.30105157682509415
---------------------------------------------------------------------
Kendall correlation source_translation_similarity and z-score: 
0.16196237654836318


In [56]:
# Correlation report between the similarity columns of `result` and the z-score.
# NOTE(review): this cell repeats the previous report verbatim, while the output
# recorded under it starts with a different header ('CHINESE-ENGLISH'); it looks
# like a stale cell from an earlier run — confirm before relying on its numbers.
# In every line below, .iloc[1:2,:1].values[0][0] reads the (second row, first
# column) cell of the 2x2 correlation matrix, i.e. the correlation of the two columns.
print('Source : CHINESE   |  Reference and Translation : ENGLISH')
print('Pearson correlation difference_similarity and z-score: ')
print(result[['z-score', 'difference_similarity']].corr(method='pearson').iloc[1:2,:1].values[0][0])
print('---------------------------------------------------------------------')
print('---------------------------------------------------------------------')
print('Pearson correlation source_translation_similarity and z-score: ')
print(result[['z-score', 'source_translation_similarity']].corr(method='pearson').iloc[1:2,:1].values[0][0])
print('---------------------------------------------------------------------')
print('Kendall correlation source_translation_similarity and z-score: ')
print(result[['z-score', 'source_translation_similarity']].corr(method='kendall').iloc[1:2,:1].values[0][0])

CHINESE-ENGLISH
Pearson correlation difference_similarity and z-score: 
0.06076862915462028
---------------------------------------------------------------------
---------------------------------------------------------------------
Pearson correlation source_translation_similarity and z-score: 
0.24995177828414075
---------------------------------------------------------------------
Kendall correlation source_translation_similarity and z-score: 
0.10516926263437275


# Regression

In [49]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity

In [141]:
# regression table: the two similarity columns (features) and the z-score (target)
data = result[['source_reference_similarity', 'source_translation_similarity', 'z-score']]

In [142]:
# target: the z-score column; features: every other column of `data`
y = data['z-score']
X = data.drop(columns=['z-score'])

# shuffled split that holds out 20% of the rows, seeded so it is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=7,
)

In [143]:
# baseline: linear regression fitted on the training split
baseline_regressor = LinearRegression()
baseline_regressor.fit(X_train, y_train)

LinearRegression()

In [144]:
# predictions of the fitted baseline for the held-out rows
y_pred = baseline_regressor.predict(X_test)

In [145]:
# score of the fitted baseline on the held-out rows (reported below as R^2)
baseline_r2_test = baseline_regressor.score(X_test, y_test)

print(f'Baseline R^2 score on test set : {baseline_r2_test}')

Baseline R^2 score on test set : 0.14687139609738986


In [None]:
y_pred

# pearsonr is not imported anywhere in the notebook, so it is imported where it is used
from scipy.stats import pearsonr

# NOTE(review): `cos_train` was never defined in the notebook, so this cell could
# not run. It is assumed to be the source-translation similarity of the training
# rows (the raw similarity baseline) — confirm this is the intended series.
cos_train = X_train['source_translation_similarity']

# Pearson correlation (and its p-value) between the training z-scores and that similarity
baseline_corr_train, baseline_corr_train_pvalue = pearsonr(y_train, cos_train)