In [13]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import re
import string
import nltk
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr, kendalltau
# import gensim.downloader as api
import jieba
from rouge import Rouge

In [None]:
# Shared English text resources: a WordNet lemmatizer, the ASCII punctuation
# characters, and the NLTK English stop-word list (both as sets).
lemma = WordNetLemmatizer()
exclude = set(string.punctuation)
stop = set(stopwords.words('english'))

# Data Importing and Sorting

In [3]:
# Every language-pair directory inside `corpus/` (a name containing '-') is
# expected to hold a `scores.csv`. Selecting those directories skips stray
# entries such as `.DS_Store` or `scores_ru-en.csv` without failing when they
# are absent; the previous `list.remove` calls raised ValueError on machines
# that do not have those files.
files = sorted(
    entry for entry in os.listdir('corpus')
    if '-' in entry and os.path.isdir(os.path.join('corpus', entry))
)
scaler = MinMaxScaler()
for file_ in files:
    source_lang, target_lang = file_.split('-')[:2]
    name = source_lang + target_lang  # a directory named 'cs-en' yields 'csen'
    scores = pd.read_csv(os.path.join('corpus', file_, 'scores.csv'))
    scores = scores.drop(columns = ['source', 'annotators', 'avg-score'])
    # Rescale the z-scores of this language pair to the range [0, 1].
    scores['z-score'] = scaler.fit_transform(scores['z-score'].values.reshape(-1,1)).ravel()
    # Later cells refer to the frames by name (csen, deen, ...), so each one is
    # published as a notebook-level variable.
    globals()[name] = scores

In [4]:
# Stack the four into-English frames. DataFrame.append was removed in
# pandas 2.0; pd.concat produces the same stacked frame in a single step and,
# like repeated append, keeps the original row labels.
english = pd.concat([csen, deen, ruen, zhen])

In [5]:
# Independent working copies of the `enfi` and `enzh` frames.
finnish, chinese = enfi.copy(), enzh.copy()

In [6]:
# Renumber the rows 0..n-1 and discard the previous index.
english = english.reset_index(drop = True)

# Cleaning the corpus (updated cleaning function)

In [41]:
def clean(text_list,
          lower = False,
          keep_numbers = False,
          keep_expression = False,
          remove_char = False,
          remove_stop = False,
          remove_tag = False,
          lemmatize = False,
          stemmer = False,
          english = True
          ):
    """
    Preprocess a sequence of strings and return the processed strings as a list.

    The enabled steps run in this fixed order for every text: lowercasing,
    digit masking, '?'/'!' masking, tag removal, non-letter removal,
    stop-word removal, lemmatization, stemming.

    :param text_list: Iterable of strings (e.g. a list or a pandas Series).
    :param lower: Lowercase the text if True.
    :param keep_numbers: If False, every digit is replaced by 'X'.
    :param keep_expression: If False, every '?' and '!' is replaced by the
        token 'EXPRESSION'.
    :param remove_char: If True, every character that is not a letter is
        replaced by a space (a-z/A-Z, plus å/ä/ö for Finnish).
    :param remove_stop: If True, drop the stop words of the selected language.
    :param remove_tag: If True, strip markup with BeautifulSoup.
    :param lemmatize: If True, apply WordNet lemmatization (English only;
        ignored when `english` is False).
    :param stemmer: If True, apply the Snowball stemmer of the selected language.
    :param english: Use English resources if True, Finnish ones otherwise.
    :return: List with one processed string per input string.
    """
    lang = 'english' if english else 'finnish'

    # NLTK resources are only built for the steps that are switched on, so the
    # corpora are not required for steps that will not run.
    stop = set(stopwords.words(lang)) if remove_stop else None
    stem = SnowballStemmer(lang) if stemmer else None
    lemma = WordNetLemmatizer() if (lemmatize and english) else None

    # Letters preserved by `remove_char`. Finnish text also uses å/ä/ö, which
    # the plain a-z class used to turn into spaces.
    letters = 'a-zA-Z' if english else 'a-zA-ZåäöÅÄÖ'
    non_letter = re.compile('[^' + letters + ']')

    updates = []
    # Iterate over the values themselves: `text_list[j]` is a label lookup on a
    # pandas Series and fails when its index is not 0..n-1.
    for text in text_list:

        #LOWERCASE TEXT
        if lower:
            text = text.lower()

        #MASK EVERY DIGIT WITH 'X' UNLESS NUMBERS ARE KEPT
        if not keep_numbers:
            # The former character class also contained '+', so plus signs
            # were replaced as well.
            text = re.sub(r"\d", 'X', text)

        #MASK '?' AND '!' WITH A TOKEN UNLESS THEY ARE KEPT
        if not keep_expression:
            # The former character class also contained '|', so pipes were
            # replaced as well.
            text = re.sub(r"[?!]", 'EXPRESSION', text)

        #REMOVE TAGS
        if remove_tag:
            # An explicit parser keeps the result independent of which
            # optional parsers happen to be installed.
            text = BeautifulSoup(text, 'html.parser').get_text()

        #REMOVE WHAT IS NOT TEXT
        if remove_char:
            text = non_letter.sub(' ', text)

        #REMOVE STOP WORDS
        if remove_stop:
            text = ' '.join([word for word in text.split(' ') if word not in stop])

        #LEMMATIZATION (English only)
        if lemma is not None:
            text = " ".join(lemma.lemmatize(word) for word in text.split())

        #STEMMER
        if stemmer:
            text = " ".join(stem.stem(word) for word in text.split())

        updates.append(text)

    return updates

def clean_ch(text_list, keep_numbers=False, remove_punctuation=False, remove_stop = False, stopwords_set='merged'):
    """
    Preprocess a sequence of Chinese strings and return them as a list.

    :param text_list: Iterable of strings (e.g. a list or a pandas Series).
    :param keep_numbers: If True, every digit is replaced by 'X'; if False the
        digits are left untouched.
        NOTE(review): this is the opposite polarity of `clean`, where digits
        are masked when `keep_numbers` is False. The behaviour is kept as it
        was because existing results were produced with it - confirm intent.
    :param remove_punctuation: If True, delete the (mostly full-width)
        punctuation characters listed in `punc`.
    :param remove_stop: If True, segment the text with jieba, drop the stop
        words and join the remaining tokens with single spaces.
    :param stopwords_set: remove words of both sets ('merged'), just the 1st
        ('fst') or just the second ('snd'). Only read when `remove_stop` is True.
    :return: List with one processed string per input string.
    :raises ValueError: If `remove_stop` is True and `stopwords_set` is not one
        of 'merged', 'fst' or 'snd'.
    """
    def _read_stopwords(path):
        # One stop word per line; the file is closed as soon as it is read.
        with open(path, 'r', encoding='utf-8') as handle:
            return [line.strip() for line in handle]

    stop = None
    if remove_stop:
        # The stop-word files are only opened when they are actually needed.
        stopword_files = {
            'merged': ['chinese_stopwords/chinese_stopwords1.txt',
                       'chinese_stopwords/chinese_stopwords2.txt'],
            'fst': ['chinese_stopwords/chinese_stopwords1.txt'],
            'snd': ['chinese_stopwords/chinese_stopwords2.txt'],
        }
        if stopwords_set not in stopword_files:
            raise ValueError("stopwords_set must be 'merged', 'fst' or 'snd', got %r" % (stopwords_set,))
        # A set gives constant-time membership tests in the token filter below.
        stop = set()
        for path in stopword_files[stopwords_set]:
            stop.update(_read_stopwords(path))

    # https://stackoverflow.com/questions/36640587/how-to-remove-chinese-punctuation-in-python
    punc = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
    punc_pattern = re.compile(r"[%s]+" %punc)

    updates = []
    for text in text_list:

        # MASK EVERY DIGIT WITH 'X' (see the note on `keep_numbers` above)
        if keep_numbers:
            # The former character class also contained '+', so plus signs
            # were replaced as well.
            text = re.sub(r"\d", 'X', text)

        # REMOVE PUNCTUATION
        if remove_punctuation:
            text = punc_pattern.sub("", text)

        # REMOVE STOP WORDS
        if remove_stop:
            text = ' '.join([word for word in jieba.cut(text) if word not in stop])

        updates.append(text)

    return updates

In [42]:
# En
# The options are unpacked with ** so that every key reaches the matching
# keyword of `clean`. Passing the dict positionally bound it to `lower` (a
# non-empty dict is truthy), so the text was lowercased and every other option
# silently kept its default.
cleaning_dict = {'lower': False, 'keep_numbers': True, 'keep_expression': False, 'remove_char': True, 'remove_stop': True, 'remove_tag': False, 'lemmatize': False, 'stemmer': True}
english_clean = pd.DataFrame()
english_clean['z-score'] = english['z-score']
for column in ['reference', 'translation']:
    english_clean[column] = clean(english[column], **cleaning_dict)
    
# Fi
finnish_clean = pd.DataFrame()
for c in ['reference', 'translation']:
    finnish_clean[c] = clean(finnish[c],
                             lower = False,
                             keep_numbers = False,
                             keep_expression = True,
                             remove_char = True,
                             remove_stop = False,
                             remove_tag = True,
                             lemmatize = False,
                             stemmer = True,
                             english=False)
finnish_clean['z-score'] = finnish['z-score']

#Ch
chinese_clean = pd.DataFrame()
for c in ['reference', 'translation']:
    chinese_clean[c] = clean_ch(chinese[c],
                                keep_numbers = False,
                                remove_punctuation = False,
                                remove_stop = True,
                                stopwords_set = 'snd')
chinese_clean['z-score'] = chinese['z-score']


In [43]:
# Display the cleaned Chinese frame built in the previous cell.
chinese_clean

Unnamed: 0,reference,translation,z-score
0,GSIS 科学家 AnthonyDelGenio 新闻稿 解释 “ GISS 模型 模拟 模...,戈达德 太空 研究所 科学家 安东尼 · 德尔 · 杰 尼奥 新闻 发布会 解释 “ 戈达德...,0.451621
1,中国 英国 女性 4x200mFreestreyWTE 最后 称为 “ 中国 14 岁 孩子...,参加 女子 4x200 米 自由泳 接力赛 决赛 中国 小将 艾衍 描述 “ 那名 14 岁...,0.246545
2,来到 2012 队友 没有 好处,2012 队友 看好,0.198549
3,去年 GoudianGroup 南非 南非 港口 出口 163 套 风力 发电 项目,去年 国电 集团 共计 163 套 风电 项目 陆续 连云港 港 出口 南非,0.216002
4,"指称 Kempinski 旅馆 "" 被捕 "" 满足 阿拉伯 客户 要求",有人 认为 凯宾斯基 酒店 简直 满足 阿拉伯 客户 要求 “ 卑躬屈膝 ”,0.391471
...,...,...,...
10216,大型 会议 皇家 Ascot - 普通 星期五 晚上 20 000,一场 大规模 赛马会 英国皇家 爱斯科 赛马会 ( Royal Ascot ) 有着...,0.688151
10217,这位 负责人 强调 钢铁 产品 国际贸易 本质 市场 行为 是从 进口国 要求 考虑 产品成...,这位 负责人 强调 钢铁 产品 国际贸易 本质 讲 市场 行为 源于 进口国 需求 消费者 ...,0.817705
10218,继续 浏览 最终 Bouverie 广场 购物中心 商店 里 尝试 四双 内衣,继续 浏览 货架 最终 位于 Folkestone Bouverie Pla...,0.711189
10219,带 女儿 参加 综艺节目 “ 爸爸 ” 成为 焦点,2004 奥运会 单人 十米 台 比赛 输给 队友 胡佳 田亮 竞技状态 出现 下滑 爆出 ...,0.222476


# Train, Dev & Test Split

In [44]:
def _train_dev_test(frame):
    """Shuffle `frame` into 80% train, 10% dev and 10% test parts using the fixed seed 7."""
    train_part, holdout = train_test_split(frame, shuffle = True, test_size = 0.2, random_state = 7)
    dev_part, test_part = train_test_split(holdout, shuffle = True, test_size = 0.5, random_state = 7)
    return train_part, dev_part, test_part


# English (raw and cleaned frames are split with the same settings)
en_train, en_dev, en_test = _train_dev_test(english)
en_train_clean, en_dev_clean, en_test_clean = _train_dev_test(english_clean)

# Finnish
fin_train, fin_dev, fin_test = _train_dev_test(finnish)
fin_train_clean, fin_dev_clean, fin_test_clean = _train_dev_test(finnish_clean)

# Chinese
ch_train, ch_dev, ch_test = _train_dev_test(chinese)
ch_train_clean, ch_dev_clean, ch_test_clean = _train_dev_test(chinese_clean)


# Not cleaned corpus

## Encoding (Word2Vec + Word Mover Distance)

In [None]:
# NOTE: `api` is `gensim.downloader`; its import is commented out in the imports
# cell at the top of this notebook and has to be enabled before this cell can run.
model = api.load('word2vec-google-news-300')

In [None]:
names = ['en_train', 'en_dev', 'en_test']
for j,df in enumerate([en_train, en_dev, en_test]):
    df.reset_index(drop = True, inplace = True)

    # wmdistance compares two documents given as lists of tokens. A raw string
    # is iterated character by character, so the sentences are split into
    # words on whitespace before they are compared.
    # Later cells read the results by name (distances_en_train, score_en_train, ...).
    globals()['distances_' + names[j]] = [
        model.wmdistance(reference.split(), translation.split())
        for reference, translation in tqdm(zip(df['reference'], df['translation']), total = len(df))
    ]
    globals()['score_' + names[j]] = np.array(df['z-score'])

## Calculating correlation

In [10]:
def corr(y_train_true, y_train_pred, y_dev_true, y_dev_pred, y_test_true, y_test_pred, return_corr = False):
    """
    Print the Pearson and Kendall Tau correlations for the train, development and test sets.

    One line is printed per set with both coefficients; each p-value is
    reported only as whether it is below 0.001.

    :param y_train_true, y_dev_true, y_test_true: Reference scores per set.
    :param y_train_pred, y_dev_pred, y_test_pred: Metric values per set.
    :param return_corr: If True, return the Pearson coefficient of the development set.
    :return: The development-set Pearson coefficient if `return_corr`, otherwise None.
    """
    def _pearson_and_ktau(y_true, y_pred):
        # Both coefficients together with their p-values for one data set.
        r, r_pvalue = pearsonr(y_true, y_pred)
        tau, tau_pvalue = kendalltau(y_true, y_pred)
        return r, r_pvalue, tau, tau_pvalue

    r_train, r_train_p, tau_train, tau_train_p = _pearson_and_ktau(y_train_true, y_train_pred)
    r_dev, r_dev_p, tau_dev, tau_dev_p = _pearson_and_ktau(y_dev_true, y_dev_pred)
    r_test, r_test_p, tau_test, tau_test_p = _pearson_and_ktau(y_test_true, y_test_pred)

    print(f'Pearson correlation between cosine similarity and score on train set: {r_train} (p-value < 0.001: {r_train_p < 0.001}); and Kendall Tau: {tau_train} (p-value < 0.001: {tau_train_p < 0.001})')
    print(f'Pearson correlation between cosine similarity and score on development set: {r_dev} (p-value < 0.001: {r_dev_p < 0.001}); and Kendall Tau: {tau_dev} (p-value < 0.001: {tau_dev_p < 0.001})')
    print(f'Pearson correlation between cosine similarity and score on test set: {r_test} (p-value < 0.001: {r_test_p < 0.001}); and Kendall Tau: {tau_test} (p-value < 0.001: {tau_test_p < 0.001})')

    return r_dev if return_corr else None

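**Open question:** how should infinite (`np.inf`) distances be treated? For now, the next cell replaces them with the placeholder value 10.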

In [None]:
# Overwrite every infinite distance with the placeholder value 10, editing
# each of the three lists in place.
for distances in (distances_en_train, distances_en_dev, distances_en_test):
    for position, value in enumerate(distances):
        if value == np.inf:
            distances[position] = 10

In [None]:
# Correlation between the human scores and the distances on the raw English splits.
corr(
    y_train_true = score_en_train, y_train_pred = distances_en_train,
    y_dev_true = score_en_dev, y_dev_pred = distances_en_dev,
    y_test_true = score_en_test, y_test_pred = distances_en_test,
)

# Cleaned corpus

## Encoding (Word2Vec + Word Mover Distance)

In [None]:
# model = api.load('word2vec-google-news-300')
names = ['en_train_cleaned', 'en_dev_cleaned', 'en_test_cleaned']
# The cleaned splits are created under the names en_*_clean; the en_*_cleaned
# frames referenced here before were never defined (NameError).
for j,df in enumerate([en_train_clean, en_dev_clean, en_test_clean]):
    df.reset_index(drop = True, inplace = True)

    # wmdistance compares two documents given as lists of tokens. A raw string
    # is iterated character by character, so the sentences are split into
    # words on whitespace before they are compared.
    # Later cells read the results by name (distances_en_train_cleaned, ...).
    globals()['distances_' + names[j]] = [
        model.wmdistance(reference.split(), translation.split())
        for reference, translation in tqdm(zip(df['reference'], df['translation']), total = len(df))
    ]
    globals()['score_' + names[j]] = np.array(df['z-score'])

## Calculating correlation

In [None]:
# Overwrite every infinite distance with the placeholder value 10, editing
# each of the three lists in place.
for distances in (distances_en_train_cleaned, distances_en_dev_cleaned, distances_en_test_cleaned):
    for position, value in enumerate(distances):
        if value == np.inf:
            distances[position] = 10

In [None]:
# Correlation between the human scores and the distances on the cleaned English splits.
corr(
    y_train_true = score_en_train_cleaned, y_train_pred = distances_en_train_cleaned,
    y_dev_true = score_en_dev_cleaned, y_dev_pred = distances_en_dev_cleaned,
    y_test_true = score_en_test_cleaned, y_test_pred = distances_en_test_cleaned,
)

In [61]:
# Working copy of the cleaned Chinese development split, renumbered from 0.
train = ch_dev_clean.copy().reset_index(drop = True)

# ROUGE

In [63]:
# ROUGE variants that are evaluated below.
metrics = [
    'rouge-1',
    'rouge-2',
    'rouge-l',
]

In [64]:
def rouge_score(df,metric):
    """
    Score every translation in `df` against its reference with ROUGE.

    :param df: DataFrame with the columns 'reference' and 'translation'; a
        'z-score' column is optional.
    :param metric: Key of the ROUGE variant to extract from each result,
        e.g. 'rouge-1', 'rouge-2' or 'rouge-l'.
    :return: DataFrame with a default 0..n-1 index and the columns 'F1',
        'Precision' and 'Recall', plus 'Z-score' (copied row by row from
        df['z-score']) when `df` has that column.
    """
    rouge = Rouge()
    references = df['reference'].to_list()
    translations = df['translation'].to_list()
    scores = rouge.get_scores(translations, references)

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
    # appending row by row copied the whole frame on every iteration.
    rougedf = pd.DataFrame([score[metric] for score in scores], columns=['f', 'p', 'r'])
    rougedf = rougedf.rename(columns={'f':'F1','p':'Precision','r':'Recall'})

    # The human scores come from the frame that was passed in. They used to be
    # read from the notebook-level `train`, which attached unrelated scores
    # whenever another frame (e.g. the test set) was scored.
    if 'z-score' in df.columns:
        rougedf['Z-score'] = df['z-score'].to_numpy()

    return rougedf

In [65]:
# For every ROUGE variant, report how F1, Precision and Recall correlate with
# the human score. The three measures share one inner loop instead of three
# copies of the same statements, and the discarded `rougedf.corr(...)` call
# (its result was never used or shown) is gone.
for metric in metrics:
    rougedf = rouge_score(train, metric)
    score = rougedf['Z-score']

    print('Correlation scores for',metric,'metric \n')
    for measure in ['F1', 'Precision', 'Recall']:
        rougescore_corr, rougescore_corr_pvalue = pearsonr(score, rougedf[measure])
        rougescore_corr_ktau, rougescore_corr_ktau_pvalue = kendalltau(score, rougedf[measure])
        print(measure)
        print(f'Pearson correlation between RougeScore and score on development set: {rougescore_corr} (p-value < 0.001: {rougescore_corr_pvalue < 0.001}); and Kendall Tau correlation: {rougescore_corr_ktau} (p-value < 0.001: {rougescore_corr_ktau_pvalue < 0.001})')
    print('\n')

Correlation scores for rouge-1 metric 

F1
Pearson correlation between RougeScore and score on development set: 0.4560311953072406 (p-value < 0.001: True); and Kendall Tau correlation: 0.31476099352769943 (p-value < 0.001: True)
Precision
Pearson correlation between RougeScore and score on development set: 0.45844589837035377 (p-value < 0.001: True); and Kendall Tau correlation: 0.32373343209892613 (p-value < 0.001: True)
Recall
Pearson correlation between RougeScore and score on development set: 0.4200236411485582 (p-value < 0.001: True); and Kendall Tau correlation: 0.28901528534576454 (p-value < 0.001: True)


Correlation scores for rouge-2 metric 

F1
Pearson correlation between RougeScore and score on development set: 0.38529887403362006 (p-value < 0.001: True); and Kendall Tau correlation: 0.2800140323240911 (p-value < 0.001: True)
Precision
Pearson correlation between RougeScore and score on development set: 0.3851102389522171 (p-value < 0.001: True); and Kendall Tau correlation

In [54]:
# ROUGE-L scores for the working frame `train`.
rougedf = rouge_score(train, metric = 'rouge-l')

In [81]:
# F1 column of the ROUGE-L scores, computed from the two text columns only.
text_columns = train[['reference', 'translation']]
f1_rouge = rouge_score(text_columns, metric = 'rouge-l')['F1']

In [55]:
# Display the ROUGE score frame currently bound to `rougedf`.
rougedf

Unnamed: 0,F1,Precision,Recall,Z-score
0,0.444444,0.461538,0.428571,0.752975
1,0.473684,0.409091,0.562500,0.207275
2,0.590909,0.565217,0.619048,0.722492
3,0.375000,0.428571,0.333333,0.837772
4,0.315789,0.285714,0.352941,0.487816
...,...,...,...,...
1017,0.333333,0.250000,0.500000,0.595979
1018,0.454545,0.454545,0.454545,0.393515
1019,0.260870,0.272727,0.250000,0.648436
1020,0.551724,0.571429,0.533333,0.754224


In [66]:
def corr(y_train_true, y_train_pred, return_corr = False):
    """
    Print the Pearson and Kendall Tau correlation between two score sequences.

    NOTE: once this cell has run, this two-sequence definition replaces the
    earlier `corr` of this notebook that takes train, dev and test sequences.

    :param y_train_true: First sequence of scores.
    :param y_train_pred: Second sequence of scores, same length as the first.
    :param return_corr: If True, return the two coefficients.
    :return: [pearson, kendall_tau] if `return_corr`, otherwise None.
    """
    cleaned_corr_train, cleaned_corr_train_pvalue = pearsonr(y_train_true, y_train_pred)
    cleaned_corr_ktau_train, cleaned_corr_ktau_train_pvalue = kendalltau(y_train_true, y_train_pred)

    print(f'Train - Pearson: {cleaned_corr_train}; Kendall Tau: {cleaned_corr_ktau_train}')

    if return_corr:
        # Only one data set is evaluated here. The development-set values that
        # the return statement used to list were never computed, so asking for
        # the result raised a NameError.
        return [cleaned_corr_train, cleaned_corr_ktau_train]

In [None]:
# F1 correlations for the ROUGE-L frame `rougedf`. The inputs are read from
# `rougedf` itself and the metric label is written out, so this cell does not
# depend on the `score` and `metric` variables left behind by the per-metric loop.
rougescore_corr, rougescore_corr_pvalue = pearsonr(rougedf['Z-score'], rougedf['F1'])
rougescore_corr_ktau, rougescore_corr_ktau_pvalue = kendalltau(rougedf['Z-score'], rougedf['F1'])
print('Correlation scores for', 'rouge-l', 'metric \n')
print('F1')
print(f'Pearson correlation between RougeScore and score on development set: {rougescore_corr} (p-value < 0.001: {rougescore_corr_pvalue < 0.001}); and Kendall Tau correlation: {rougescore_corr_ktau} (p-value < 0.001: {rougescore_corr_ktau_pvalue < 0.001})')

In [82]:
# Correlation between the ROUGE-L F1 values and the scores stored in `rougedf`.
corr(y_train_true = f1_rouge, y_train_pred = rougedf['Z-score'])

Train - Pearson: 0.4766929600222844; Kendall Tau: 0.3322986530067027


# Test set prediction

In [184]:
# Load the en-zh test set and preview its first rows.
test_scores_path = 'testset/en-zh/scores.csv'
testCh = pd.read_csv(test_scores_path)
testCh.head()

Unnamed: 0,source,reference,translation
0,The future and the destinies of the citizens o...,世界上每个国家公民的未来和命运日益联系在一起。,世界各国人民前途命运越来越紧密地联系在一起。
1,"After all that hard work, the finished result ...",经过那么多的努力，最终的结果现在已经可以揭晓了。,经过这么艰辛的工作，最终的结果现在才得以公布。
2,Author: researcher of Suning Institute of Fina...,作者：苏宁金融研究所研究员，财经专栏作家，财经评论员。,作者：苏宁金融研究院特约研究员，财经专栏作家，财经评论员。
3,“The Great Wall” tells the story of a Chinese ...,《长城》讲述了古代一支中国精锐部队在世界著名的中国长城上与怪物桃蒂英勇作战的故事。,《长城》讲述了在古代，一支中国精英部队为保卫人类，在举世闻名的长城上与怪兽饕餮进行生死决战的故事。
4,Our comrades from the Political Bureau should ...,政治局同志要学习历史，讲道理，不能混淆公、私利益，叫白黑，模糊义与利的界限，处理基于裙带关系...,中央政治局的同志都应该明史知理，不能颠倒了公私、混淆了是非、模糊了义利、放纵了亲情，要带头树...


In [185]:
#Ch
# Same clean_ch options as for the Chinese training corpus.
ch_options = dict(keep_numbers = False,
                  remove_punctuation = False,
                  remove_stop = True,
                  stopwords_set = 'snd')
test_ch = pd.DataFrame({c: clean_ch(testCh[c], **ch_options) for c in ['reference', 'translation']})
# Cells that are empty or whitespace-only become the placeholder 'stop'.
test_ch = test_ch.replace(r'^\s*$', 'stop', regex=True)
# The reference of row 18438 gets the same placeholder.
# NOTE(review): why this row needs it is not visible here - confirm.
test_ch.loc[18438, 'reference'] = 'stop'
test_ch

Unnamed: 0,reference,translation
0,世界 每个 国家 公民 未来 命运 日益 联系 一起,世界 各国 人民 前途 命运 越来越 紧密 联系 一起
1,努力 最终 现在 已经 揭晓,艰辛 工作 最终 现在 得以 公布
2,作者 苏宁 金融 研究所 研究员 财经 专栏作家 财经 评论员,作者 苏宁 金融 研究院 特约 研究员 财经 专栏作家 财经 评论员
3,长城 讲述 古代 一支 中国 精锐部队 世界 著名 中国 长城 怪物 桃蒂 英勇作战 故事,长城 讲述 古代 一支 中国 精英 部队 保卫 人类 举世闻名 长城 怪兽 饕餮 进行 生死...
4,政治局 同志 学习 历史 讲道理 不能 混淆 公 私 利益 叫白 黑 模糊 义与利 界限 处...,中央政治局 同志 应该 明史 知理 不能 颠倒 公私 混淆 是非 模糊 义利 放纵 亲情 要...
...,...,...
22123,"这为 年复一年 保持良好 丰收 打下 坚实 基础 , 不断 增加 农民收入 , 农村...",实现 农业 连年丰收 农民 持续 增收 农村 经济社会 健康 发展 提供 强有力 基础 支撑
22124,"CCTV - 2 美国 国家 经济 战略 研究院 编写 一堆 干货 , 这份 中国 ...",央视 财经频道 联合 中国社科院 财经 战略 研究院 梳理 一份 干货 这份 沉甸甸 中国 ...
22125,"肯塔基州 , 贝文 ( R ) 得到 美国 公民自由 联盟 使用 Faceb...",肯塔基州 州长 马特 · 贝文 ( R ) 脸书 推特 使用 美国 公民自由 联盟 光顾
22126,"新 锦屏 总书记 党中央 村民 关注 ,",老百姓 传 达到 把习 总书记 党中央 关怀 关心 传 达到 觉得


In [187]:
# Store the ROUGE-L F1 of every test pair in the 'metric' column.
test_rouge_l = rouge_score(test_ch, metric = 'rouge-l')
test_ch['metric'] = test_rouge_l['F1']

In [189]:
# Write the cleaned test pairs together with their metric values to disk.
test_ch.to_csv(path_or_buf = 'test_scores/en-zh_metric_scores.csv')

In [183]:
# for i in range(len(test_ch['reference'].tolist())):
#     if len(str(test_ch.loc[i, 'reference']))<2:
#         print('--->', i)
#         print(test_ch.loc[i])
#         print(Rouge().get_scores(test_ch.loc[i, 'reference'], test_ch.loc[i, 'translation']))