In [1]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import re
import string
import nltk
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr, kendalltau
import gensim.downloader as api

ModuleNotFoundError: No module named 'gensim'

In [58]:
!pip install jieba

Collecting jieba
  Downloading jieba-0.42.1.tar.gz (19.2 MB)
Building wheels for collected packages: jieba
  Building wheel for jieba (setup.py): started
  Building wheel for jieba (setup.py): finished with status 'done'
  Created wheel for jieba: filename=jieba-0.42.1-py3-none-any.whl size=19314482 sha256=8e27de8805154b8b927a53612e7556c4554f0591c0415f37efb6175230824201
  Stored in directory: c:\users\luisl\appdata\local\pip\cache\wheels\ca\38\d8\dfdfe73bec1d12026b30cb7ce8da650f3f0ea2cf155ea018ae
Successfully built jieba
Installing collected packages: jieba
Successfully installed jieba-0.42.1


In [60]:
import jieba

In [28]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

# Data Importing and Sorting

In [29]:
files = os.listdir('corpus')
files.remove('.DS_Store')
files.remove('scores_ru-en.csv')
scaler = MinMaxScaler()
for file_ in files:
  name = file_.split('-')[0] + file_.split('-')[1]
  vars()[name] = pd.read_csv(os.path.join('corpus', file_, 'scores.csv'))
  vars()[name].drop(columns = ['source', 'annotators', 'avg-score'], inplace = True)
  vars()[name]['z-score'] = scaler.fit_transform(vars()[name]['z-score'].values.reshape(-1,1)) #normalizing values betwewen 0 and 1

In [30]:
english = csen.copy()
for df in [deen, ruen, zhen]:
  english = english.append(df)

In [31]:
finnish = enfi.copy()
chinese = enzh.copy()

In [32]:
english.reset_index(drop = True, inplace = True)

# Cleaning the corpus (updated  cleaning function)

In [33]:
def clean(text_list,
          lower = False,
          keep_numbers = False,
          keep_expression = False,
          remove_char = False,
          remove_stop = False,
          remove_tag = False,
          lemmatize = False,
          stemmer = False,
          english = False
          ):
    """
    Function that a receives a list of strings and preprocesses it.
    
    :param text_list: List of strings.
    :param lemmatize: Tag to apply lemmatization if True.
    :param stemmer: Tag to apply the stemmer if True.
    """
    if english:
        lang = 'english'
    else:
        lang = 'finnish'
    
    stop = set(stopwords.words(lang))
    stem = SnowballStemmer(lang)
    
    updates = []
    for j in range(len(text_list)):
        
        text = text_list[j]
        
        #LOWERCASE TEXT
        if lower:
            text = text.lower()
            
        #KEEP NUMBERS AS TOKENS
        if not keep_numbers:
            text = re.sub("[\d+]", 'X', text)
        
        #KEEP '?' and '!' AS TOKENS
        if not keep_expression:
            text = re.sub("[\?|\!]", 'EXPRESSION', text)
            
        #REMOVE TAGS
        if remove_tag:
            text = BeautifulSoup(text).get_text()
            
        #REMOVE THAT IS NOT TEXT
        if remove_char:
            text = re.sub("[^a-zA-Z]", ' ', text)
        
        #REMOVE STOP WORDS
        if remove_stop:
            text = ' '.join([word for word in text.split(' ') if word not in stop])
        
        #LEMMATIZATION
        if lemmatize:
            if english:
                lemma = WordNetLemmatizer()
                text = " ".join(lemma.lemmatize(word) for word in text.split())
        
        #STEMMER
        if stemmer:
            text = " ".join(stem.stem(word) for word in text.split())
        
        updates.append(text)
        
    return updates

def clean_ch(text_list, keep_numbers=False, remove_punctuation=False, remove_stop = False, stopwords_set='merged'):
    """
    Function that removes chinese stopwords
    
    :param stopwords_set: remove words of both sets (merged), just the 1st (fst) or just the second (snd) 
    """
    updates = []
    
    zh_stopwords1 = [line.strip() for line in open('chinese_stopwords/chinese_stopwords1.txt', 'r', encoding='utf-8').readlines()]
    zh_stopwords2 = [line.strip() for line in open('chinese_stopwords/chinese_stopwords2.txt', 'r', encoding='utf-8').readlines()]
    
    if stopwords_set == 'merged':
        stop = list(set(zh_stopwords1 + zh_stopwords2))
    elif stopwords_set == 'fst':
        stop = zh_stopwords1
    elif stopwords_set == 'snd':
        stop = zh_stopwords2

    for j in range(len(text_list)):
        
        text = text_list[j]
        
        #KEEP NUMBERS AS TOKENS
        if keep_numbers:
            text = re.sub("[\d+]", 'X', text)
        
        # REMOVE PUNCTUATION
        if remove_punctuation:
            # https://stackoverflow.com/questions/36640587/how-to-remove-chinese-punctuation-in-python
            punc = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
            text = re.sub(r"[%s]+" %punc, "", text)
        
        # REMOVE STOP WORDS
        if remove_stop:
            pretext = text
            text = ' '.join([word for word in jieba.cut(text) if word not in stop])
            
        updates.append(text)
        
    return updates

In [61]:
# En
cleaning_dict = {'lower': False, 'keep_numbers': True, 'keep_expression': False, 'remove_char': True, 'remove_stop': True, 'remove_tag': False, 'lemmatize': False, 'stemmer': True}
english_clean = pd.DataFrame()
english_clean['z-score'] = english['z-score']
for column in ['reference', 'translation']:
    english_clean[column] = clean(english[column], cleaning_dict)
    
# Fi
finnish_clean = pd.DataFrame()
for c in ['reference', 'translation']:
    finnish_clean[c] = clean(finnish[c],
                             lower = False,
                             keep_numbers = False,
                             keep_expression = True,
                             remove_char = True,
                             remove_stop = False,
                             remove_tag = True,
                             lemmatize = False,
                             stemmer = True,
                             english=False)
finnish_clean['z-score'] = finnish['z-score']

#Ch
chinese_clean = pd.DataFrame()
for c in ['reference', 'translation']:
    chinese_clean[c] = clean_ch(chinese[c],
                                keep_numbers = False,
                                remove_punctuation = False,
                                remove_stop = True,
                                stopwords_set = 'snd')
chinese_clean['z-score'] = chinese['z-score']


Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Temp\jieba.cache
Loading model cost 0.658 seconds.
Prefix dict has been built successfully.


In [9]:
# cleaning_dict = {'lower': False, 'keep_numbers': True, 'keep_expression': False, 'remove_char': True, 'remove_stop': True, 'remove_tag': False, 'lemmatize': False, 'stemmer': True}
# english_cleaned = pd.DataFrame()
# english_cleaned['z-score'] = english['z-score']
# for column in ['reference', 'translation']:
#     english_cleaned[column] = clean(english[column], cleaning_dict)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=77688.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=77688.0), HTML(value='')))




In [31]:
# cleaning_dict = {'lower': False, 'keep_numbers': True, 'keep_expression': False, 'remove_char': True, 'remove_stop': True, 'remove_tag': False, 'lemmatize': False, 'stemmer': True}
# finnish_cleaned = pd.DataFrame()
# finnish_cleaned['z-score'] = finnish['z-score']
# for column in ['reference', 'translation']:
#     finnish_cleaned[column] = clean(finnish[column], cleaning_dict)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6748.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6748.0), HTML(value='')))




In [8]:
# cleaning_dict = {'lower': False, 'keep_numbers': True, 'keep_expression': False, 'remove_char': True, 'remove_stop': True, 'remove_tag': False, 'lemmatize': False, 'stemmer': True}
# chinese_cleaned = pd.DataFrame()
# chinese_cleaned['z-score'] = chinese['z-score']
# for column in ['reference', 'translation']:
#     chinese_cleaned[column] = clean(chinese[column], cleaning_dict)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10221.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10221.0), HTML(value='')))




# Train, Dev & Test Split

In [62]:
# English
en_train, en_dev = train_test_split(english, shuffle = True, test_size = 0.2, random_state = 7)
en_dev, en_test = train_test_split(en_dev, shuffle = True, test_size = 0.5, random_state = 7)

en_train_clean, en_dev_clean = train_test_split(english_clean, shuffle = True, test_size = 0.2, random_state = 7)
en_dev_clean, en_test_clean = train_test_split(en_dev_clean, shuffle = True, test_size = 0.5, random_state = 7)


# Finnish
fin_train, fin_dev = train_test_split(finnish, shuffle = True, test_size = 0.2, random_state = 7)
fin_dev, fin_test = train_test_split(fin_dev, shuffle = True, test_size = 0.5, random_state = 7)

fin_train_clean, fin_dev_clean = train_test_split(finnish_clean, shuffle = True, test_size = 0.2, random_state = 7)
fin_dev_clean, fin_test_clean = train_test_split(fin_dev_clean, shuffle = True, test_size = 0.5, random_state = 7)

#Chinese
ch_train, ch_dev = train_test_split(chinese, shuffle = True, test_size = 0.2, random_state = 7)
ch_dev, ch_test = train_test_split(ch_dev, shuffle = True, test_size = 0.5, random_state = 7)

ch_train_clean, ch_dev_clean = train_test_split(chinese_clean, shuffle = True, test_size = 0.2, random_state = 7)
ch_dev_clean, ch_test_clean = train_test_split(ch_dev_clean, shuffle = True, test_size = 0.5, random_state = 7)


# Not cleaned corpus

## Encoding (Word2Vec + Word Mover Distance)

In [21]:
model = api.load('word2vec-google-news-300')

In [22]:
names = ['en_train', 'en_dev', 'en_test']
for j,df in enumerate([en_train, en_dev, en_test]):
    name = 'distances_' + names[j]
    vars()[name] = []
    df.reset_index(drop = True, inplace = True)
    for i in tqdm(range(len(df))):
                  
        vars()[name].append(model.wmdistance(df['reference'][i], df['translation'][i]))
    
    name2 = 'score_' + names[j]
    vars()[name2] = np.array(df['z-score'])

  0%|          | 0/62150 [00:00<?, ?it/s]

  0%|          | 0/7769 [00:00<?, ?it/s]

  0%|          | 0/7769 [00:00<?, ?it/s]

## Calculating correlation

In [23]:
def corr(y_train_true, y_train_pred, y_dev_true, y_dev_pred, y_test_true, y_test_pred, return_corr = False):
    
    cleaned_corr_train, cleaned_corr_train_pvalue = pearsonr(y_train_true, y_train_pred)
    cleaned_corr_ktau_train, cleaned_corr_ktau_train_pvalue = kendalltau(y_train_true, y_train_pred)
    
    cleaned_corr_dev, cleaned_corr_dev_pvalue = pearsonr(y_dev_true, y_dev_pred)
    cleaned_corr_ktau_dev, cleaned_corr_ktau_dev_pvalue = kendalltau(y_dev_true, y_dev_pred)


    cleaned_corr_ktau_test, cleaned_corr_ktau_test_pvalue = kendalltau(y_test_true, y_test_pred)
    cleaned_corr_test, cleaned_corr_test_pvalue = pearsonr(y_test_true, y_test_pred)
        
    print(f'Pearson correlation between cosine similarity and score on train set: {cleaned_corr_train} (p-value < 0.001: {cleaned_corr_train_pvalue < 0.001}); and Kendall Tau: {cleaned_corr_ktau_train} (p-value < 0.001: {cleaned_corr_ktau_train_pvalue < 0.001})')
    print(f'Pearson correlation between cosine similarity and score on development set: {cleaned_corr_dev} (p-value < 0.001: {cleaned_corr_dev_pvalue < 0.001}); and Kendall Tau: {cleaned_corr_ktau_dev} (p-value < 0.001: {cleaned_corr_ktau_dev_pvalue < 0.001})')
    print(f'Pearson correlation between cosine similarity and score on test set: {cleaned_corr_test} (p-value < 0.001: {cleaned_corr_test_pvalue < 0.001}); and Kendall Tau: {cleaned_corr_ktau_test} (p-value < 0.001: {cleaned_corr_ktau_test_pvalue < 0.001})')
    
    if return_corr:
        return cleaned_corr_dev

How to treat np.inf???

In [24]:
while np.inf in distances_en_train:
    distances_en_train[distances_en_train.index(np.inf)] = 10

while np.inf in distances_en_dev:
    distances_en_dev[distances_en_dev.index(np.inf)] = 10
    
while np.inf in distances_en_test:
    distances_en_test[distances_en_test.index(np.inf)] = 10

In [25]:
corr(score_en_train, distances_en_train, score_en_dev, distances_en_dev, score_en_test, distances_en_test)

Pearson correlation between cosine similarity and score on train set: -0.2010915969716781 (p-value < 0.001: True); and Kendall Tau: -0.1847216372580606 (p-value < 0.001: True)
Pearson correlation between cosine similarity and score on development set: -0.26042741904000105 (p-value < 0.001: True); and Kendall Tau: -0.1772273081097246 (p-value < 0.001: True)
Pearson correlation between cosine similarity and score on test set: -0.2731128169278083 (p-value < 0.001: True); and Kendall Tau: -0.18631107517421996 (p-value < 0.001: True)


# Cleaned corpus

## Encoding (Word2Vec + Word Mover Distance)

In [26]:
# model = api.load('word2vec-google-news-300')
names = ['en_train_cleaned', 'en_dev_cleaned', 'en_test_cleaned']
for j,df in enumerate([en_train_cleaned, en_dev_cleaned, en_test_cleaned]):
    name = 'distances_' + names[j]
    vars()[name] = []
    df.reset_index(drop = True, inplace = True)
    for i in tqdm(range(len(df))):
                  
        vars()[name].append(model.wmdistance(df['reference'][i], df['translation'][i]))
    
    name2 = 'score_' + names[j]
    vars()[name2] = np.array(df['z-score'])

  0%|          | 0/62150 [00:00<?, ?it/s]

  0%|          | 0/7769 [00:00<?, ?it/s]

  0%|          | 0/7769 [00:00<?, ?it/s]

## Calculating correlation

In [27]:
while np.inf in distances_en_train_cleaned:
    distances_en_train_cleaned[distances_en_train_cleaned.index(np.inf)] = 10

while np.inf in distances_en_dev_cleaned:
    distances_en_dev_cleaned[distances_en_dev_cleaned.index(np.inf)] = 10
    
while np.inf in distances_en_test_cleaned:
    distances_en_test_cleaned[distances_en_test_cleaned.index(np.inf)] = 10

In [28]:
corr(score_en_train_cleaned, distances_en_train_cleaned, score_en_dev_cleaned, distances_en_dev_cleaned, score_en_test_cleaned, distances_en_test_cleaned)

Pearson correlation between cosine similarity and score on train set: -0.1955674998038151 (p-value < 0.001: True); and Kendall Tau: -0.17732545076194817 (p-value < 0.001: True)
Pearson correlation between cosine similarity and score on development set: -0.2634434105536765 (p-value < 0.001: True); and Kendall Tau: -0.17342002949899588 (p-value < 0.001: True)
Pearson correlation between cosine similarity and score on test set: -0.2731052532305561 (p-value < 0.001: True); and Kendall Tau: -0.18236889501446313 (p-value < 0.001: True)


In [23]:
chinese

Unnamed: 0,reference,translation,z-score
0,GSIS的科学家AnthonyDelGenio在新闻稿中解释说：“在GISS模型的模拟模型中...,戈达德太空研究所科学家安东尼·德尔·杰尼奥在新闻发布会上解释说：“在戈达德太空研究所的模型模...,0.451621
1,中国在英国女性4x200mFreestreyWTE中的最后被称为：“中国14岁的孩子从球下降...,参加女子4x200米自由泳接力赛决赛的中国小将艾衍含被这样描述：“那名14岁的中国小姑娘犯了...,0.246545
2,然后来到2012年，当她和她的队友们没有什么好处。,2012年，她和她的队友都不被看好。,0.198549
3,自去年以来，GoudianGroup从南非通过南非港口出口了163套风力发电项目。,自去年以来，国电集团共计有163套风电项目陆续从连云港港出口南非。,0.216002
4,"一些人指称，Kempinski旅馆只是""被捕""，以满足阿拉伯客户的要求。",有人认为凯宾斯基酒店简直是为了满足阿拉伯客户的要求而“卑躬屈膝”。,0.391471
...,...,...,...
10216,这不是一个大型的会议，它不是皇家 Ascot -它是一个普通的星期五晚上，有20，000人。,这不是一场大规模的赛马会，也不是英国皇家爱斯科赛马会 (Royal Ascot)，这只是一个...,0.688151
10217,这位负责人强调，钢铁产品的国际贸易本质上是一种市场行为，是从进口国的要求和考虑产品成本和其他...,这位负责人强调，钢铁产品国际贸易本质上讲是市场行为，是源于进口国需求、消费者综合考虑产品性价...,0.817705
10218,她继续浏览，并最终在 Bouverie 广场购物中心的商店里尝试了四双内衣。,她继续浏览货架，并最终在位于 Folkestone 的 Bouverie Place 购物中...,0.711189
10219,但是当他带着女儿参加综艺节目“爸爸，我们去哪里？”时，他成为了一个焦点。,但在2004年奥运会单人十米台比赛中输给队友胡佳后，田亮的竞技状态出现下滑，并被爆出被跳水队开除。,0.222476


In [43]:
fin_train_clean

Unnamed: 0,reference,translation,z-score
100,helsing s ti n haltij hank perusajatus on suoj...,helsink foundation haltij projekt perusajatuks...,0.683731
2462,tapaus synnyt melkois vastauks ihmist kan verk...,v likohtaus tuot melkois vastareaktio verko ku...,0.387462
5630,lain muutos helpot ihmist pelottelu pelottelu ...,muuta lak niin et on helpomp syyt ihm pelottel...,0.426999
4299,mal tarkastelt my s venuks varhais venuks tuop...,mal my s tarkastel varhais venuks pinnanmuodostu,0.279282
6322,kuite mielipidetutkimuks osoittav et on pahoin...,gallup kuite kertov et hallituspuolue perussuo...,0.183373
...,...,...,...
5699,lick lick j ti valtuustotalo x xx pun vahingo ...,yk riperh aiheut vuokratalo x xx pun vahingo s...,0.317478
2550,matkustaj voi valitettav lemmikkikoir pukeutun...,matkustaj eiv t valitettav voi tul silit m n t...,0.295974
537,oli kerm puf tapahtum,tapahtui tuulihattuv likohtaus,0.412813
1220,tiete muka kaik n m asia voiva tuoda sinu vuos...,tiete muka kaik n m asia saattav tuoda lis eli...,0.900195


In [42]:
from rouge import Rouge

In [71]:
train = ch_dev_clean.copy()

In [72]:
train

Unnamed: 0,reference,translation,z-score
3505,8 12 早晨 国务院 问讯 台 2016 7 举行 国民经济 操作 一次 新闻 招待会,8 12 上午 国务院新闻办 2016 7 月份 国民经济 运行 情况 举行 新闻 发布会,0.752975
6558,"猎人 狼 : 美国 男子 射箭 队 成员 布雷 迪 埃里森 , 左 , 比作 莱...",猎人 狼 美国 男子 射箭 队员 布莱迪 · 埃里森 ( Brady Ellison...,0.207275
9010,参议院 多数党 领导人 麦克 康奈尔 Mitch McConnell 表示 一任 总统 ...,参议院 多数党 领袖 米奇 · 麦康奈尔 ( Mitch McConnell ) ...,0.722492
3077,2016 可能 柏林 新 德国 首都 消灭 以色列 根除,以色列 居然 2016 德国 首都 柏林 消失,0.837772
5075,朱利叶 斯 ? 尼尔森 Julius Nielsen 研究 鲨鱼 ...,一直 研究 鲨鱼 Julius Nielsen 资料 记载 一头 雌性 小头 睡...,0.487816
...,...,...,...
9923,带走 重要,对手 影响 非常 重要,0.595979
3546,三重奏 幸运 2014 治疗 过时 保持 中断 最低 情况 正确 表示,幸运 2014 年初 治疗 影响 降到 最低 这种 情况 可能 合适,0.393515
9224,邓小飞 延误 收到 黄牌 罚球区 前面 犯规 替换 彭 新立 拿到 一张 黄牌,邓小飞 判定 拖延时间 领到 黄牌 替补 出场 彭欣力 禁区 前 犯规 领到 黄牌,0.648436
9839,黑 煤田 仓库 变成 “ 绿草如茵 ” 足球场 红砖 尖石 厂 变成 一座 现代化 体育 博物馆,黑乎乎 煤场 仓库 变成 “ 绿草如茵 ” 足球场 红砖 尖顶 厂房 变成 现代化 运动 馆,0.754224


In [73]:
if len(train[train['translation']== '']) > 0:
    trans_list = train[train['translation']== ''].index.to_list()
    train=train.drop(trans_list,axis=0)
    
if len(train[train['reference']== '']) > 0:
    ref_list = train[train['reference']== ''].index.to_list()
    train=train.drop(ref_list,axis=0)
    
     
    
train=train.reset_index(drop = True)

In [74]:
metrics = ['rouge-1','rouge-2','rouge-l']

In [75]:
def rouge_score(df,metric):
    
    
    rouge = Rouge()
    references =df['reference'].to_list()
    translation =df['translation'].to_list()
    scores=rouge.get_scores(translation, references)
    rougedf = pd.DataFrame()

    for score in scores:
        new_row = score[metric]
        rougedf = rougedf.append(new_row, ignore_index=True)
    rougedf.rename(columns={'f':'F1','p':'Precision','r':'Recall'},inplace=True)
    rougedf['Z-score']=train.iloc[:,-1]
    
    
        
    return(rougedf)

In [76]:
for metric in metrics:
    rougedf=rouge_score(train,metric)
    rougedf.corr(method='pearson')
    score = rougedf['Z-score']
    
    rougescore_corr, rougescore_corr_pvalue = pearsonr(score,rougedf['F1'])
    rougescore_corr_ktau, rougescore_corr_ktau_pvalue = kendalltau(score, rougedf['F1'])
    print('Correlation scores for',metric,'metric \n')
    print('F1')
    print(f'Pearson correlation between RougeScore and score on development set: {rougescore_corr} (p-value < 0.001: {rougescore_corr_pvalue < 0.001}); and Kendall Tau correlation: {rougescore_corr_ktau} (p-value < 0.001: {rougescore_corr_ktau_pvalue < 0.001})')
  
    rougescore_corr, rougescore_corr_pvalue = pearsonr(score,rougedf['Precision'])
    rougescore_corr_ktau, rougescore_corr_ktau_pvalue = kendalltau(score, rougedf['Precision'])
    
    print('Precision')
    print(f'Pearson correlation between RougeScore and score on development set: {rougescore_corr} (p-value < 0.001: {rougescore_corr_pvalue < 0.001}); and Kendall Tau correlation: {rougescore_corr_ktau} (p-value < 0.001: {rougescore_corr_ktau_pvalue < 0.001})')
    
    
    rougescore_corr, rougescore_corr_pvalue = pearsonr(score,rougedf['Recall'])
    rougescore_corr_ktau, rougescore_corr_ktau_pvalue = kendalltau(score, rougedf['Recall'])
    print('Recall')
    print(f'Pearson correlation between RougeScore and score on development set: {rougescore_corr} (p-value < 0.001: {rougescore_corr_pvalue < 0.001}); and Kendall Tau correlation: {rougescore_corr_ktau} (p-value < 0.001: {rougescore_corr_ktau_pvalue < 0.001})')
    print('\n')

Correlation scores for rouge-1 metric 

F1
Pearson correlation between RougeScore and score on development set: 0.4560311953072407 (p-value < 0.001: True); and Kendall Tau correlation: 0.31476099352769943 (p-value < 0.001: True)
Precision
Pearson correlation between RougeScore and score on development set: 0.45844589837035377 (p-value < 0.001: True); and Kendall Tau correlation: 0.32373343209892613 (p-value < 0.001: True)
Recall
Pearson correlation between RougeScore and score on development set: 0.4200236411485583 (p-value < 0.001: True); and Kendall Tau correlation: 0.28901528534576454 (p-value < 0.001: True)


Correlation scores for rouge-2 metric 

F1
Pearson correlation between RougeScore and score on development set: 0.38529887403361995 (p-value < 0.001: True); and Kendall Tau correlation: 0.2800140323240911 (p-value < 0.001: True)
Precision
Pearson correlation between RougeScore and score on development set: 0.38511023895221713 (p-value < 0.001: True); and Kendall Tau correlatio

In [69]:
rougedf=rouge_score(train,'rouge-l')

In [70]:
rougedf

Unnamed: 0,F1,Precision,Recall,Z-score
0,0.160000,0.142857,0.181818,0.890795
1,0.400000,0.380952,0.421053,0.671382
2,0.200000,0.200000,0.200000,0.797958
3,0.727273,0.666667,0.800000,0.745917
4,0.700000,0.636364,0.777778,0.802921
...,...,...,...,...
8167,0.055556,0.055556,0.055556,0.661060
8168,0.473684,0.409091,0.562500,0.738859
8169,0.417910,0.341463,0.538462,0.800686
8170,0.166667,0.142857,0.200000,0.449487
