In [1]:
!ls -l ./models/ 

total 4
drwxrwxr-x 5 shapkin shapkin 4096 Feb 27 19:06 't5-small fruit preprocessing p(comment, x_t+1 | x_t, doc)'


In [2]:
# Add the parent directory to the import path; presumably this is where the
# project-local `utils` package imported further down lives — confirm.
import sys
sys.path.append('..')

In [3]:
import os
# Environment switches are set before the heavy libraries are imported below.
os.environ['TOKENIZERS_PARALLELISM']='true'
# NOTE(review): hard-coded GPU index — this ties the notebook to one machine layout.
os.environ['CUDA_VISIBLE_DEVICES']='3'

In [4]:
import re
import os
import torch
import json
import numpy as np
import pandas as pd
import seaborn as sns
import transformers
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from transformers import T5Tokenizer, T5TokenizerFast, T5ForConditionalGeneration
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from typing import Callable, Union, Tuple
from tqdm.notebook import tqdm
from collections import Counter
from torch import nn
from catalyst import dl
from catalyst.callbacks.periodic_loader import PeriodicLoaderCallback
from langdetect import detect
from easse.sari import corpus_sari
from rouge import Rouge 

from utils.dataset_utils import extract_com8text_from_tgt, extract_text8docs_from_src
from utils.dataset_utils import EditDataset, get_tgt, get_src, COM_SEP, TEXT_SEP_SRC, TEXT_SEP_TGT, DOCS_SEP
from utils.metrics_utils import PeerEditMetricsCallback
from utils.config import Config


# NOTE(review): both constants name the same directory and neither is referenced
# in the visible cells — confirm they are still needed.
DOCS_DIR = 'data'
PAGES_DIR = 'data'

In [5]:
# Shared run configuration; further fields are attached to it in later cells.
CONFIG = Config()
# Seed consumed by the RNG-seeding cell below.
CONFIG.seed = 1337
# Default beam width; reassigned before each generation loop further down.
CONFIG.beam_size = 1

In [6]:
import random

# Seed every RNG the notebook touches so sampling and generation are repeatable.
random.seed(CONFIG.seed)
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here only
# reaches child processes, not the already-running kernel.
os.environ['PYTHONHASHSEED'] = str(CONFIG.seed)
np.random.seed(CONFIG.seed)
torch.manual_seed(CONFIG.seed)
torch.cuda.manual_seed_all(CONFIG.seed)
# Deterministic cuDNN kernels; benchmark mode must be off, otherwise cuDNN may
# auto-tune to a different algorithm from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Data preparation

In [7]:
def load_split(path, column_names):
    """Read one JSON split and rename its columns positionally to ``column_names``.

    Raises ``ValueError`` (from pandas) when the number of names does not match
    the number of columns in the file.
    """
    frame = pd.read_json(path)
    frame.columns = column_names
    return frame


# Close the mapper file deterministically instead of leaking the handle.
with open(r"../data/column_mapper.json") as mapper_file:
    mp = json.load(mapper_file)

# Only the mapper's values are used: they become the column names, in order.
column_names = list(mp.values())

train = load_split(r'../data/new_train.json', column_names)
test = load_split(r'../data/new_test.json', column_names)
val = load_split(r'../data/new_val.json', column_names)

# Small validation subset for quick qualitative checks. Copy it so that later
# column assignments do not write into a slice of `val`.
val1 = val.iloc[:600].copy()

In [8]:
# Show the first rows to check that the renamed columns line up with the data.
train.head()

Unnamed: 0,obj_id,old_text,new_text,comment,docs,diff,title,search_queries,counter_found_docs,section_name,is_good,docs_processed
0,1005,"In rural regions of Germany, especially the Ha...","In rural regions of Germany, especially the Ha...",/* Germany */ grammar,"Apr 30, 2020 — Depending on whom you ask, May ...","this opportunity to party,",May Day,"[May Day Germany this opportunity to party,]",[27],Germany,True,"DOC0: Apr 30, 2020 — Depending on whom you ask..."
1,55,"""This is the new WikiPedia!"" \n-HomePage, the ...","""This is the new WikiPedia!"" \n-HomePage, the ...",Added subpage,English: Results of the 1929 New York City ald...,\n*/New York City Board of Aldermen,John M Wolfson,[John M Wolfson \n*/New York City Board of Ald...,"[10, 4]",,True,DOC0: English: Results of the 1929 New York Ci...
2,2329,"Ares (Ancient Greek: , Μodern Greek: Άρης ) i...","Ares (Ancient Greek: , Μodern Greek: Άρης ) i...",repaired link to 'masculinity' ~~~~,"Apr 7, 2014 — Let's explore the essence of mas...","masculinity, integrity, and personal courage.",Ares,"[Ares masculinity, integrity, and personal cou...",[30],,True,"DOC0: Apr 7, 2014 — Let's explore the essence ..."
3,17206,Peter Velhorn (born 24 November 1932) is a Ger...,Peter Velhorn (24 November 1932 – 20 July 2016...,"Passed away 2016, look at German article",Peter Velhorn (24 November 1932 – 20 July 2016...,– 20 July 2016,Peter Velhorn,[Peter Velhorn – 20 July 2016],[23],,True,DOC0: Peter Velhorn (24 November 1932 – 20 Jul...
4,47,"Michael Palin was educated at Birkdale School,...","Michael Palin was educated at Birkdale School,...",/* Early career */ added info,"Michael Palin, Nightingale House, in Clapham, ...",", Graeme Garden, Bill Oddie and Jonathan Lynn",Michael Palin,"[Michael Palin Early career , Graeme Garden, ...",[29],Early career,True,"DOC0: Michael Palin, Nightingale House, in Cla..."


In [9]:
from nltk.tokenize import sent_tokenize

In [10]:
def clean_closing(text: str):
    """Drop markup-looking lines from ``text`` and make kept lines end with punctuation.

    The text is processed line by line (split on ``'\\n'``), each line stripped:

    * blank lines are kept as empty strings;
    * a line is discarded when it ends with ``]``, ``|}`` or ``--`` or starts
      with ``|``, ``{|``, ``:`` or ``!``;
    * in kept lines ``*/`` and ``/*`` become ``*``, tabs become spaces, every
      ``*`` gets a following space and double spaces are squeezed (four passes);
    * a ``.`` is appended when the last character is a word character or a space.

    Returns the surviving lines joined with ``'\\n'`` and stripped.
    """
    kept = []
    for raw in text.split('\n'):
        line = raw.strip()
        if not line:
            # Blank lines pass through untouched.
            kept.append(line)
            continue

        # Discard lines whose first/last characters mark them as markup.
        if line.endswith((']', '|}', '--')):
            continue
        if line.startswith(('|', '{|', ':', '!')):
            continue

        line = line.replace('*/', '*').replace('/*', '*').replace('\t', ' ').replace('*', '* ')
        # Exactly four squeeze passes: very long runs of spaces are shortened
        # but not necessarily reduced to a single space.
        for _ in range(4):
            line = line.replace('  ', ' ')

        # Only a punctuation character at the very end counts as a closed line.
        if re.match(r'[^\w\s]', line[-1]):
            kept.append(line)
        else:
            kept.append(line + '.')
    return '\n'.join(kept).strip()

In [11]:
def numerate_sents(old_text, new_text):
    """Number the sentences of ``old_text`` and reference them from ``new_text``.

    Both texts are cleaned with ``clean_closing`` and split with ``sent_tokenize``.

    Returns a 4-tuple:
      * the old text with every sentence prefixed by ``[k]`` (k = its position),
      * the new text in which each sentence that also occurs verbatim in the old
        text is replaced by its ``[k]`` placeholder,
      * ``sent2idx`` mapping old sentence -> position (for a repeated sentence
        the last position wins),
      * ``idx2sent`` mapping position -> old sentence.
    """
    cleaned_old = clean_closing(old_text)
    cleaned_new = clean_closing(new_text)

    old_sents = sent_tokenize(cleaned_old)
    new_sents = sent_tokenize(cleaned_new)

    sent2idx = {sentence: position for position, sentence in enumerate(old_sents)}
    idx2sent = dict(enumerate(old_sents))

    # New sentences copied verbatim from the old text collapse to a placeholder.
    new_parts = [
        f"[{sent2idx[sentence]}]" if sentence in sent2idx else sentence
        for sentence in new_sents
    ]
    old_parts = [f"[{position}] {sentence}" for position, sentence in enumerate(old_sents)]

    return (' '.join(old_parts), ' '.join(new_parts), sent2idx, idx2sent)

In [12]:
# Pull one training row apart so the numbering helper can be tried on it below.
idx = 1
el_idx = train.iloc[idx]

docs = el_idx['docs_processed']
comment = el_idx['comment']
old_text = el_idx['old_text']
new_text = el_idx['new_text']

In [13]:
# Demo: numbered old text, placeholder-compressed new text and the two lookup dicts.
numerate_sents(old_text, new_text)

('[0] "This is the new WikiPedia!" [1] -HomePage, the first ever article created on Wikipedia, created by office.bomis.com on Mon, 15 Jan 2001 19:27:13. [2] Interested in Chicago and urban history, and other things. [3] SUBPAGES (Excl. [4] Sandbox):\n* Chicago aldermanic notability. [5] * Louis B. Anderson. [6] * Chicago City Council since 1923. [7] * Town of Chicago. [8] "FORMER" SUBPAGES (content moved elsewhere, now serving as redirects):\n* Notabilitymandering. [9] * List of Chicago aldermen since 1923. [10] * Chicago aldermanic elections before 1923.',
 '[0] [1] [2] [3] [4] [5] [6] [7] * New York City Board of Aldermen. [8] [9] [10]',
 {'"This is the new WikiPedia!"': 0,
  '-HomePage, the first ever article created on Wikipedia, created by office.bomis.com on Mon, 15 Jan 2001 19:27:13.': 1,
  'Interested in Chicago and urban history, and other things.': 2,
  'SUBPAGES (Excl.': 3,
  'Sandbox):\n* Chicago aldermanic notability.': 4,
  '* Louis B. Anderson.': 5,
  '* Chicago City Cou

In [14]:
def add_numbered_columns(frame):
    """Return a copy of ``frame`` with sentence-numbered ``old_text`` / ``new_text``.

    Added or overwritten columns:
      * ``numerate_ans`` – the full tuple returned by ``numerate_sents``,
      * ``old_text_old`` / ``new_text_old`` – the untouched original texts,
      * ``old_text`` / ``new_text`` – the numbered / placeholder versions.

    The raw text is read from the ``*_old`` columns when they already exist, so
    running the cell twice does not number already-numbered text.
    """
    frame = frame.copy()
    old_col = 'old_text_old' if 'old_text_old' in frame.columns else 'old_text'
    new_col = 'new_text_old' if 'new_text_old' in frame.columns else 'new_text'

    frame['numerate_ans'] = frame.apply(
        lambda row: numerate_sents(row[old_col], row[new_col]), axis=1)

    # Keep the raw texts: they are needed later for display and for the metrics.
    frame['new_text_old'] = frame[new_col]
    frame['old_text_old'] = frame[old_col]

    frame['old_text'] = frame['numerate_ans'].apply(lambda ans: ans[0])
    frame['new_text'] = frame['numerate_ans'].apply(lambda ans: ans[1])
    return frame


val = add_numbered_columns(val)
val1 = add_numbered_columns(val1)

In [15]:
# Sequence-length limits; src_max_len is also passed to the tokenizer below.
CONFIG.src_max_len = 512
CONFIG.tgt_max_len = 512
# Name of the pretrained model/tokenizer to load.
CONFIG.pretrained = 't5-small'
# Run directory; the checkpoint is later read from `<pattern_path>/checkpoints/model.best.pth`.
CONFIG.pattern_path = './models/t5-small fruit preprocessing p(comment, x_t+1 | x_t, doc)'
# Batch size used by the validation DataLoader.
CONFIG.batch_size = 4

tokenizer = T5Tokenizer.from_pretrained(CONFIG.pretrained, model_max_length=CONFIG.src_max_len)

**Make dataset**

In [16]:
# The training dataset is not built in this notebook (line kept commented out).
#ds_train = EditDataset(train, tokenizer, CONFIG, text_to_lower=True, comment_to_lower=True)
# ds_val wraps the 600-row subset (val1); ds_val_full wraps the whole validation frame.
ds_val = EditDataset(val1, tokenizer, CONFIG, text_to_lower=True, comment_to_lower=True)
ds_val_full = EditDataset(val, tokenizer, CONFIG, text_to_lower=True, comment_to_lower=True)

## Model definition and checkpoint loading

In [17]:
class EditModel(nn.Module):
    """Wrapper around a pretrained seq2seq model whose forward pass returns the loss.

    Args:
        pretrained: the underlying model; stored as ``self.pretrained``.
        config: accepted for interface compatibility; not used by this class.
    """

    def __init__(self,
                 pretrained: transformers.modeling_utils.PreTrainedModel,
                 config: Config):
        super().__init__()
        self.pretrained = pretrained

    def forward(self,
                x: Tuple[torch.Tensor, torch.Tensor]):
        """Compute the loss for one batch.

        Args:
            x: ``(src, tgt)`` token-id tensors; id 0 is treated as padding.

        Returns:
            The ``.loss`` attribute of the wrapped model's output.
        """
        src, tgt = x

        # Padding positions must not contribute to the loss: label -100 is the
        # value the loss ignores. The previous `tgt[tgt == 0] == -100` was a
        # comparison and changed nothing. masked_fill builds a new tensor, so
        # the caller's `tgt` is left untouched.
        labels = tgt.masked_fill(tgt == 0, -100)

        loss = self.pretrained(
            input_ids=src,
            attention_mask=(src != 0).float(),
            labels=labels,
        ).loss
        return loss
    
    
class Criterion(nn.Module):
    """Pass-through criterion: returns its first argument unchanged."""

    def __init__(self):
        super().__init__()

    def forward(self, pred, tgt):
        # `tgt` is part of the signature only; the value in `pred` is returned as is.
        result = pred
        return result

In [18]:
# Prefer the GPU, but fall back to CPU so the notebook still runs on hosts without CUDA.
CONFIG.device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [19]:
# Load the checkpoint into the EditModel wrapper (presumably the class it was
# saved from, so the parameter names match — confirm).
model_edit = EditModel(T5ForConditionalGeneration.from_pretrained(CONFIG.pretrained), CONFIG)
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
model_edit.load_state_dict(
    torch.load(f'{CONFIG.pattern_path}/checkpoints/model.best.pth', 
               map_location=CONFIG.device))
# From here on `model_edit` is the inner pretrained model, not the wrapper;
# generation below calls `.generate` on it directly.
model_edit = model_edit.pretrained
model_edit.to(CONFIG.device)
model_edit.eval()
print('Success')

Success


In [20]:
from utils.print_diff_utils import diff_print, colored

In [21]:
def restore_sentences(text, idx2sent):
    """Expand ``[k]`` placeholders in ``text`` with ``idx2sent[k]`` in a single pass.

    A single regex pass (instead of one ``str.replace`` per sentence) ensures
    that bracketed numbers inside an already-restored sentence are not expanded
    a second time. Placeholders with an unknown index are left as they are.
    """
    return re.sub(
        r'\[(\d+)\]',
        lambda match: idx2sent.get(int(match.group(1)), match.group(0)),
        text,
    )


device = CONFIG.device
CONFIG.beam_size = 3
# Random validation examples to inspect (sampled with replacement, so an index may repeat).
idx_ = np.random.choice(len(ds_val), 20)

with torch.no_grad():
    for i in idx_:
        row = val1.iloc[i]
        idx2sent = row['numerate_ans'][3]
        src_, tgt_ = ds_val[i]

        generated = model_edit.generate(
            torch.tensor(src_['input_ids']).view(1, -1).to(device),
            num_beams=CONFIG.beam_size,
            num_return_sequences=1,
            max_length=CONFIG.tgt_max_len,
        ).cpu()

        # Keep the un-highlighted texts: every diff below is computed from these
        # rather than from text that already carries highlight markup.
        src_raw = row['old_text_old']
        tgt_text = tokenizer.decode(tgt_['input_ids'], skip_special_tokens=True)
        tgt_comment, tgt_raw = extract_com8text_from_tgt(tgt_text)
        tgt_raw = restore_sentences(tgt_raw, idx2sent)

        print(colored(f'\n\n---------- QUERY {i} ----------', 'red'))
        print(colored(f'X_t:\n', 'pink') + colored(diff_print(tgt_raw, src_raw)) + '\n')
        print(colored(f'X_t+1:\n', 'pink') + colored(diff_print(src_raw, tgt_raw)) + '\n')
        print(colored(f'Comment:', 'yellow') + colored(tgt_comment) + '\n')

        # num_return_sequences=1, so there is exactly one hypothesis to show.
        gen_text = tokenizer.decode(generated[0], skip_special_tokens=True)
        gen_comment, gen_raw = extract_com8text_from_tgt(gen_text)
        gen_raw = restore_sentences(gen_raw, idx2sent)

        print(colored(f'Gen Comment:', 'blue') + colored(gen_comment) + '\n')
        print(colored(f'gen X_t+1:\n', 'pink') + colored(diff_print(src_raw, gen_raw)) + '\n')

        print(colored(f'Tgt diff:\n', 'bold') + colored(row['diff']) + '\n')

        # Put every retrieved document on its own line.
        doc_str = '\n'.join(row['docs_processed'].split('DOC'))
        print(colored(f'Docs:\n', 'yellow') + colored(doc_str) + '\n')

[91m 

---------- QUERY 151 ----------
[95m X_t:
[39m A superset of the General MIDI standard, added several proprietary extensions. The most notable addition was the ability to address multiple banks of programs (instrument sounds) by using an additional pair of Bank Select controllers to specify up to 16384 'variation' sounds (cc#0 is Bank Select MSB, and cc#32 is Bank Select LSB). [1m Other [0m most notable features were 9 [1m Drum [0m kits with 14 additional drum sounds each, [1m Control Change [0m messages for controlling the send level of sound effect blocks (cc#91-94), entering additional parameters (cc#98-101), portamento, sostenuto, soft pedal (cc#65-67), and model-specific [1m SysEx [0m messages for setting various parameters of the synth engine. [1m GS [0m was introduced with the [1m Roland Sound Canvas [0m line, which was also [1m Roland's [0m first [1m General MIDI [0m synth module. 

[95m X_t+1:
[39m A superset of the General MIDI standard, added seve

[91m 

---------- QUERY 167 ----------
[95m X_t:
[39m 01. End Of The [1m World [0m 02. Love For [1m Air [0m 03. Normal [1m People [0m 04. [1m Everything [0m 05. Do It For [1m Yourself [0m 06. Honest To [1m God [0m 07. That Feeling and the [1m Sound [0m 08. [1m Dive [0m 09. Better [1m Friend [0m 10. All The [1m Time [0m 11. [1m 365 [0m 12. Crying [1m Wolf [0m 13. Wreckage in the [1m Rubble , Ross Leighton - [0m vocals, guitar, songwriter, [1m production [0m * [1m Greg Walkinshaw - [0m vocals, drums, songwriter, [1m production [0m * [1m Marc Strain - [0m vocals, bass, songwriter, 

[95m X_t+1:
[39m 01. End Of The [1m World. [0m 02. Love For [1m Air. [0m 03. Normal [1m People. [0m 04. [1m Everything. [0m 05. Do It For [1m Yourself. [0m 06. Honest To [1m God. [0m 07. That Feeling and the [1m Sound. [0m 08. [1m Dive. [0m 09. Better [1m Friend. [0m 10. All The [1m Time. [0m 11. [1m 365. [0m 12. Crying [1m Wolf. [0m 13. Wreckage i

[91m 

---------- QUERY 84 ----------
[95m X_t:
[39m , [1m The [0m village has a total number of 28 houses and the population of 198 of which [1m include [0m 99 are males while 99 are females. According to the report published by Census India in 2011, out of the total population of the village 0 people are from Schedule Caste and the village does not have any Schedule Tribe population so far. 

[95m X_t+1:
[39m , [1m the [0m village has a total number of 28 houses and the population of 198 of which 99 are males while 99 are females. According to the report published by Census India in 2011, out of the total population of the village 0 people are from Schedule Caste and the village does not have any Schedule Tribe population so far. 

[92m Comment:[39m COM_SEP clean up, replaced: of which include  of which

[94m Gen Comment:[39m COM_SEP /* population */added content

[95m gen X_t+1:
[39m , The village has a total number of 28 houses and the population of 198 of which in

[91m 

---------- QUERY 438 ----------
[95m X_t:
[39m [1m Bahati [0m is a constituency of the [1m National Assembly [0m of [1m Zambia.Bahati National Assembly [0m of [1m Zambia It [0m covers the northern part of [1m Mansa [0m and a rural area to the north of the city in [1m Luapula Province. [0m 

[95m X_t+1:
[39m [1m bahati [0m is a constituency of the [1m national assembly [0m of [1m zambia.bahati national assembly [0m of [1m zambia it [0m covers the northern part of [1m mansa [0m and a rural area to the north of the city in [1m mansa district of luapula province. [0m 

[92m Comment:[39m COM_SEP suggested change in display

[94m Gen Comment:[39m COM_SEP /* top */ ce

[95m gen X_t+1:
[39m [1m bahati [0m is a constituency of the [1m national assembly [0m of [1m zambia.bahati national assembly [0m of [1m zambia it [0m covers the northern part of [1m mansa [0m and a rural area to the north of the city in 

[1m Tgt diff:
 [0m[39m  Mansa Dist

[91m 

---------- QUERY 93 ----------
[95m X_t:
[39m [1m Stoke City Football Club [0m (known as [1m Stoke Football Club [0m until 1925) is a football club from [1m Stoke-on-Trent [0m in [1m England. [0m The club is reputedly the second-oldest football league club in the world, after Notts County F.C., and claims to have been formed in 1863 (disputed by some, who claim it to be 1868). The club’s nickname is The Potters and its home kit consists of a red & white vertical-striped shirt with white shorts. The club is managed by Johan Boskamp. It plays in the Football League Championship and is one of the twelve founder-members of The Football League. 

[95m X_t+1:
[39m [1m stoke city football club [0m (known as [1m stoke football club [0m until 1925) is a football club from [1m stoke-on-trent [0m in [1m england (the other league club in the city being port vale f.c.). [0m The club is reputedly the second-oldest football league club in the world, after Notts County F.C.

[91m 

---------- QUERY 203 ----------
[95m X_t:
[39m Amy Hart Redford (born October 22, 1970) is an American actress, director and producer. [1m She [0m is the daughter of [1m Academy Award-winning [0m film director and actor [1m Robert Redford [0m and his former wife [1m Lola Van Wagenen.The New York Times [0m 

[95m X_t+1:
[39m Amy Hart Redford (born October 22, 1970) is an American actress, director and producer. [1m she [0m is the daughter of [1m academy award-winning [0m film director and actor [1m robert redford [0m and his former wife [1m lola van wagenen.the new york times she is the sister of writer/producer james redford. james redford at imdb. [0m 

[92m Comment:[39m COM_SEP add in link to brother

[94m Gen Comment:[39m COM_SEP added content

[95m gen X_t+1:
[39m Amy Hart Redford (born October 22, 1970) is an American actress, director and producer. [1m she [0m is the daughter of [1m academy award-winning [0m film director and actor [1m rober

## Generate predictions for the full validation set

In [22]:
# Loader over the full validation dataset. shuffle=False keeps dataset order,
# which the later cells rely on when they index `predictions` per row of `val`.
# NOTE(review): a lambda collate_fn combined with num_workers > 0 needs a
# fork-based worker start method; it cannot be pickled for spawn — confirm the
# target platform.
loaders = {
    'valid_full': torch.utils.data.DataLoader(ds_val_full, 
                                         batch_size=CONFIG.batch_size,
                                         collate_fn=lambda x: EditDataset.collate_fn(x, tokenizer, CONFIG),
                                         num_workers=4, shuffle=False)
}

In [23]:
from tqdm.auto import tqdm

device = CONFIG.device
CONFIG.beam_size = 1
# Hypotheses kept per input. generate() returns this many rows per example,
# independent of the beam width, so the regrouping below must use this value.
num_return_sequences = 1

# predictions[i] is the list of decoded hypotheses for the i-th dataset item.
predictions = []
with torch.no_grad():
    for batch in tqdm(loaders['valid_full'], total=len(loaders['valid_full'])):
        (src_, tgt_), _ = batch

        generated = model_edit.generate(src_.to(device),
                                        num_beams=CONFIG.beam_size,
                                        num_return_sequences=num_return_sequences,
                                        max_length=CONFIG.tgt_max_len)
        generated = generated.cpu()

        # (batch * num_return_sequences, seq_len) -> (batch, num_return_sequences, seq_len)
        pred = generated.view(-1, num_return_sequences, generated.shape[1])
        for hypotheses in pred:
            predictions.append(
                [tokenizer.decode(hypothesis, skip_special_tokens=True) for hypothesis in hypotheses]
            )

  0%|          | 0/1661 [00:00<?, ?it/s]

In [24]:
from utils.metrics_utils import topN_diff_exact_match_one, topN_exact_match_one, sari_one, rouge_one

In [27]:
def restore_sentences(text, idx2sent):
    """Expand ``[k]`` placeholders in ``text`` with ``idx2sent[k]`` in a single pass.

    A single regex pass (instead of one ``str.replace`` per sentence) ensures
    that bracketed numbers inside an already-restored sentence are not expanded
    a second time. Placeholders with an unknown index are left as they are.
    """
    return re.sub(
        r'\[(\d+)\]',
        lambda match: idx2sent.get(int(match.group(1)), match.group(0)),
        text,
    )


predictions_back = []
tgt_back = []
# `predictions` is a plain list in dataset order, so rows are matched by
# position rather than by DataFrame index label (the two coincide only for a
# default RangeIndex). Assumes ds_val_full is indexed by position as well.
for position, (_, row) in tqdm(enumerate(val.iterrows()), total=len(val)):
    idx2sent = row['numerate_ans'][3]
    real_tgt = tokenizer.decode(ds_val_full[position][1]['input_ids'], skip_special_tokens=True)

    predictions_back.append(
        [restore_sentences(hypothesis, idx2sent) for hypothesis in predictions[position]]
    )
    tgt_back.append(restore_sentences(real_tgt, idx2sent))

  0%|          | 0/6643 [00:00<?, ?it/s]

In [28]:
# Spot-check one example: restored hypotheses next to the restored reference.
predictions_back[1], tgt_back[1]

(['COM_SEP added edwin g. andrade TEXT_SEP WICO (1320 AM) is a radio station broadcasting a talk radio format. licensed to salisbury, maryland, united states, the station is owned by edwin g. andrade, through licensee the voice radio, llc, and features programming from abc radio. , FCC History Cards for WICO.'],
 'COM_SEP license assigned 7 december 2017 TEXT_SEP WICO (1320 AM) is a radio station broadcasting a talk radio format. licensed to salisbury, maryland, united states, the station is owned by edwin g. andrade, through licensee the voice radio, llc, and features programming from abc radio. , FCC History Cards for WICO.')

In [31]:
# Both lists should contain one entry per validation row.
len(tgt_back), len(predictions_back)

(6643, 6643)

In [35]:
def count_one(src, tgt, preds):
    """Compute all per-example metrics for one validation item.

    Args:
        src: source string, split with ``extract_text8docs_from_src``.
        tgt: reference string, split with ``extract_com8text_from_tgt``.
        preds: iterable of predicted strings, each split the same way as ``tgt``.

    Returns:
        Dict mapping metric name to the value returned by the matching helper
        from ``utils.metrics_utils``.
    """
    tgt_comment, tgt_txt = extract_com8text_from_tgt(tgt)
    src_txt, _ = extract_text8docs_from_src(src)

    pred_comments, pred_texts = [], []
    for pred in preds:
        pred_comment, pred_txt = extract_com8text_from_tgt(pred)
        pred_comments.append(pred_comment)
        pred_texts.append(pred_txt)

    # ROUGE is computed once and reused for the three reported variants.
    rouge_scores = rouge_one(src_txt, tgt_txt, pred_texts)

    return {
        'full__exact_match@1': topN_exact_match_one(src, tgt, preds),
        'text__exact_match@1': topN_exact_match_one(src_txt, tgt_txt, pred_texts),
        'text__diff_exact_match@1': topN_diff_exact_match_one(src_txt, tgt_txt, pred_texts),
        'comment__exact_match@1': topN_exact_match_one('', tgt_comment, pred_comments),
        'text__sari@1': sari_one(src_txt, tgt_txt, pred_texts),
        'text__rouge-1@1': rouge_scores['rouge-1'],
        'text__rouge-2@1': rouge_scores['rouge-2'],
        'text__rouge-l@1': rouge_scores['rouge-l'],
    }

In [36]:
# Running sum of every metric over the validation set.
a = {}
examples = zip(val['old_text_old'].values, tgt_back, predictions_back)
for src, tgt, preds in tqdm(examples, total=len(val)):
    aa = count_one(src, tgt, preds)
    for metric_name, metric_value in aa.items():
        a[metric_name] = a.get(metric_name, 0) + metric_value

a

  0%|          | 0/6643 [00:00<?, ?it/s]

{'full__exact_match@1': 74.0,
 'text__exact_match@1': 498.0,
 'text__diff_exact_match@1': 4437.382141906501,
 'comment__exact_match@1': 123.0,
 'text__sari@1': 251373.6323234555,
 'text__rouge-1@1': 5716.424814025186,
 'text__rouge-2@1': 5315.424324554212,
 'text__rouge-l@1': 5697.661137671814}

In [37]:
# Turn the accumulated sums into per-example means. The keys come from `a`
# itself rather than from the loop variable `aa` left over by the previous cell.
# NOTE: `a` is rebound to the means, so re-running this cell without re-running
# the accumulation cell divides a second time.
a = {metric_name: total / len(val) for metric_name, total in a.items()}
a

{'full__exact_match@1': 0.011139545386120728,
 'text__exact_match@1': 0.07496612976065031,
 'text__diff_exact_match@1': 0.6679786454774199,
 'comment__exact_match@1': 0.01851573084449797,
 'text__sari@1': 37.84037819109672,
 'text__rouge-1@1': 0.8605185630024365,
 'text__rouge-2@1': 0.800154196079213,
 'text__rouge-l@1': 0.8576939842950194}

In [38]:
# There should be one prediction list per validation row.
len(predictions), len(val)

(6643, 6643)

In [39]:
# Raw model output (placeholders not yet expanded) next to the original new text.
predictions[1], val['new_text_old'].values[1]

(['COM_SEP added edwin g. andrade TEXT_SEP [0] licensed to salisbury, maryland, united states, the station is owned by edwin g. andrade, through licensee the voice radio, llc, and features programming from abc radio. [2]'],
 'WICO (1320 AM) is a radio station broadcasting a talk radio format. Licensed to Salisbury, Maryland, United States, the station is owned by Edwin G. Andrade, through licensee The Voice Radio, LLC, and features programming from ABC Radio.\n\n, FCC History Cards for WICO')