In [1]:
!ls -l ./models/ 

total 8
drwxrwxr-x 5 ubuntu ubuntu 4096 Jan  8 15:10 't5-small p(comment, x_t+1 | x_t, doc)'
drwxrwxr-x 5 ubuntu ubuntu 4096 Jan  8 13:26 't5-small_test_one_batch p(comment, x_t+1 | x_t, doc)'


In [None]:
import re
import os
import torch
import json
import numpy as np
import pandas as pd
import seaborn as sns
import transformers
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from transformers import T5Tokenizer, T5TokenizerFast, T5ForConditionalGeneration
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from typing import Callable, Union, Tuple
from tqdm.notebook import tqdm
from collections import Counter
from torch import nn
from catalyst import dl
from catalyst.callbacks.periodic_loader import PeriodicLoaderCallback
from langdetect import detect
from easse.sari import corpus_sari
from rouge import Rouge 

from utils.dataset_utils import extract_com8text_from_tgt, extract_text8docs_from_src
from utils.dataset_utils import EditDataset, get_tgt, get_src, COM_SEP, TEXT_SEP_SRC, TEXT_SEP_TGT, DOCS_SEP
from utils.metrics_utils import PeerEditMetricsCallback
from utils.config import Config


DOCS_DIR = 'data'
PAGES_DIR = 'data'

In [None]:
CONFIG = Config()
CONFIG.seed = 1337
CONFIG.beam_size = 1

In [None]:
import random

random.seed(CONFIG.seed)
os.environ['PYTHONHASHSEED'] = str(CONFIG.seed)
np.random.seed(CONFIG.seed)
torch.manual_seed(CONFIG.seed)
torch.cuda.manual_seed(CONFIG.seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True

## Data preparing

In [5]:
mp = json.load(open(r"data/column_mapper.json"))

train = pd.read_json(r'data/train.json')
train.set_axis(mp.values(), axis='columns', inplace=True)

test = pd.read_json(r'data/test.json')
test.set_axis(mp.values(), axis='columns', inplace=True)

val = pd.read_json(r'data/val.json')
val.set_axis(mp.values(), axis='columns', inplace=True)
val1 = val.sample(600)

In [6]:
train.head()

Unnamed: 0,obj_id,old_text,new_text,comment,docs,diff,title,search_queries,counter_found_docs,section_name,is_good,docs_processed
0,13807,People Who Fear People was another project tha...,"People Who Fear People Variety, August 17th, 1...",/* People Who Fear People (1999) */ Added refe...,"Maria Pitillo (born January 8, 1966) is an Ame...","Variety, August 17th, 1999 Page 5",Maria Pitillo,[Maria Pitillo People Who Fear People (1999) ...,[28],People Who Fear People (1999),True,"DOC0: Maria Pitillo (born January 8, 1966) is ..."
1,1431,Thank you for signing my signbook. :) <3 Tinkl...,Thank you for signing my signbook. :) <3 Tinkl...,/* Thank You */ re,Councilmember Antonio López. 584 likes. Welcom...,\n:your welcome,Antonio Lopez,[Antonio Lopez Thank You \n:your welcome],[24],Thank You,True,DOC0: Councilmember Antonio López. 584 likes. ...
2,1362,Mark Ward may refer to:\n* Mark Ward (football...,Mark Ward may refer to:\n* Mark Ward (football...,Added in Mark Ward Sinn Féin TD for Dublin Mid...,Email me. Spokesperson for Mental Health. Mark...,\n* Mark Ward Sinn Féin TD for Dublin Mid West,Mark Ward,[Mark Ward \n* Mark Ward Sinn Féin TD for Dubl...,[28],,True,DOC0: Email me. Spokesperson for Mental Health...
3,10057,National Schools Tree Day is held on the last ...,National Schools Tree Day is held on the last ...,Removed footnote reference to wikipedia article,"While every day can be Tree Day, we dedicate c...",(See Planet Ark).,Arbor Day,[Arbor Day Australia (See Planet Ark).],[27],Australia,True,"DOC0: While every day can be Tree Day, we dedi..."
4,4566,* Bea Alonzo as Architect Basha-Belinda Eugeni...,* Bea Alonzo as Architect Basha-Belinda Eugeni...,/* Cast */ Added links,Top cast ; John Lloyd Cruz · Popoy ; Bea Alonz...,"Rodolfo """,A Second Chance (2015 film),"[A Second Chance (2015 film) Cast Rodolfo ""]",[27],Cast,True,DOC0: Top cast ; John Lloyd Cruz · Popoy ; Bea...


In [7]:
CONFIG.src_max_len = 512
CONFIG.tgt_max_len = 512
CONFIG.pretrained = 't5-small'
CONFIG.pattern_path = './models/t5-small p(comment, x_t+1 | x_t, doc)'
CONFIG.batch_size = 4

tokenizer = T5TokenizerFast.from_pretrained(CONFIG.pretrained, model_max_length=CONFIG.src_max_len)

**Make dataset**

In [8]:
#ds_train = EditDataset(train, tokenizer, CONFIG, text_to_lower=True, comment_to_lower=True)
ds_val = EditDataset(val1, tokenizer, CONFIG, text_to_lower=True, comment_to_lower=True)
#ds_val_full = EditDataset(val, tokenizer, CONFIG, text_to_lower=True, comment_to_lower=True)

**Len distribution**

In [9]:
idx_num = 100
src_text = tokenizer.decode(ds_val[idx_num][0]['input_ids'], skip_special_tokens=True)
tgt_text = tokenizer.decode(ds_val[idx_num][1]['input_ids'], skip_special_tokens=True)

In [10]:
print(f'{src_text}\n\n{tgt_text}')

TEXT_SEP canberra plaza is designed with three commercial stories that have a combined gross floor area of the main anchor tenants include a supermarket and food courts. other commercial units include restaurants, enrichment centres as well as clinics. a sheltered plaza for community activities and a water playground is incorporated into the design of canberra plaza as well. in order to improve connectivity, a portion of the ground level will be kept open for 24-hours and an sheltered elevated pedestrian footbridge will also link housing precincts (such as eastlink ii @ canberra) across canberra way to canberra plaza. a similar footbridge will connect canberra plaza to canberra mrt station which is located across canberra link.exhibition panel for canberra plaza at the exhibition(pdf), retrieved 2018-04-18. source: hdb DOCS_SEP doc0: canberra plaza is a new generation neighbourhood centre (ngnc) built by the housing. construction started in mid-2015 and the shopping centre opened on 18

## Model training

In [11]:
class EditModel(nn.Module):
    def __init__(self, 
                 pretrained: transformers.modeling_utils.PreTrainedModel, 
                 config: Config):
        super(EditModel, self).__init__()
        self.pretrained = pretrained
        

    def forward(self, 
                x: Tuple[torch.Tensor, torch.Tensor]):
        src, tgt = x
        
        tgt[tgt == 0] == -100
        
        loss = self.pretrained(
            input_ids = src,
            attention_mask = (src != 0).float(),
            labels=tgt,
        ).loss
        return loss
    
    
class Criterion(nn.Module):
    def __init__(self):
        super(Criterion, self).__init__()
        
    def forward(self, pred, tgt):
        return pred

In [12]:
CONFIG.device = 'cuda'

In [13]:
model_edit = EditModel(T5ForConditionalGeneration.from_pretrained(CONFIG.pretrained), CONFIG)
model_edit.load_state_dict(
    torch.load(f'{CONFIG.pattern_path}/checkpoints/model.best.pth', 
               map_location=CONFIG.device))
model_edit = model_edit.pretrained
model_edit.to(CONFIG.device)
model_edit.eval()
print('Success')

Success


In [14]:
METRIC_LIST = [
    'full__exact_match@1',
    'text__exact_match@1',
    'text__diff_exact_match@1',
    'comment__exact_match@1',
    'text__sari@1',
    'text__rouge-1@1',
    'text__rouge-2@1',
    'text__rouge-l@1'
]   

In [41]:
val1

Unnamed: 0,obj_id,old_text,new_text,comment,docs,diff,title,search_queries,counter_found_docs,section_name,is_good,docs_processed
7191,1252,"Argyle, M. (1975). Bodily communication. New...","Argyle, M. (1975). Bodily communication. New...",+ ja:,Category:ja:Body language · あくび · 欠伸 · メロイックサイ...,\nja:ボディー・ランゲージ,Body language,[Body language Reference \nja:ボディー・ランゲージ],[20],Reference,True,DOC0: Category:ja:Body language · あくび · 欠伸 · メ...
1582,213,"*Daniel Hack Tuke (1827–1895), English alienis...","*Daniel Hack Tuke (1827–1895), English alienis...",Added Gordon Turnbull,Additional lists of psychiatrists can be found...,"\n*Gordon Turnbull, Scottish, posttraumatic st...",List of psychiatrists,"[List of psychiatrists T \n*Gordon Turnbull, ...",[28],T,True,DOC0: Additional lists of psychiatrists can be...
1393,15579,"""I Luv U"" is a song by English indie rock grou...","""I Luv U"" is a song by English indie rock grou...","""to date"" does not make sense",The Ordinary Boys - I Luv U - With Full Lyrics...,. It reached,I Luv U (The Ordinary Boys song),[I Luv U (The Ordinary Boys song) . It reached],[28],,True,DOC0: The Ordinary Boys - I Luv U - With Full ...
6007,329,Frank worked in the bank his family ran until ...,Frank worked in the bank that his father initi...,/* Marriage and children */Added content,Otto Frank married former Amsterdam neighbor a...,that his\nther initially run -and that he and ...,Otto Frank,"[Otto Frank Marriage and children that his, Ot...","[27, 27]",Marriage and children,True,DOC0: Otto Frank married former Amsterdam neig...
2837,10186,Distance Measurement Equipment (DME) is used t...,Distance Measurement Equipment (DME) is used t...,tighten,DME is a fundamentally simple technology that ...,. Once a pilot tunes onto a particular VOR fre...,Avionics,[Avionics DME . Once a pilot tunes onto a par...,"[29, 29, 29]",DME,True,DOC0: DME is a fundamentally simple technology...
...,...,...,...,...,...,...,...,...,...,...,...,...
2659,358,Moazzam Tufail Malik is a British diplomat se...,Moazzam Tufail Malik is a British civil serva...,Updated professional background,Moazzam Tufail Malik CMG is a British civil se...,civil servant and,Moazzam Malik (diplomat),[Moazzam Malik (diplomat) civil servant and ],[28],,True,DOC0: Moazzam Tufail Malik CMG is a British ci...
2856,3558,*Hatton\n*Lind\n*Othello\n*Ritzville\n*Washtuc...,*Hatton\n*Lind\n*Othello\n*Ritzville\n*Washtuc...,/* Cities and towns */ +pt,Adams County's two most populous cities are Ri...,\npt:Condado de Adams (Washington),"Adams County, Washington","[Adams County, Washington Cities and towns \n...",[8],Cities and towns,True,DOC0: Adams County's two most populous cities ...
4610,1132,"* Brian Bruney, New York Yankees reliever\n* H...","* Brian Bruney, New York Yankees reliever\n* H...","added Joshua Marquis, District Attorney",Joshua K. Marquis (born 1952) is an attorney a...,"\n* Joshua Marquis, District Attorney","Astoria, Oregon","[Astoria, Oregon Notable residents \n* Joshua ...",[13],Notable residents,True,DOC0: Joshua K. Marquis (born 1952) is an atto...
3796,3542,The Soča (in Slovene) or Isonzo (in Italian) (...,The Soča ( in Slovene) or Isonzo (in Italian) ...,Pronunciation,"Soča (pronounced [ˈsoːtʃa], Italian: Sonzia) i...",in Slovene) or,Soča,[Soča in Slovene) or],[28],,True,"DOC0: Soča (pronounced [ˈsoːtʃa], Italian: Son..."


In [46]:
device = CONFIG.device
CONFIG.beam_size = 3
idx_ = np.random.choice(len(ds_val), 20)

with torch.no_grad():
    for i in idx_:
        src_, tgt_ = ds_val[i]

        generated = model_edit.generate(torch.tensor(src_['input_ids']).view(1,-1).to(device), 
                                                      num_beams=CONFIG.beam_size, 
#                                                       pad_token_id=tokenizer.pad_token_id, 
#                                                       bos_token_id=tokenizer.bos_token_id, 
#                                                       eos_token_id=tokenizer.eos_token_id,
                                                          num_return_sequences=1,
                                                     max_length=512)
        generated = generated.cpu()

        src_text = tokenizer.decode(src_['input_ids'], skip_special_tokens=True)
        tgt_text = tokenizer.decode(tgt_['input_ids'], skip_special_tokens=True)
        
        tgt_comment, tgt_txt = extract_com8text_from_tgt(tgt_text)
        src_txt, _ = extract_text8docs_from_src(src_text)

        print(f'\n\n---------- QUERY {i} ----------')
        print(f'X_t:\n{src_txt}\n')
        print(f'X_t+1:\n{tgt_txt}\n')
        print(f'Comment:\t{tgt_comment}\n')
        print(f'GENERATED:')

        for j in range(1):
            to_gen = generated[j]
            gen_text = tokenizer.decode(to_gen, skip_special_tokens=True)
            gen_comment, gen_txt = extract_com8text_from_tgt(tgt_text)
            print(f'Comment:\t{gen_comment}\n')
            print(f'gen X_t+1:\n{gen_txt}\n')
            
        diff = val1.iloc[i]['diff']
        print(f'Tgt diff:\n{diff}\n')
        
        doc_str = '\n'.join(_.split('doc'))
        print(f'Docs:\n{doc_str}')



---------- QUERY 256 ----------
X_t:
* sister cities of bishkek include colorado springs, colorado (1994) and meriden, connecticut (2005).

X_t+1:
* sister cities of bishkek include colorado springs, colorado (1994) and meriden, connecticut (2005). *ankara, turkey

Comment:	COM_SEP added a sister city based on list of sister cities

GENERATED:
Comment:	COM_SEP added a sister city based on list of sister cities

gen X_t+1:
* sister cities of bishkek include colorado springs, colorado (1994) and meriden, connecticut (2005). *ankara, turkey

Tgt diff:

*Ankara, Turkey

Docs:

0: bishkek is situated near the kazakhstan-kyrgyzstan border. its population was 1,074,075 in 2021.. sister cities. bishkek is twinned with: almaty, kazakhstan (1994) ankara, turkey (1992) ashgabat, turkmenistan (2018) colorado springs, united states (1994) doha, qatar (2014) gumi, south korea (1991) izmir, turkey (1994) kyiv 
1: amman, jordan ashgabat, turkmenistan astana, kazakhstan bangkok, thailand beijing, chi



---------- QUERY 85 ----------
X_t:
patrick richard henry wright, baron wright of richmond (born 28 june 1931) is a retired british diplomat and former head of hm diplomatic service. he sat in the house of lords as a crossbencher until his retirement on 17 december 2019.lord wright of richmond, parliament.uk, 18 december 2019

X_t+1:
patrick richard henry wright, baron wright of richmond (born 28 june 1931) is a retired british diplomat and former head of hm diplomatic service. he sat in the house of lords as a crossbencher from 10 february 1994 until his retirement on 17 december 2019.lord wright of richmond, parliament.uk, 18 december 2019

Comment:	COM_SEP /* top */ m

GENERATED:
Comment:	COM_SEP /* top */ m

gen X_t+1:
patrick richard henry wright, baron wright of richmond (born 28 june 1931) is a retired british diplomat and former head of hm diplomatic service. he sat in the house of lords as a crossbencher from 10 february 1994 until his retirement on 17 december 2019.lord wri



---------- QUERY 336 ----------
X_t:
german does not have an ablative case (but exceptionally, latin ablative case-forms were used from the 17th to the 19th century after some prepositions, for example after von in von dem nomine: ablative of the latin loanword nomen). grammarians at that time, such as justus georg schottel, kaspar von stieler ("der spate"), johann balthasar von antesperg and johann christoph gottsched, listed an ablative case (as the sixth case after nominative, genitive, dative, accusative and vocative) for german words. they arbitrarily considered the dative case after some prepositions to be an ablative, as in ("from the man" or "of the man") and ("with the man"), while they considered the dative case after other prepositions or without a preposition as to be a dative.

X_t+1:
german does not have an ablative case (but exceptionally, latin ablative case-forms were used from the 17th to the 19th century after some prepositions, for example after von in von dem nom



---------- QUERY 167 ----------
X_t:
telenovelas are popular in venezuela, and some venezuelan productions (such as 1992's cara sucia) are distributed internationally. perhaps the best known television show internationally is however president hugo chávez' weekly talk show aló presidente, which began in 1999. state television in venezuela has an unusually low audience share, of around 2% to 5%, although the government also makes regular use of cadenas (mandatory interruptions on all channels to show government broadcasts).

X_t+1:
telenovelas are popular in venezuela, and some venezuelan productions (such as 1992's cara sucia) are distributed internationally. perhaps the best known television show internationally is however president hugo chávez' weekly talk show aló presidente, which began in 1999 and ran with occasional breaks until 2012. state television in venezuela has an unusually low audience share, of around 2% to 5%, although the government also makes regular use of cadenas 



---------- QUERY 272 ----------
X_t:
*callisto, the mythological figure *callisto, the moon of jupiter this is a disambiguation page; that is, one that just points to other pages that might otherwise have the same name. if you followed a link here, you might want to go back and fix that link to point to the appropriate specific page.

X_t+1:
*callisto, the mythological figure *callisto, the moon of jupiter *callisto, the content management system this is a disambiguation page; that is, one that just points to other pages that might otherwise have the same name. if you followed a link here, you might want to go back and fix that link to point to the appropriate specific page.

Comment:	COM_SEP added cms link

GENERATED:
Comment:	COM_SEP added cms link

gen X_t+1:
*callisto, the mythological figure *callisto, the moon of jupiter *callisto, the content management system this is a disambiguation page; that is, one that just points to other pages that might otherwise have the same name. i



---------- QUERY 299 ----------
X_t:
'' cut of hkey_users if present.'this makes interoperating with regedit easier.'if len(sidstring) > 11 then if left(sidstring, 11) = "hkey_users" then sidstring = mid(sidstring, 12) end if end if'' open the wmi service and retrieve the sid'on error resume next set sidobject = getobject( _ "winmgmts:impersonationlevel=impersonate" _ ).get("win32_sid.sid='" & sidstring & "'") if err then msgbox "could not retrieve the sid.", vbokonly, "sorry" exit do end if (adapted from an example illustrating ways to create system administration tools.)

X_t+1:
'' cut off hkey_users if present.'this makes interoperating with regedit easier.'if len(sidstring) > 11 then if left(sidstring, 11) = "hkey_users" then sidstring = mid(sidstring, 12) end if end if'' open the wmi service and retrieve the sid'on error resume next set sidobject = getobject( _ "winmgmts:impersonationlevel=impersonate" _ ).get("win32_sid.sid='" & sidstring & "'") if err then msgbox "could not re

In [16]:
cc = '''Marc-André ter Stegen (born 30 April 1992) is a German professional footballer who plays as a goalkeeper for La Liga club Barcelona and the Germany national team. Known for his reflexes, passing, and ball-playing ability, he has been often nicknamed as the Berlin Wall because of his reflexes and ball control abilities as a goalkeeper'''
doc1 = """Marc-André ter Stegen signed for FC Barcelona in the summer of 2014 from Borussia Mönchengladbach. Born on 30 April 1992 in Mönchengladbach itself, the German did not take long to make his name as one Europe's most promising young goalkeepers."""
#doc2 = '''Marc-Andre Ter Stegen nationality is German. He was born in Monchengladbach, Germany. Ter Stegen grew up in a football loving family. He started kicking the ball, playing in his family's yard with his brother at the age of 2. When he turned 4, his grandfather enrolled him into Borussia Monchengladbach's academy.'''
#doc3 = '''Ter Stegen started his amateur career as a striker. According to himself he had a wonderful time during this period of his amateur career and enjoyed being the goal scorer of the team.'''
#doc4 = '''Ter Stegen was Monchengladbach's first-choice goalkeeper in his final three seasons with Gladbach, playing 39, 45 and 36 games respectively. In total, He played 127 games for Gladbach's first team, leaving behind an incredible stat of 45 clean sheets.'''




In [58]:
a = f'''TEXT_SEP {cc.lower()} DOCS_SEP doc0: {doc1.lower()}''' # ''' #doc1: {doc1.lower()} doc2: {doc1.lower()} doc3: {doc1.lower()}'''
b = 'COM_SEP when started to play in barcelona TEXT_SEP'



In [59]:
with torch.no_grad():
    src_ = tokenizer(
                      a,
                      max_length=512,
                      truncation=True,
                      return_attention_mask=False,
                     )
    b_tok = tokenizer(b, return_tensors = 'pt', padding=True, add_special_tokens = False)
    forced_decoder_ids = list(enumerate(b_tok, 1))
    
    generated = model_edit.generate(torch.tensor(src_['input_ids']).view(1,-1).to(device), 
                                                  num_beams=5, 
#                                                       pad_token_id=tokenizer.pad_token_id, 
#                                                       bos_token_id=tokenizer.bos_token_id, 
#                                                       eos_token_id=tokenizer.eos_token_id,
                                                  num_return_sequences=1,
                                                  decoder_input_ids=b_tok['input_ids'].to(device),
                                         max_length=512)
    generated = generated.cpu()

    src_text = tokenizer.decode(src_['input_ids'], skip_special_tokens=True)
    print(f'\n\n---------- QUERY {i} ----------')
    print(f'Src query: {src_text}')
    # print(f'Tgt query: {tgt_text}')
    print(f'Generated:')

    for j in range(1):
        to_gen = generated[j]
        gen_text = tokenizer.decode(to_gen, skip_special_tokens=True)
        print(f'   {j}: {gen_text}')



---------- QUERY 516 ----------
Src query: TEXT_SEP marc-andré ter stegen (born 30 april 1992) is a german professional footballer who plays as a goalkeeper for la liga club barcelona and the germany national team. known for his reflexes, passing, and ball-playing ability, he has been often nicknamed as the berlin wall because of his reflexes and ball control abilities as a goalkeeper DOCS_SEP doc0: marc-andré ter stegen signed for fc barcelona in the summer of 2014 from borussia mönchengladbach. born on 30 april 1992 in mönchengladbach itself, the german did not take long to make his name as one europe's most promising young goalkeepers.
Generated:
   0: COM_SEP when started to play in barcelona TEXT_SEP marc-andré ter stegen (born 30 april 1992) is a german professional footballer who plays as a goalkeeper for la liga club barcelona and the germany national team. known for his reflexes, passing, and ball-playing ability, he has been often nicknamed as the berlin wall because of his

In [60]:
com, txt = extract_com8text_from_tgt(gen_text)
src_txt, doc = extract_text8docs_from_src(src_text)

In [61]:
com, txt

('COM_SEP when started to play in barcelona',
 "marc-andré ter stegen (born 30 april 1992) is a german professional footballer who plays as a goalkeeper for la liga club barcelona and the germany national team. known for his reflexes, passing, and ball-playing ability, he has been often nicknamed as the berlin wall because of his reflexes and ball control abilities as a goalkeeper in barc he he was signed to fc barcelona, marc-andré ter stegen (born in mönchengladbach itself, the german did not take long to make his name as one europe's most promising young goalkeepers.")

In [62]:
src_txt, doc

('marc-andré ter stegen (born 30 april 1992) is a german professional footballer who plays as a goalkeeper for la liga club barcelona and the germany national team. known for his reflexes, passing, and ball-playing ability, he has been often nicknamed as the berlin wall because of his reflexes and ball control abilities as a goalkeeper',
 "doc0: marc-andré ter stegen signed for fc barcelona in the summer of 2014 from borussia mönchengladbach. born on 30 april 1992 in mönchengladbach itself, the german did not take long to make his name as one europe's most promising young goalkeepers.")