In [1]:
import re
import os
import torch
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from transformers import T5TokenizerFast, T5ForConditionalGeneration
from tqdm.notebook import tqdm
from collections import Counter
from torch import nn
from catalyst import dl
from langdetect import detect
from nltk.tokenize import sent_tokenize, word_tokenize

DOCS_DIR = 'data/final_data/documents'
PAGES_DIR = 'data/final_data/revision'
CHUNKS = sorted(os.listdir(DOCS_DIR))

In [2]:
total = 0
db_dict = {'obj_id': [], 'old_text': [], 'new_text': [], 'comment': [], 'docs': [], 'diff': [],
           'title': [], 'search_queries': [], 'counter_found_docs': [], 'section_name': []}
for chunk in CHUNKS:
    dataset_ids = list(map(lambda x: x.split('.')[0], os.listdir(f"{DOCS_DIR}/{chunk}")))
    for dataset_obj_id in tqdm(dataset_ids):
        total += 1
        with open(f"{PAGES_DIR}/{chunk}/{dataset_obj_id}.json", 'r', encoding='utf-8') as f:
            page_json = json.load(f)
        docs_text = ''
        added_docs = set()
        count_docs = 0
        with open(f"{DOCS_DIR}/{chunk}/{dataset_obj_id}.txt", 'r', encoding='utf-8') as f:
            docs_text_plain = f.read()

        diff = '\n'.join(page_json['change_texts'][0][0])
        db_dict['diff'].append(diff)
        db_dict['obj_id'].append(dataset_obj_id)
        db_dict['old_text'].append(page_json['old_text'])
        db_dict['new_text'].append(page_json['new_text'])
        db_dict['comment'].append(page_json['comment'])
        db_dict['docs'].append(docs_text_plain)
        db_dict['title'].append(page_json['title'])
        db_dict['search_queries'].append(page_json['search_queries'])
        db_dict['counter_found_docs'].append(page_json['counter_found_docs'])
        db_dict['section_name'].append(page_json['section_name'])

HBox(children=(FloatProgress(value=0.0, max=5639.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4139.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1189.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3299.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1340.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4121.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2749.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1308.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6644.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1197.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2775.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8251.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2883.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2851.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2673.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3799.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=184.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7552.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5715.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9036.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=69.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=53.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6663.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2155.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1272.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1126.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6725.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3574.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6237.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5105.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1101.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9004.0), HTML(value='')))




In [3]:
df = pd.DataFrame.from_dict(db_dict)
print(df.shape)

(120720, 10)


In [4]:
df.head()

Unnamed: 0,obj_id,old_text,new_text,comment,docs,diff,title,search_queries,counter_found_docs,section_name
0,3571,"__NOTOC__\n\nThe Pacific Coast Marine Firemen,...","__NOTOC__\n\nThe Pacific Coast Marine Firemen,...",edited links.,"The Pacific Coast Marine Firemen, Oilers, Wate...",", commonly\n, is an",Marine Firemen's Union,"[Marine Firemen's Union , commonly, Marine Fir...","[29, 23]",
1,14256,"Soon after its premiere, Martin Stein wrote ""S...","Soon after its premiere, Martin Stein wrote ""S...",/* Reception */ Added LAT.,The show opens with Siren leader Sin Cinnamon ...,The Los Angeles Times advised readers to avoid...,Sirens of TI,[Sirens of TI Reception The Los Angeles Times ...,[29],Reception
2,17990,"The film was released to DVD on June 12, 2007....","The film was released to DVD on June 12, 2007....",/* Release */ fixes,Romeo Juliet: Sealed with a Kiss is a 2006 Am...,praising its amount of \nonus content for a\nc...,Romeo & Juliet: Sealed with a Kiss,[Romeo & Juliet: Sealed with a Kiss Home media...,"[29, 12, 20]",Home media
3,1870,This template will categorise articles into :C...,This template will categorise articles into :C...,/* Usage */ added date,Listed below are several options for usage. { ...,date=August 2010}}\n\n{{hoax|section|date=Augu...,Hoax/doc,[Hoax/doc Usage date=August 2010}}\n\n{{hoax|s...,[29],Usage
4,10259,Dallasaurus is a basal mosasauroid from the Up...,"Dallasaurus (""Dallas lizard"") is a basal mosas...",Adding a definition to the lead.,Dallasaurus (Dallas lizard) is a basal mosasau...,"(""Dallas lizard"") is a",Dallasaurus,"[Dallasaurus (""Dallas lizard"") is a]",[18],


In [5]:
titles = df.title.unique()
len(titles)

35682

In [6]:
df.iloc[3]['old_text']



In [7]:
df.iloc[3]['new_text']



## Filter revisions

In [12]:
def is_good(row):
    if 'link' in row['comment'] and 'fix' in row['comment']:
        return False
    if '|' in row['old_text']:
        return False
    if 'this template' in row['old_text']:
        return False
    if 'image' in row['comment']:
        return False
    if '* \n*' in row['old_text'] or '*\n*' in row['old_text'] or '*  \n*' in row['old_text']:
        return False
    if row['old_text'].count('*') > 8:
        return False
    if 'http' in row['diff']:
        return False
    return True
        

In [13]:
df['is_good'] = df.apply(lambda x: is_good(x), axis=1)

In [14]:
df = df.loc[df['is_good'] == True]
df.shape

(92589, 11)

## Watch docs

In [34]:
docs_corpus = []
for row_idx, row in df.iterrows():
    docs = row['old_text']
    sents = sent_tokenize(docs)
    docs_corpus += sents
    

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5), max_df=0.9)
vectorizer.fit(docs_corpus)

TfidfVectorizer(analyzer='char_wb', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=None,
                min_df=1, ngram_range=(3, 5), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [36]:
len(vectorizer.vocabulary_)

1339139

In [38]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [49]:
df_sample = df.sample(30)
for row_idx, row in df_sample.iterrows():
    old = row['old_text']
    new = row['new_text']
    com = row['comment']
    diff = row['diff']
    docs = row['docs']
    
    diff_arr = diff.split('\n')
    diff_arr = [txt for txt in diff_arr if len(txt) > 20]
    
    final_docs = []
    for doc in docs.split('\n\nDOC_DELIMITER_TOKEN\n\n'):
        texts = doc.split('...')
        final_sents = []
        for txt in texts:
            if len(txt) > 15:
                if txt[-1] == '.':
                    final_sents.append(txt[:-1])
        final_text = '. '.join(final_sents)
        if final_text not in final_docs:
            final_docs.append(final_text)
    final_docs = np.array(final_docs)
    
    vectorized_com = vectorizer.transform([com])
    vectorized_docs = vectorizer.transform(final_docs)
    
    cs = np.abs(cosine_similarity(vectorized_com, vectorized_docs))[0]
    if len(diff_arr) > 0:
        vectorized_diffs = vectorizer.transform(diff_arr)
        cs2 = np.max(np.abs(cosine_similarity(vectorized_diffs, vectorized_docs)), axis=0)
        cs += cs2
    idxs = np.argsort(cs)[-5:][::-1]
    docs = '\n\n'.join(final_docs[idxs])
    
    print('----------------------- NEW DATASET OBJECT -----------------------\n')
    print(f"OLD TEXT:\n\n{old}\n\nNEW TEXT:\n\n{new}\n\nDIFF:\t{diff}\n\nComment:\t{com}\n\nDOCS:\n{docs}\nMetrics:\t{cs[idxs]}")

----------------------- NEW DATASET OBJECT -----------------------

OLD TEXT:

This engine is considered a reverse flow 90 degree head V8 engine.  The exhaust gases are released internally to the valley of the engine to go straight into the turbo.  This corrects a common issue of long up pipe routing and it's durability.  The exhaust gases boost intake air through the turbocharger which then travels to a water to air charge air cooler that is different from normal air to air coolers and much smaller in size.  The intake air is then returned through the intake manifold into the heads which flow around to the outside of the head towards the intake valves.

NEW TEXT:

This engine is considered a reverse flow 90 degree head V8 engine.  The exhaust gases are released internally to the valley of the engine to go straight into the turbo.  This corrects a common issue of long up pipe routing and it's durability.  The exhaust gases boost intake air through the turbocharger which then travels to

----------------------- NEW DATASET OBJECT -----------------------

OLD TEXT:

George and the Big Bang is a 2014 children's book written by Stephen and Lucy Hawking. The book is the fourth book in the George series, following George's Secret Key to the Universe, George's Cosmic Treasure Hunt and George and the Big Bang.

NEW TEXT:

George and the Unbreakable Code is a 2014 children's book written by Stephen and Lucy Hawking. The book is the fourth book in the George series, following George's Secret Key to the Universe, George's Cosmic Treasure Hunt and George and the Big Bang.

DIFF:	Unbreakable Code is a

Comment:	Corrected user error

DOCS:
 George and the Unbreakable Code

George and the Unbreakable Code as it's meant to be heard, narrated by Roy McMillan, Sophie Aldred. Discover the English Audiobook at Audible

George and the Unbreakable Code is written by Stephen Hawking; Lucy Hawking and published by Simon & Schuster Books For Young Readers

George And The Unbreakable Code ; Au

----------------------- NEW DATASET OBJECT -----------------------

OLD TEXT:

Duval is the first person whom Paul kills in hand-to-hand combat.  As a civilian, Duval was a French printer, married and with a child.  When Duval dives into a sodden shell hole occupied by Paul while retreating from a failed attack, Paul stabs Duval three times in the chest.  As they sit, trapped together in a shell hole in No-Man's Land, Duval dies slowly and Paul experiences profound remorse, eventually swearing to devote his life to Duval's family.  Kropp and Kat later comfort Paul by noting the joyful abandon with which snipers kill many times each day, and Paul resolves to repress the encounter with Duval with the other horrors he has endured.

NEW TEXT:

Duval is the first person whom Paul kills in hand-to-hand combat. As a civilian, Duval was a French printer, married and with a child. When Duval dives into a sodden shell hole occupied by Paul while retreating from a failed attack, Paul stabs Duval 

----------------------- NEW DATASET OBJECT -----------------------

OLD TEXT:

Born to a Jewish family, Messing fled from Germany to the USSR before World War II. He claimed that his abilities came to the attention of Joseph Stalin. According to Messing, he was able to broadcast mental suggestions in order to alter people's perceptions.
he died in ?

NEW TEXT:

Born to a Jewish family, Messing fled from Germany to the USSR before World War II. He claimed that his abilities came to the attention of Joseph Stalin. According to Messing, he was able to broadcast mental suggestions in order to alter people's perceptions.
he died in November 8, 1974.

DIFF:	November 8, 1974.

Comment:	/* Biography */ Brought into agreement with earlier in article

DOCS:
 had arrived [in Vienna] from Zürich, where he taught, in November 1913 

 all over the immense country, until his death on November 8, 1974

 He died on 8 November 1974 in Moscow

 It is known for certain that Wolf died on November 8, 1974 i

----------------------- NEW DATASET OBJECT -----------------------

OLD TEXT:

Sade can mean:,  Sade Adu, female singer, or the eponymous group she fronts, Sade
* Donatien Alphonse François, Marquis de Sade, the eighteenth century aristocrat, writer and libertine
* Sade (2000), a French film starring Daniel Auteuil as the Marquis de Sade
* Sade Baderinwa, the WABC-TV Eyewitness News reporter and anchor
* Tsadi, a letter  of the Aramaic alphabet
* Sade (clan), a clan of Somalia

de:Sade
es:Sade
fr:Sade
it:Sade

NEW TEXT:

Sade can mean:,  Sade Adu, female singer, or the eponymous group she fronts, Sade
* Donatien Alphonse François, Marquis de Sade, the eighteenth century aristocrat, writer and libertine
* Sade (movie) (2000), a French film starring Daniel Auteuil as the Marquis de Sade
* Sade Baderinwa, the WABC-TV Eyewitness News reporter and anchor
* Tsadi, a letter  of the Aramaic alphabet
* Sade (clan), a clan of Somalia

de:Sade
es:Sade
fr:Sade
it:Sade

DIFF:	 (movie)

Comment:	wik