In [1]:
import re
import os
import torch
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from transformers import T5TokenizerFast, T5ForConditionalGeneration
from tqdm.notebook import tqdm
from collections import Counter
from torch import nn
from catalyst import dl
from langdetect import detect
from nltk.tokenize import sent_tokenize, word_tokenize

DOCS_DIR = 'data/final_data/documents'
PAGES_DIR = 'data/final_data/revision'
CHUNKS = sorted(os.listdir(DOCS_DIR))

In [2]:
total = 0
db_dict = {'obj_id': [], 'old_text': [], 'new_text': [], 'comment': [], 'docs': [], 'diff': [],
           'title': [], 'search_queries': [], 'counter_found_docs': [], 'section_name': []}
for chunk in CHUNKS:
    dataset_ids = list(map(lambda x: x.split('.')[0], os.listdir(f"{DOCS_DIR}/{chunk}")))
    for dataset_obj_id in tqdm(dataset_ids):
        total += 1
        with open(f"{PAGES_DIR}/{chunk}/{dataset_obj_id}.json", 'r', encoding='utf-8') as f:
            page_json = json.load(f)
        docs_text = ''
        added_docs = set()
        count_docs = 0
        with open(f"{DOCS_DIR}/{chunk}/{dataset_obj_id}.txt", 'r', encoding='utf-8') as f:
            docs_text_plain = f.read()

        diff = '\n'.join(page_json['change_texts'][0][0])
        db_dict['diff'].append(diff)
        db_dict['obj_id'].append(dataset_obj_id)
        db_dict['old_text'].append(page_json['old_text'])
        db_dict['new_text'].append(page_json['new_text'])
        db_dict['comment'].append(page_json['comment'])
        db_dict['docs'].append(docs_text_plain)
        db_dict['title'].append(page_json['title'])
        db_dict['search_queries'].append(page_json['search_queries'])
        db_dict['counter_found_docs'].append(page_json['counter_found_docs'])
        db_dict['section_name'].append(page_json['section_name'])

HBox(children=(FloatProgress(value=0.0, max=5639.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4139.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1189.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3299.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1340.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4121.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2749.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1308.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6644.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1197.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2775.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8251.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2883.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2851.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2673.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3799.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=184.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7552.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5715.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9036.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=69.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=53.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6663.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2155.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1272.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1126.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6725.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3574.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6237.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5105.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1101.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9004.0), HTML(value='')))




In [3]:
df = pd.DataFrame.from_dict(db_dict)
print(df.shape)

(120720, 10)


In [4]:
df.head()

Unnamed: 0,obj_id,old_text,new_text,comment,docs,diff,title,search_queries,counter_found_docs,section_name
0,3571,"__NOTOC__\n\nThe Pacific Coast Marine Firemen,...","__NOTOC__\n\nThe Pacific Coast Marine Firemen,...",edited links.,"The Pacific Coast Marine Firemen, Oilers, Wate...",", commonly\n, is an",Marine Firemen's Union,"[Marine Firemen's Union , commonly, Marine Fir...","[29, 23]",
1,14256,"Soon after its premiere, Martin Stein wrote ""S...","Soon after its premiere, Martin Stein wrote ""S...",/* Reception */ Added LAT.,The show opens with Siren leader Sin Cinnamon ...,The Los Angeles Times advised readers to avoid...,Sirens of TI,[Sirens of TI Reception The Los Angeles Times ...,[29],Reception
2,17990,"The film was released to DVD on June 12, 2007....","The film was released to DVD on June 12, 2007....",/* Release */ fixes,Romeo Juliet: Sealed with a Kiss is a 2006 Am...,praising its amount of \nonus content for a\nc...,Romeo & Juliet: Sealed with a Kiss,[Romeo & Juliet: Sealed with a Kiss Home media...,"[29, 12, 20]",Home media
3,1870,This template will categorise articles into :C...,This template will categorise articles into :C...,/* Usage */ added date,Listed below are several options for usage. { ...,date=August 2010}}\n\n{{hoax|section|date=Augu...,Hoax/doc,[Hoax/doc Usage date=August 2010}}\n\n{{hoax|s...,[29],Usage
4,10259,Dallasaurus is a basal mosasauroid from the Up...,"Dallasaurus (""Dallas lizard"") is a basal mosas...",Adding a definition to the lead.,Dallasaurus (Dallas lizard) is a basal mosasau...,"(""Dallas lizard"") is a",Dallasaurus,"[Dallasaurus (""Dallas lizard"") is a]",[18],


In [5]:
titles = df.title.unique()
len(titles)

35682

In [6]:
df.iloc[3]['old_text']



In [7]:
df.iloc[3]['new_text']



## TF-IDF Learning

In [8]:
docs_corpus = []
for row_idx, row in df.iterrows():
    docs = row['old_text']
    sents = sent_tokenize(docs)
    docs_corpus += sents
    

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5), max_df=0.9)
vectorizer.fit(docs_corpus)

TfidfVectorizer(analyzer='char_wb', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=None,
                min_df=1, ngram_range=(3, 5), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [10]:
len(vectorizer.vocabulary_)

1646280

## Filter revisions

In [11]:
def is_good(row):
    if 'link' in row['comment'] and 'fix' in row['comment']:
        return False
    if '|' in row['old_text'] or '|' in row['new_text']:
        return False
    if 'this template' in row['old_text']:
        return False
    if 'image' in row['comment']:
        return False
    if '* \n*' in row['old_text'] or '*\n*' in row['old_text'] or '*  \n*' in row['old_text']:
        return False
    if row['old_text'].count('*') > 8:
        return False
    if 'http' in row['diff'] or 'http://' in row['old_text'] or 'http://' in row['new_text']:
        return False
    if 'https' in row['diff'] or 'https://' in row['old_text'] or 'https://' in row['new_text']:
        return False
    if 'log in' in row['comment'] or 'sorry' in row['comment']:
        return False
    if ':Note:' in row['old_text'] or ':()' in row['old_text']:
        return False
    return True
        

In [12]:
df['is_good'] = df.apply(lambda x: is_good(x), axis=1)

In [13]:
df = df.loc[df['is_good'] == True]
df.shape

(87000, 11)

## Watch docs

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import string 
    
# Storing the sets of punctuation in variable result 
punct_symbols = string.punctuation

In [15]:
df_sample = df.sample(30)
for row_idx, row in df_sample.iterrows():
    old = row['old_text']
    new = row['new_text']
    com = row['comment']
    diff = row['diff']
    docs = row['docs']
    
    diff_arr = diff.split('\n')
    diff_arr = [txt for txt in diff_arr if len(txt) > 20]
    
    final_docs = []
    for doc in docs.split('\n\nDOC_DELIMITER_TOKEN\n\n'):
        texts = doc.split('...')
        final_sents = []
        for txt in texts:
            if len(txt) > 15:
                if txt[-1] in punct_symbols:
                    final_sents.append(txt.strip()[:-1])
                else:
                    final_sents.append(txt.strip())
        final_text = '. '.join(final_sents)
        if final_text not in final_docs:
            final_docs.append(final_text)
    final_docs = np.array(final_docs)
    
    vectorized_com = vectorizer.transform([com])
    vectorized_docs = vectorizer.transform(final_docs)
    
    cs = np.abs(cosine_similarity(vectorized_com, vectorized_docs))[0]
    if len(diff_arr) > 0:
        vectorized_diffs = vectorizer.transform(diff_arr)
        cs2 = np.max(np.abs(cosine_similarity(vectorized_diffs, vectorized_docs)), axis=0)
        cs += cs2
    idxs = np.argsort(cs)[-5:][::-1]
    docs = '\n\n'.join(final_docs[idxs])
    
    print(f'----------------------- NEW DATASET OBJECT ({row["title"]} ---- {row["section_name"]})-----------------------\n')
    print(f"OLD TEXT:\n\n{old}\n\nNEW TEXT:\n\n{new}\n\nDIFF:\t{diff}\n\nComment:\t{com}\n\nDOCS:\n{docs}\nMetrics:\t{cs[idxs]}")

----------------------- NEW DATASET OBJECT (Electronarcosis ---- Livestock)-----------------------

OLD TEXT:

Electronarcosis is one of the  methods used to render animals unconscious before slaughter and unable to feel pain. Electronarcosis may be followed immediately by electrocution or by bleeding.

Modern electronarcosis is typically performed by applying 200 Volts of high frequency alternating current of about 1500 Hertz for 3 seconds to the animals head.  High frequency is not felt as an electric shock and does not cause skeletal muscle contractions.  A wet animal will pass a current of over an Ampere.  If electronarcosis is not followed by other procedures, the animal will usually recover.

Studies have been used to determine parameters for effective electronarcosis.

NEW TEXT:

Electronarcosis is one of the  methods used to render animals unconscious before slaughter and unable to feel pain. Electronarcosis may be followed immediately by electrocution or by bleeding.

Modern e

----------------------- NEW DATASET OBJECT (History of the Jews in Belgium ---- The Holocaust)-----------------------

OLD TEXT:

Just before the Second World War, the Jewish community of Belgium was at a peak of roughly 100,000 Jews (with concentrations of 55,000 in Antwerp and 35,000 in Brussels). Some 20,000 of this number were German-Jewish refugees. Belgium was occupied by Nazi Germany for the vast majority of the war, and a number of anti-Semitic policies were adopted in Belgium. Many Belgian Jews were taken to concentration camps, primarily Auschwitz. The Committee for Jewish Defence, which worked with the national resistance movement, was the largest Jewish defence movement in Belgium during the war. All told, some 25,000 Belgian Jews perished between 1942 and 1945.

NEW TEXT:

Just before the Second World War, the Jewish community of Belgium was at a peak of roughly 100,000 Jews (with concentrations of 55,000 in Antwerp and 35,000 in Brussels). Some 20,000 of this number were 

----------------------- NEW DATASET OBJECT (Georgian Stock Exchange ---- )-----------------------

OLD TEXT:

The Georgian Stock Exchange (Georgian: "saqarTvelos safondo birJa") is the principal stock exchange in the country of Georgia.  It was created by the "Joint Stock Company Georgian Stock Exchange Charter" which was registered and approved in 1999.  It is located in the capital city of Tbilisi and its abbreviation in English is GeSE.

NEW TEXT:

The Georgian Stock Exchange (, literally "Georgian Stock Exchange Market") is the principal stock exchange in the country of Georgia.  It was created by the "Joint Stock Company Georgian Stock Exchange Charter" which was registered and approved in 1999.  It is located in the capital city of Tbilisi and its abbreviation in English is GeSE.

DIFF:	, literally
Georgian Stock Exchange Market

Comment:	standardize

DOCS:
The Georgian Stock Exchange ( Georgian: საქართველოს საფონდო ბირჟა, literally Georgian Stock Exchange Market) is the principa

----------------------- NEW DATASET OBJECT (Calendar ---- Future reform)-----------------------

OLD TEXT:

There have been a number of proposals for reform of the calendar, such as the World calendar and International Fixed Calendar.  The United Nations considered adopting such a reformed calendar for a while in the 1950s, but these proposals have lost most of their popularity.

NEW TEXT:

There have been a number of proposals for reform of the calendar, such as the World calendar and International Fixed Calendar.  The United Nations considered adopting such a reformed calendar for a while in the 1950s, but these proposals have lost most of their popularity. Holocene calendar is another one for counting years.

DIFF:	 Holocene calendar is another one for counting years.

Comment:	/* Future reform */ add

DOCS:
An environmental history. 2. Auflage. Oxford 1998, ISBN -631-18638-7. This idea is not new. You may have heard about Holocene Calendar - a calendar reform proposal made in 1993 

----------------------- NEW DATASET OBJECT (Southwest Airlines Flight 1380 ---- Aftermath)-----------------------

OLD TEXT:

The FAA issued a ground stop for aircraft on the ground planning to depart for Philadelphia until shortly before 2 p.m. Eastern Time. On the day of the incident, the continuation of flight 1380 from Dallas was renumbered as flight 8874.

NEW TEXT:

The FAA issued a ground stop for aircraft on the ground planning to depart for Philadelphia until shortly before 2 p.m. Eastern Time. On the day of the incident, the continuation of flight 1380 from Dallas was renumbered as flight 8874, and continued using a different Boeing 737 aircraft.

DIFF:	, and continued using a different Boeing 737 aircraft.

Comment:	/* Aftermath */ add

DOCS:
Nov 22, 2019 — After parts from a fan blade failure on the left engine of a Southwest Airlines Boeing 737 pierced the airplane's cabin causing one fatality

On April 17, 2018, 12 minutes after departure from LaGuardia, the Boeing 737-7H

----------------------- NEW DATASET OBJECT (Ahab ---- )-----------------------

OLD TEXT:

Ahab or Ach'av or Achab in Douay-Rheims (; ; ) was king of Israel and the son and successor of Omri.

Ahab became king of Israel in the thirty-eighth year of Asa, king of Judah, and reigned for twenty-two years.  William F. Albright dated his reign to 869 – 850 BC, while E. R. Thiele offered the dates 874 – 853 BC.Edwin Thiele, The Mysterious Numbers of the Hebrew Kings, (1st ed.; New York: Macmillan, 1951;  2d ed.; Grand Rapids: Eerdmans, 1965; 3rd ed.; Grand Rapids: Zondervan/Kregel, 1983). ISBN 082543825X, 9780825438257

NEW TEXT:

Ahab or Ach'av or Achab in Douay-Rheims (; ; ) was king of Israel and the son and successor of Omri according to the Hebrew Bible.

Ahab became king of Israel in the thirty-eighth year of Asa, king of Judah, and reigned for twenty-two years.  William F. Albright dated his reign to 869 – 850 BC, while E. R. Thiele offered the dates 874 – 853 BC.Edwin Thiele, The Myst

## Docs preparing

In [16]:
NUM_DOCS = 6
def process_docs(row):
    old = row['old_text']
    new = row['new_text']
    com = row['comment']
    diff = row['diff']
    docs = row['docs']
    
    diff_arr = diff.split('\n')
    diff_arr = [txt for txt in diff_arr if len(txt) > 20]
    
    final_docs = []
    for doc in docs.split('\n\nDOC_DELIMITER_TOKEN\n\n'):
        texts = doc.split('...')
        final_sents = []
        for txt in texts:
            if len(txt) > 15:
                if txt[-1] in punct_symbols:
                    final_sents.append(txt.strip()[:-1])
                else:
                    final_sents.append(txt.strip())
        final_text = '. '.join(final_sents)
        if final_text not in final_docs:
            final_docs.append(final_text)
    final_docs = np.array(final_docs)
    
    vectorized_com = vectorizer.transform([com])
    vectorized_docs = vectorizer.transform(final_docs)
    
    cs = np.abs(cosine_similarity(vectorized_com, vectorized_docs))[0]
    if len(diff_arr) > 0:
        vectorized_diffs = vectorizer.transform(diff_arr)
        cs2 = np.max(np.abs(cosine_similarity(vectorized_diffs, vectorized_docs)), axis=0)
        cs += cs2
    idxs = np.argsort(cs)[-NUM_DOCS:][::-1]
    docs_text = ''
    for doc_idx, doc in enumerate(idxs):
        doc_text = final_docs[doc_idx]
        docs_text += f'DOC{doc_idx}: {doc_text} '
    docs_text = docs_text.strip()    
    return docs_text

In [17]:
process_docs(df.iloc[1])



In [18]:
df['docs_processed'] = df.apply(lambda x: process_docs(x), axis=1)

In [19]:
df.head()

Unnamed: 0,obj_id,old_text,new_text,comment,docs,diff,title,search_queries,counter_found_docs,section_name,is_good,docs_processed
0,3571,"__NOTOC__\n\nThe Pacific Coast Marine Firemen,...","__NOTOC__\n\nThe Pacific Coast Marine Firemen,...",edited links.,"The Pacific Coast Marine Firemen, Oilers, Wate...",", commonly\n, is an",Marine Firemen's Union,"[Marine Firemen's Union , commonly, Marine Fir...","[29, 23]",,True,"DOC0: The Pacific Coast Marine Firemen, Oilers..."
1,14256,"Soon after its premiere, Martin Stein wrote ""S...","Soon after its premiere, Martin Stein wrote ""S...",/* Reception */ Added LAT.,The show opens with Siren leader Sin Cinnamon ...,The Los Angeles Times advised readers to avoid...,Sirens of TI,[Sirens of TI Reception The Los Angeles Times ...,[29],Reception,True,DOC0: The show opens with Siren leader Sin Cin...
2,17990,"The film was released to DVD on June 12, 2007....","The film was released to DVD on June 12, 2007....",/* Release */ fixes,Romeo Juliet: Sealed with a Kiss is a 2006 Am...,praising its amount of \nonus content for a\nc...,Romeo & Juliet: Sealed with a Kiss,[Romeo & Juliet: Sealed with a Kiss Home media...,"[29, 12, 20]",Home media,True,DOC0: Romeo Juliet: Sealed with a Kiss is a 2...
4,10259,Dallasaurus is a basal mosasauroid from the Up...,"Dallasaurus (""Dallas lizard"") is a basal mosas...",Adding a definition to the lead.,Dallasaurus (Dallas lizard) is a basal mosasau...,"(""Dallas lizard"") is a",Dallasaurus,"[Dallasaurus (""Dallas lizard"") is a]",[18],,True,DOC0: Dallasaurus (Dallas lizard) is a basal m...
5,5712,The Vaiphei are an ethnic group who inhabit th...,The Vaiphei are an ethnic group who inhabit th...,"rv unexplained edit contradicting source, but...","Colonel J. Shakespeare (1887-1905), the first ...","Lt. Colonel J. Shakespeare (1887–1905), the fi...",Vaiphei people,[Vaiphei people Lt. Colonel J. Shakespeare (18...,"[28, 26]",,True,"DOC0: Colonel J. Shakespeare (1887-1905), the ..."


In [20]:
df.docs_processed.iloc[0]

'DOC0: The Pacific Coast Marine Firemen, Oilers, Watertenders and Wipers Association, commonly referred to as the Marine Firemens Union, represents unlicensed merchant mariners who work in the engine department aboard a variety of ships. The union was formed October 1883 in San Francisco, California. The founding members were firemen on Pacif MISSION DOC1: The Pacific Coast Marine Firemen, Oilers, Watertenders and Wipers Association (MFOW), commonly referred to as the Marine Firemens Union, is an American labor union of mariners working aboard U.S. flag vessels. [1] The Marine Firemens Union is an affiliate union of the Seafarers International Union of North America AFL-CIO  DOC2: The Pacific Coast Marine Firemen, Oilers, Watertenders and Wipers Association , commonly referred to as the Marine Firemens Union, is an American labor union of mariners working aboard U.S. flag vessels.[1] The Marine Firemens Union is an affiliate union of the Seafarers International Union of North America A

## Train / test / val split

In [21]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.1, random_state=42)
print(train.shape, test.shape)

(78300, 12) (8700, 12)


In [22]:
train, val = train_test_split(train, test_size=test.shape[0], random_state=42)
print(train.shape, test.shape, val.shape)

(69600, 12) (8700, 12) (8700, 12)


In [23]:
columns = df.columns.values
idx2colname = {k:v for k, v in enumerate(columns)}

with open("data/column_mapper.json", "w") as outfile:
    json.dump(idx2colname, outfile)

In [24]:
train.to_json(r'data/train.json', orient='values')
test.to_json(r'data/test.json', orient='values')
val.to_json(r'data/val.json', orient='values')

In [52]:
mp = json.load(open(r"data/column_mapper.json"))
a = pd.read_json(r'data/train.json').head()

In [56]:
a.set_axis(mp.values(), axis='columns', inplace=True)

In [57]:
a.head()

Unnamed: 0,obj_id,old_text,new_text,comment,docs,diff,title,search_queries,counter_found_docs,section_name,is_good,docs_processed
0,3571,"__NOTOC__\n\nThe Pacific Coast Marine Firemen,...","__NOTOC__\n\nThe Pacific Coast Marine Firemen,...",edited links.,"The Pacific Coast Marine Firemen, Oilers, Wate...",", commonly\n, is an",Marine Firemen's Union,"[Marine Firemen's Union , commonly, Marine Fir...","[29, 23]",,True,"DOC0: The Pacific Coast Marine Firemen, Oilers..."
1,14256,"Soon after its premiere, Martin Stein wrote ""S...","Soon after its premiere, Martin Stein wrote ""S...",/* Reception */ Added LAT.,The show opens with Siren leader Sin Cinnamon ...,The Los Angeles Times advised readers to avoid...,Sirens of TI,[Sirens of TI Reception The Los Angeles Times ...,[29],Reception,True,DOC0: The show opens with Siren leader Sin Cin...
2,17990,"The film was released to DVD on June 12, 2007....","The film was released to DVD on June 12, 2007....",/* Release */ fixes,Romeo Juliet: Sealed with a Kiss is a 2006 Am...,praising its amount of \nonus content for a\nc...,Romeo & Juliet: Sealed with a Kiss,[Romeo & Juliet: Sealed with a Kiss Home media...,"[29, 12, 20]",Home media,True,DOC0: Romeo Juliet: Sealed with a Kiss is a 2...
3,10259,Dallasaurus is a basal mosasauroid from the Up...,"Dallasaurus (""Dallas lizard"") is a basal mosas...",Adding a definition to the lead.,Dallasaurus (Dallas lizard) is a basal mosasau...,"(""Dallas lizard"") is a",Dallasaurus,"[Dallasaurus (""Dallas lizard"") is a]",[18],,True,DOC0: Dallasaurus (Dallas lizard) is a basal m...
4,5712,The Vaiphei are an ethnic group who inhabit th...,The Vaiphei are an ethnic group who inhabit th...,"rv unexplained edit contradicting source, but...","Colonel J. Shakespeare (1887-1905), the first ...","Lt. Colonel J. Shakespeare (1887–1905), the fi...",Vaiphei people,[Vaiphei people Lt. Colonel J. Shakespeare (18...,"[28, 26]",,True,"DOC0: Colonel J. Shakespeare (1887-1905), the ..."
