# Find Duplicate Documents
in this notebook we will use the Text_Deduplicater utility class to:
* find problematic texts in a single corpus
* compare texts for different corpora 
* build a large corpus by merging several distinct corpora and excluding duplicate documents. 

In [1]:
%load_ext autoreload
%autoreload 2

In [39]:
import logging 
import random
import os
import re
import pickle
from tqdm import tqdm_notebook
from cltk.corpus.readers import get_corpus_reader

In [3]:
import sys
import inspect
from pathlib import Path 
currentdir = Path.cwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 
from mlyoucanuse.text_deduplicater import TextDeduplicater

In [4]:
logging.basicConfig(level=logging.INFO)

In [5]:
# You may need to install the tesserae corpora
# from cltk.corpus.utils.importer import CorpusImporter
# corpus_importer = CorpusImporter('latin')
# #corpus_importer.list_corpora
# corpus_importer.import_corpus('latin_text_tesserae')

## Load our Corpus readers

In [6]:
perseus_latin_reader = get_corpus_reader(corpus_name='latin_text_perseus', language='latin')
latin_library_reader = get_corpus_reader(corpus_name='latin_text_latin_library', language='latin')
tesserae_reader = get_corpus_reader(corpus_name='latin_text_tesserae', language='latin')

## Create our Text Deduplicator

In [7]:
deduper = TextDeduplicater()

## Simple proof of the Dedupe functionality
Add Caesar docs and pick one of the files at random and add it as a duplicate

In [8]:
caesar = [file for  file in latin_library_reader._fileids if 'caesar' in file]
print(caesar)
deduper = TextDeduplicater()
for file in tqdm_notebook(caesar):
    text = list(latin_library_reader.docs( file ))[0]
    deduper.add_document(file, text)
dupe_file = random.choice(caesar)
text = list(latin_library_reader.docs(dupe_file ))[0]
deduper.add_document(dupe_file, text)
print(f'Unique doc names: {deduper.get_unique_doc_names()}')
print(f'Duplicate doc names: {deduper.get_possible_duplicate_doc_names()}')

['caesar/alex.txt', 'caesar/bc1.txt', 'caesar/bc2.txt', 'caesar/bc3.txt', 'caesar/bellafr.txt', 'caesar/gall1.txt', 'caesar/gall2.txt', 'caesar/gall3.txt', 'caesar/gall4.txt', 'caesar/gall5.txt', 'caesar/gall6.txt', 'caesar/gall7.txt', 'caesar/gall8.txt', 'caesar/hisp.txt', 'suetonius/suet.caesar.txt', 'xylander/caesar.txt']


HBox(children=(IntProgress(value=0, max=16), HTML(value='')))


Unique doc names: ['caesar/gall8.txt', 'caesar/hisp.txt', 'caesar/gall1.txt', 'caesar/bc3.txt', 'suetonius/suet.caesar.txt', 'xylander/caesar.txt', 'caesar/gall5.txt', 'caesar/bc2.txt', 'caesar/bc1.txt', 'caesar/alex.txt', 'caesar/bellafr.txt', 'caesar/gall7.txt', 'caesar/gall6.txt', 'caesar/gall3.txt', 'caesar/gall2.txt', 'caesar/gall4.txt']
Duplicate doc names: [('caesar/bc3.txt', 'caesar/bc3.txt')]


## Let's check the whole corpus contents
if a file has warnings about no trigrams found, it's an indicator that the file is empty.

In [9]:
# Reset the duplicator
deduper = TextDeduplicater()

for file in tqdm_notebook(latin_library_reader._fileids):
    text = list(latin_library_reader.docs(file))[0]
    deduper.add_document(file, text)

HBox(children=(IntProgress(value=0, max=2141), HTML(value='')))






## Surprise, some problematic files just in one Corpus

In [10]:
latin_lib_problematic_files = deduper.get_possible_duplicate_doc_names()
latin_lib_problematic_files

[('lucan/lucan7.txt', 'lucan/lucan8.txt'),
 ('albertanus/albertanus.sermo4.txt', 'albertanus/albertanus.sermo3.txt'),
 ('albertanus/albertanus.sermo1.txt', 'albertanus/albertanus.sermo3.txt'),
 ('albertanus/albertanus.sermo2.txt', 'albertanus/albertanus.sermo1.txt'),
 ('albertanus/albertanus.sermo1.txt', 'albertanus/albertanus.sermo4.txt'),
 ('albertanus/albertanus.sermo2.txt', 'albertanus/albertanus.sermo4.txt'),
 ('albertanus/albertanus.sermo3.txt', 'albertanus/albertanus.sermo2.txt')]

## Let's check their similarity scores

In [11]:
print('Duplicate documents and similarity scores')
for doc_one, doc_two in latin_lib_problematic_files:
    print(doc_one, doc_two,  
    deduper.calculate_similarity( 
    list(latin_library_reader.docs(doc_one))[0],
    list(latin_library_reader.docs(doc_two))[0]))

Duplicate documents and similarity scores
lucan/lucan7.txt lucan/lucan8.txt 0.9987385682749921
albertanus/albertanus.sermo4.txt albertanus/albertanus.sermo3.txt 1.0
albertanus/albertanus.sermo1.txt albertanus/albertanus.sermo3.txt 1.0
albertanus/albertanus.sermo2.txt albertanus/albertanus.sermo1.txt 1.0
albertanus/albertanus.sermo1.txt albertanus/albertanus.sermo4.txt 1.0
albertanus/albertanus.sermo2.txt albertanus/albertanus.sermo4.txt 1.0
albertanus/albertanus.sermo3.txt albertanus/albertanus.sermo2.txt 1.0


## Note: the reported duplicate files are actually errors in the corpus
The Lucan file is nearly the same file twice, somehow.

In [20]:
for file_one, file_two in latin_lib_problematic_files:
    print(re.sub('\s+',' ', list(latin_library_reader.docs(file_one))[0][:200]))
    print(re.sub('\s+',' ', list(latin_library_reader.docs(file_two))[0][:200]))

Lucan Liber VII M. ANNAEI LVCANI BELLI CIVILIS LIBER SEPTIMVS Segnior, Oceano quam lex aeterna uocabat, luctificus Titan numquam magis aethera contra egit equos cursumque p
Lucan Liber VIII M. ANNAEI LVCANI BELLI CIVILIS LIBER OCTAVVS Segnior, Oceano quam lex aeterna uocabat, luctificus Titan numquam magis aethera contra egit equos cursumque
Albertano of Brescia [an error occurred while processing this directive] The Classics Page
Albertano of Brescia [an error occurred while processing this directive] The Classics Page
Albertano of Brescia [an error occurred while processing this directive] The Classics Page
Albertano of Brescia [an error occurred while processing this directive] The Classics Page
Albertano of Brescia [an error occurred while processing this directive] The Classics Page
Albertano of Brescia [an error occurred while processing this directive] The Classics Page
Albertano of Brescia [an error occurred while processing this directive] The Classics Page
Albertano of Bresci

## Let's merge the tuples so we can skip these later

In [33]:
files1, files2 = zip(* latin_lib_problematic_files)
latin_lib_problematic_files = files1 + files2
latin_lib_problematic_files

('lucan/lucan7.txt',
 'albertanus/albertanus.sermo4.txt',
 'albertanus/albertanus.sermo1.txt',
 'albertanus/albertanus.sermo2.txt',
 'albertanus/albertanus.sermo1.txt',
 'albertanus/albertanus.sermo2.txt',
 'albertanus/albertanus.sermo3.txt',
 'lucan/lucan8.txt',
 'albertanus/albertanus.sermo3.txt',
 'albertanus/albertanus.sermo3.txt',
 'albertanus/albertanus.sermo1.txt',
 'albertanus/albertanus.sermo4.txt',
 'albertanus/albertanus.sermo4.txt',
 'albertanus/albertanus.sermo2.txt')

## Let's take a closer look at the Latin library and Perseus corpora by looking for and comparing a shared file

In [21]:
latin_lib_plautus_file = [tmp for tmp in latin_library_reader._fileids if 'plautus' in tmp and 'menaechmi' in tmp]
latin_lib_plautus_file

['plautus/menaechmi.txt']

In [22]:
perseus_plautus_file = [tmp for tmp in perseus_latin_reader._fileids if 'plautus' in tmp and 'menaechmi' in tmp]
perseus_plautus_file

['plautus-titus-maccius__menaechmi__latin.json']

In [25]:
perseus_plautus_text = ''.join(list(perseus_latin_reader.paras(perseus_plautus_file)))
latin_lib_plautus_text = list(latin_library_reader.docs(latin_lib_plautus_file))[0]
tmp_deduper = TextDeduplicater()
tmp_deduper.add_document(perseus_plautus_file[0], perseus_plautus_text)
tmp_deduper.add_document(latin_lib_plautus_file[0], latin_lib_plautus_text)
tmp_deduper.get_possible_duplicate_doc_names(threshold=0.4)

[('plautus-titus-maccius__menaechmi__latin.json', 'plautus/menaechmi.txt')]

In [29]:
latin_lib_plautus_text[:400]

'Plautus: Menaechmi\n\t\t \n\t\t \n\t\t \n\t \n\t\n \n T. MACCI PLAVTI MENAECHMI \n\n \n\n PERSONAE \n \n\n PENICVLVS PARASITVS \nMENAECHMVS \nMENAECHMVS (SOSICLES) \nEROTIUM MERETRIX \nCYLINDRUS COCVS \nMESSENIO SERVVS \nANCILLA \nMATRONA \nSENEX \nMEDICVS\n \n\n ARGVMENTVM \n \n\n Mercator Siculus, quoi erant gemini filii, \nEi surrupto altero mors optigit. \nNomen surrepticii illi indit qui domist \nAvos paternus, facit Maenaechmum e '

In [30]:
perseus_plautus_text[:400]

'Salutem primum iam a principio propitiammihi atque vobis, spectatores, nuntio.atque adeo hoc argumentum graecissat, tamenita est adulescens; ipsus escae maxumaeQua de re aut cuius rei rerum omnium?Me neque isti male fecisse mulieri, quae me arguithanc domo ab se surrupuisse atque abstulisse deierat.sí ego intra aedis huius umquam, ubi habitat, penetravi pedem ,omnium hominum exopto ut fiam miseror'

## The Perseus corpus isn't satisfactory, so let's try merging the Latin library corpus and the Tesserae corpus

In [34]:
# Reset the duplicator
deduper = TextDeduplicater()

for file in tqdm_notebook(tesserae_reader._fileids):
    text = list(tesserae_reader.texts(file))[0]
    if len(text) > 0:
        deduper.add_document(file, text)    
for file in tqdm_notebook(latin_library_reader._fileids):
    if file not in latin_lib_problematic_files:
        text = list(latin_library_reader.docs(file))[0]
        if len(text) > 0:
            deduper.add_document(file, text)

HBox(children=(IntProgress(value=0, max=762), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2141), HTML(value='')))




## Find out which files are duplicate
Note: of course this does not exclude any files were several texts are put together.

In [46]:
dupes = deduper.get_possible_duplicate_doc_names(threshold=0.8)
# Filter on Tess dupes, since we already processed the lat lib ones
single_files_to_exclude = [file_one for file_one, file_two in dupes if file_one.endswith('.tess')
                          ] + [file_two for file_one, file_two in dupes if file_two.endswith('.tess')] 
print(len(single_files_to_exclude))
single_files_to_exclude

228


['texts/cicero.tusculanae_disputationes.part.4.tess',
 'texts/sallust.catilina.tess',
 'texts/jerome.vulgate.part.24.song_of_songs.tess',
 'texts/cicero.philippicae.part.12.tess',
 'texts/silius_italicus.punica.part.7.tess',
 'texts/silius_italicus.punica.part.5.tess',
 'texts/caesar.de_bello_gallico.part.7.tess',
 'texts/caesar.de_bello_gallico.part.8.tess',
 'texts/cicero.letters_to_atticus.part.2.tess',
 'texts/jerome.vulgate.part.36.obadiah.tess',
 'texts/columella.de_re_rustica.part.4.tess',
 'texts/cicero.epistulae_ad_familiares.part.5.ad_metellum_et_ceteros.tess',
 'texts/silius_italicus.punica.part.14.tess',
 'texts/jerome.vulgate.part.45.1_maccabees.tess',
 'texts/cicero.letters_to_atticus.part.9.tess',
 'texts/ammianus.rerum_gestarum.part.18.tess',
 'texts/seneca.de_beneficiis.part.1.tess',
 'texts/tacitus.annales.part.13.tess',
 'texts/plautus.stichus.tess',
 'texts/quintus_smyrnaeus.fall_of_troy.part.13.tess',
 'texts/jerome.vulgate.part.35.amos.tess',
 'texts/cicero.letter

## Now we can use these distinct files to build a single larger corpus

In [42]:
additional_tess_files = [tmp for tmp in deduper.get_unique_doc_names() if tmp.endswith('.tess')]
print(len(additional_tess_files))
additional_tess_files

761


['texts/terence.eunuchus.tess',
 'texts/jerome.vulgate.part.31.ezekiel.tess',
 'texts/cicero.de_oratore.part.1.tess',
 'texts/cicero.letters_to_atticus.part.7.tess',
 'texts/jerome.vulgate.part.27.isaiah.tess',
 'texts/marcus_mincuius_felix.octavius.tess',
 'texts/quintus_smyrnaeus.fall_of_troy.part.11.tess',
 'texts/ovid.amores.part.1.tess',
 'texts/ovid.heroides.part.2.16-21.tess',
 'texts/livy.ab_urbe_condita.part.1.books_1-10.tess',
 'texts/pliny_the_younger.letters.part.10.tess',
 'texts/seneca.de_beneficiis.part.2.tess',
 'texts/pliny_the_elder.naturalis_historia.part.6.books_26-30.tess',
 'texts/pseudo_quintilian.major_declamations.tess',
 'texts/seneca_the_elder.controversiae.part.10.tess',
 'texts/pliny_the_elder.naturalis_historia.part.1.books_1-5.tess',
 'texts/jerome.epistulae.tess',
 'texts/jerome.vulgate.part.28.jeremiah.tess',
 'texts/jerome.vulgate.part.4.numbers.tess',
 'texts/statius.silvae.part.5.tess',
 'texts/jerome.vulgate.part.22.proverbs.tess',
 'texts/macrobius

In [43]:
len(deduper.hash_data)

2889

In [47]:
files_to_use = [ tmp for tmp in tesserae_reader._fileids if tmp not in single_files_to_exclude]
print(len(files_to_use))
files_to_use

534


['texts/ammianus.rerum_gestarum.part.15.tess',
 'texts/ammianus.rerum_gestarum.part.16.tess',
 'texts/ammianus.rerum_gestarum.part.20.tess',
 'texts/ammianus.rerum_gestarum.part.21.tess',
 'texts/ammianus.rerum_gestarum.part.22.tess',
 'texts/ammianus.rerum_gestarum.part.23.tess',
 'texts/ammianus.rerum_gestarum.part.26.tess',
 'texts/ammianus.rerum_gestarum.part.27.tess',
 'texts/ammianus.rerum_gestarum.part.28.tess',
 'texts/ammianus.rerum_gestarum.part.29.tess',
 'texts/ammianus.rerum_gestarum.part.30.tess',
 'texts/ammianus.rerum_gestarum.part.31.tess',
 'texts/anonymous.laudes_domini.tess',
 'texts/apuleius.apologia.tess',
 'texts/apuleius.florida.tess',
 'texts/apuleius.metamorphoses.part.10.tess',
 'texts/apuleius.metamorphoses.part.2.tess',
 'texts/apuleius.metamorphoses.part.5.tess',
 'texts/apuleius.metamorphoses.part.7.tess',
 'texts/apuleius.metamorphoses.part.9.tess',
 'texts/aristotle.economics_book_3.tess',
 'texts/augustine.de_doctrina_christiana.part.4.tess',
 'texts/a

In [50]:
list(tesserae_reader.texts('texts/vergil.aeneid.part.2.tess'))[0][:200]

'Conticuere omnes, intentique ora tenebant.\nInde toro pater Aeneas sic orsus ab alto:\nInfandum, regina, iubes renovare dolorem,\nTroianas ut opes et lamentabile regnum\neruerint Danai; quaeque ipse miser'