In [1]:
from pathlib import Path
import glob
import os
import sys
import csv
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize,wordpunct_tokenize
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder, TrigramAssocMeasures

In [2]:
# Make the project's helper modules importable, then pull in the helpers used below.
# NOTE(review): this is an absolute path into one user's home directory, so the
# notebook only runs on that machine — consider a path relative to the notebook.
sys.path.append("/home/lucas-jerusalimiec/Documents/OCR Text/Notebooks")
# Token cleaning, file writers, and n-gram tuple converters.
from tokenizer_func  import (wordcleaner, write_words_to_file, dictionary_to_file, convert_tuple_bigrams,
convert_tuple_trigrams)

# Preview and key-removal helpers for the per-text dictionaries.
from extra_token_func import print_first_n_items, remove_keys_from_nested_dict

# Tallies a list of n-gram strings.
from additional_token_func import convert_strings_to_counts

# Writes one file per dictionary entry, with a name suffix.
from dict_write import write_dict_to_files_with_suffix

In [3]:
# Input corpus: every .txt file directly inside ./final.
text_loc = Path("./final")
# glob returns matches in arbitrary, filesystem-dependent order; sort so the
# texts are processed (and printed) in the same order on every run and machine.
text_files = sorted(glob.glob(f"{text_loc}/*.txt"))

# All derived files are written under ./tokenized/, created here if missing.
output_folder = './tokenized/'
tokenized_folder = Path(output_folder)
tokenized_folder.mkdir(exist_ok=True)

print(output_folder)
print("Text files in the spellchecked directory:", text_files)

./tokenized/
Text files in the spellchecked directory: ['final/Démonomanie Repair_corrected.txt']


In [4]:
# Read every corpus file into memory, keyed by its file name without extension.
# NOTE: despite its name, `tokenized_texts` holds the raw, untokenized text;
# the name is kept because later cells read it.
tokenized_texts = {}
for txt in text_files:
    # Explicit UTF-8: the texts contain accented characters, and the platform
    # default encoding is not UTF-8 everywhere (e.g. on Windows).
    with open(txt, 'r', encoding='utf-8') as f:
        content = f.read()
    # os.path.basename already understands the platform's path separators, so
    # no manual splitting on a backslash is needed (on POSIX that split would
    # wrongly truncate a file name that happens to contain one).
    key = os.path.splitext(os.path.basename(txt))[0]
    tokenized_texts[key] = content
print("Raw texts:", list(tokenized_texts.keys()))

Raw texts: ['Démonomanie Repair_corrected']


In [5]:
# Tokenize each raw text with NLTK's wordpunct tokenizer and pass every token
# through wordcleaner, keeping one cleaned token list per text.
unigrams = {
    name: [wordcleaner(token) for token in wordpunct_tokenize(raw_text)]
    for name, raw_text in tokenized_texts.items()
}

# Save each token list under ./tokenized/, asking the writer for 20 words per line.
for name, tokens in unigrams.items():
    out_path = f"./tokenized/{name}.txt"
    write_words_to_file(tokens, out_path, words_per_line=20)
    print(f"Saved content for '{name}' to {out_path}")

Saved content for 'Démonomanie Repair_corrected' to ./tokenized/Démonomanie Repair_corrected.txt


In [6]:
# List the names of the texts that have a unigram token list, one per line.
print("Unigram texts:", *unigrams, sep="\n")

Unigram texts:
Démonomanie Repair_corrected


In [7]:
# Tally how often each cleaned token occurs, one Counter per text.
unigram_counts = {name: Counter(tokens) for name, tokens in unigrams.items()}

# Confirm which texts were counted.
print("Unigram Counts:", *unigram_counts, sep="\n")

Unigram Counts:
Démonomanie Repair_corrected


In [8]:
# Preview the first 25 entries of each text's unigram counter.
# NOTE(review): the recorded output is topped by an empty-string token
# (40772 occurrences) — presumably what wordcleaner returns for punctuation; confirm.
print_first_n_items(unigram_counts, 25)

First 25 items in Démonomanie Repair_corrected:
: 40772
de: 3351
que: 1896
la: 1678
qu: 1667
les: 1641
en: 1564
il: 1543
qui: 1511
à: 1281
d: 1278
l: 1260
le: 1233
des: 1102
eft: 897
a: 752
pour: 731
ne: 729
c: 725
n: 665
ce: 651
par: 649
on: 641
dieu: 640
vn: 616



In [9]:
# Load the stop-word list from a single comma-separated file.
# Explicit UTF-8 so accented stop words decode identically on every platform
# instead of depending on the locale's default encoding.
with open('./stop_words.csv', 'r', encoding='utf-8') as f:
    stopwords = f.read().strip().split(",")
# Show the tail of the list as a sanity check. It includes the empty string,
# which is what later removes the '' tokens from the counts.
stopwords[-10:]

['auoit', 'fa', 'eftoit', 'eftre', 'auoir', '', 'x', 'v', 'ee', 'p']

In [10]:
# Drop every stop-word key from each text's unigram counter; the result is
# bound to a new name, and the recorded preview below no longer lists stop words.
# NOTE(review): whether the helper also mutates `unigram_counts` is not visible here — confirm.
stripped_unigrams = remove_keys_from_nested_dict(unigram_counts, stopwords)

# Preview the first 25 remaining entries per text.
print_first_n_items(stripped_unigrams, 25)

First 25 items in Démonomanie Repair_corrected:
dieu: 640
comme: 462
plus: 412
sorciers: 355
bien: 298
livre: 261
faire: 232
autres: 221
diable: 210
di: 207
peut: 191
in: 189
tous: 177
dit: 175
point: 174
fur: 167
dela: 164
faut: 162
mort: 162
sathan: 160
quand: 158
dire: 149
tout: 146
re: 144
apres: 143



In [11]:
# Export the stop-word-filtered unigram counts; per the recorded output this
# saves "<text>_unigram_counts.csv" in output_folder.
dictionary_to_file(stripped_unigrams, output_folder, 'unigram_counts')

Saved Démonomanie Repair_corrected_unigram_counts.csv in ./tokenized/


In [12]:
# Build the bigram sequence of each text after dropping stop words.
# NOTE: because stop words are removed first, a "bigram" here can pair words
# that were separated by stop words in the original text.
stopword_set = set(stopwords)  # constant-time membership instead of scanning the list per token

bigrams = {}
for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopword_set]
    bigrams[key] = list(nltk.bigrams(unigram_list))

print("Bigrams:")
for key in bigrams:
    print(key)

Bigrams:
Démonomanie Repair_corrected


In [13]:
# Convert each text's bigram tuples to strings and count them; both steps are
# delegated to project helpers.
bigram_counts = {
    name: convert_strings_to_counts(convert_tuple_bigrams(pairs))
    for name, pairs in bigrams.items()
}

# Confirm which texts were counted.
print("Bigram Counts:", *bigram_counts, sep="\n")

Bigram Counts:
Démonomanie Repair_corrected


In [14]:
# Preview the first 25 bigram counts per text.
print_first_n_items(bigram_counts, 25)

First 25 items in Démonomanie Repair_corrected:
livre second: 45
livre premier: 41
loy dieu: 35
malins esprits: 28
quelque chose: 24
cas pareil: 24
livre qvatriesme: 23
comme dit: 21
comme di: 20
plus grand: 19
livre troisiesme: 19
malings esprits: 17
cy dessus: 17
bien fort: 16
peut faire: 15
cy apres: 14
dieu comme: 14
comme dict: 14
mil cinq: 14
encores plus: 13
faut donc: 13
autres choses: 13
faire mourir: 13
cinq cens: 13
livre troisieme: 13



In [15]:
# Export the bigram counts; per the recorded output this saves
# "<text>_bigram_counts.csv" in output_folder.
dictionary_to_file(bigram_counts, output_folder, 'bigram_counts')

Saved Démonomanie Repair_corrected_bigram_counts.csv in ./tokenized/


In [16]:
# Build the trigram sequence of each text after dropping stop words.
# NOTE: because stop words are removed first, a "trigram" here can join words
# that were separated by stop words in the original text.
stopword_set = set(stopwords)  # constant-time membership instead of scanning the list per token

trigrams = {}
for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopword_set]
    trigrams[key] = list(nltk.trigrams(unigram_list))

print("Trigrams:")
# Report the keys of `trigrams` itself; the loop previously iterated over
# `bigrams`, a copy-paste slip that only worked because the keys coincide.
for key in trigrams:
    print(key)

Trigrams:
Démonomanie Repair_corrected


In [17]:
# Convert each text's trigram tuples to strings and count them; both steps are
# delegated to project helpers.
trigram_counts = {
    name: convert_strings_to_counts(convert_tuple_trigrams(triples))
    for name, triples in trigrams.items()
}

# Confirm which texts were counted.
print("Trigram Counts:", *trigram_counts, sep="\n")

Trigram Counts:
Démonomanie Repair_corrected


In [18]:
# Preview the first 25 trigram counts per text.
print_first_n_items(trigram_counts, 25)

First 25 items in Démonomanie Repair_corrected:
mil cinq cens: 12
an mil cinq: 8
dirons cy apres: 7
plus forte raifon: 6
dela loy dieu: 6
comme cas pareil: 6
prince dela mirande: 5
paruenir quelque chose: 5
livre morbo facro: 5
deux iours apres: 4
plus grands sorciers: 4
ie mettray point: 4
monftré cy dessus: 4
bruflee toute viue: 4
cinq cens cinquante: 4
ceste ville laon: 4
tefmoing fans reproche: 4
deux trois heures: 3
quel esprit dieu: 3
plus deteftables sorciers: 3
comme auons dict: 3
fait bien noter: 3
comme dirons cy: 3
dela ville laon: 3
defendu parlaloy dieu: 3



In [19]:
# Export the trigram counts; per the recorded output this saves
# "<text>_trigram_counts.csv" in output_folder.
dictionary_to_file(trigram_counts, output_folder, 'trigram_counts')

Saved Démonomanie Repair_corrected_trigram_counts.csv in ./tokenized/


In [20]:
# Find the strongest bigram collocations in each text, with stop words removed.
stopword_set = set(stopwords)  # constant-time membership instead of scanning the list per token

colloc_dict = {}
for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopword_set]
    bigram_finder = BigramCollocationFinder.from_words(unigram_list)
    bigram_finder.apply_freq_filter(5)   # keep only bigrams that occurred at least 5 times
    # Up to 500 of the remaining bigrams, ranked by pointwise mutual information.
    collocations = bigram_finder.nbest(BigramAssocMeasures.pmi, 500)
    colloc_dict[key] = collocations

print("Collocations:")
for key in colloc_dict:
    print(key)

Collocations:
Démonomanie Repair_corrected


In [21]:
# Convert each text's collocation tuples to strings and tally them.
# NOTE(review): the recorded preview shows a count of 1 for every entry, so
# these are not corpus frequencies — presumably because the n-best list holds
# each collocation once; confirm this is the intended export.
colloc_counts = {}

for key, value in colloc_dict.items():
    string_bigrams = convert_tuple_bigrams(value)
    # Only the counts are stored; the intermediate string list is not kept.
    colloc_counts[key] = convert_strings_to_counts(string_bigrams)

print("Collocation Counts:")
for key in colloc_counts:
    print(key)

Collocation Counts:
Démonomanie Repair_corrected


In [22]:
# Preview the first 25 collocation entries per text (every recorded count is 1).
print_first_n_items(colloc_counts, 25)

First 25 items in Démonomanie Repair_corrected:
morbo facro: 1
iefus chrift: 1
paul grilland: 1
thomas aquin: 1
images cire: 1
dia ble: 1
genre humain: 1
prefomptions violentes: 1
grecs latins: 1
demeurent accord: 1
ville laon: 1
caufes naturelles: 1
ef prit: 1
mal caduc: 1
bons mauuais: 1
cas pareil: 1
figure humaine: 1
cy deffus: 1
mil cinq: 1
long temps: 1
aage douze: 1
quei ay: 1
cens cinquante: 1
cinq cens: 1
mefme autheur: 1



In [23]:
# Export the collocation table; per the recorded output this saves
# "<text>_collocation_counts.csv" in output_folder.
dictionary_to_file(colloc_counts, output_folder, 'collocation_counts')

Saved Démonomanie Repair_corrected_collocation_counts.csv in ./tokenized/


In [24]:
# Trigram collocations: rank them per text, convert to strings, preview, export.
stopword_set = set(stopwords)  # constant-time membership instead of scanning the list per token

trigram_measures_dict = {}
for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopword_set]
    trigram_finder = TrigramCollocationFinder.from_words(unigram_list)
    trigram_finder.apply_freq_filter(2)   # keep only trigrams that occurred at least 2 times
    # Up to 25 of the remaining trigrams, ranked by pointwise mutual information.
    collocations = trigram_finder.nbest(TrigramAssocMeasures.pmi, 25)
    trigram_measures_dict[key] = collocations

# NOTE(review): the recorded preview shows a count of 1 for every entry, so
# these are not corpus frequencies — presumably because the n-best list holds
# each trigram once; confirm this is the intended export.
trigram_colloc_counts = {}

for key, value in trigram_measures_dict.items():
    string_trigrams = convert_tuple_trigrams(value)
    # Only the counts are stored; the intermediate string list is not kept.
    trigram_colloc_counts[key] = convert_strings_to_counts(string_trigrams)

print_first_n_items(trigram_colloc_counts, 25)

dictionary_to_file(trigram_colloc_counts, output_folder, 'trigram_collocation_counts')

First 25 items in Démonomanie Repair_corrected:
naturam adit pana: 1
bouuin bailly chafteau: 1
felin ye nies: 1
pres villiers cofterets: 1
cognoift veué œil: 1
vol oy feaux: 1
pana non attraxerit: 1
adam martin procureur: 1
adit pana non: 1
frappé maladie foudaine: 1
aueugle pendu paris: 1
bruflées toutes viues: 1
vingt quatre heures: 1
iln befoin efcrire: 1
confirmatif dela fentence: 1
maladies venues fortileges: 1
maiftre adam martin: 1
procureur roy ribemont: 1
diminuer nombre mefchans: 1
iour feurier mil: 1
feurier mil cinq: 1
ennemys genre humain: 1
és crimes atroces: 1
creé ciel laterre: 1
magdeleine dela croix: 1

Saved Démonomanie Repair_corrected_trigram_collocation_counts.csv in ./tokenized/


In [25]:
# Rewrite each token stream so that every adjacent pair that is a known
# collocation becomes a single "w1_w2" token.
# NOTE(review): the collocations were found on the stop-word-filtered tokens,
# but the scan below runs over the unfiltered `unigrams`, so pairs that were
# only adjacent after stop-word removal will never be merged — confirm intent.
underscore_dict = {}
for key, tokenized_words in unigrams.items():
    # A set makes the per-position pair lookup constant-time instead of a scan
    # over the whole collocation list. A text with no collocation entry simply
    # gets no merges rather than raising on None.
    collocations = set(colloc_dict.get(key) or ())

    colloc_words = []

    # Walk the tokens left to right, greedily merging collocation pairs.
    i = 0
    last = len(tokenized_words) - 1
    while i < last:
        pair = (tokenized_words[i], tokenized_words[i + 1])
        if pair in collocations:
            # Merge the pair and skip past both of its words.
            colloc_words.append('_'.join(pair))
            i += 2
        else:
            # No collocation starts here; keep the word and advance by one.
            colloc_words.append(tokenized_words[i])
            i += 1

    # The loop stops one short of the end: emit the final token unless it was
    # already consumed as the second half of a merged pair.
    if i == last:
        colloc_words.append(tokenized_words[i])
    underscore_dict[key] = colloc_words

print("Underscore Dictionary:")
for key in underscore_dict:
    print(key)

Underscore Dictionary:
Démonomanie Repair_corrected


In [26]:
# Write each text's collocation-merged token list to disk; per the recorded
# output the file is "<text>_underscore_bigrams.txt" in output_folder.
write_dict_to_files_with_suffix(underscore_dict, output_folder, 'underscore_bigrams')

Démonomanie Repair_corrected_underscore_bigrams.txt in ./tokenized/
