In [1]:
from pathlib import Path
import glob
import os
import sys
import csv
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize,wordpunct_tokenize
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder, TrigramAssocMeasures

In [2]:
sys.path.append("/home/lucas-jerusalimiec/Documents/OCR Text/Notebooks")
from tokenizer_func  import (wordcleaner, write_words_to_file, dictionary_to_file, convert_tuple_bigrams,
convert_tuple_trigrams)

from extra_token_func import print_first_n_items, remove_keys_from_nested_dict

from additional_token_func import convert_strings_to_counts

from dict_write import write_dict_to_files_with_suffix

In [3]:
text_loc = Path("./final")
text_files = glob.glob(f"{text_loc}/*.txt")
output_folder = './tokenized/'
tokenized_folder = Path(output_folder)
tokenized_folder.mkdir(exist_ok=True)

output_unigram = f'{output_folder}unigram_counts'
unigram_folder = Path(output_unigram)
unigram_folder.mkdir(exist_ok=True)

output_bigram = f'{output_folder}bigram_counts'
bigram_folder = Path(output_bigram)
bigram_folder.mkdir(exist_ok=True)

output_trigram = f'{output_folder}trigram_counts'
trigram_folder = Path(output_trigram)
trigram_folder.mkdir(exist_ok=True)

output_collocation = f'{output_folder}collocation_counts'
collocation_folder = Path(output_collocation)
collocation_folder.mkdir(exist_ok=True)

output_trigram_collocation = f'{output_folder}trigram_collocation_counts'
trigram_collocation_folder = Path(output_trigram_collocation)
trigram_collocation_folder.mkdir(exist_ok=True)

output_underscore = f'{output_folder}underscore_bigrams'
underscore_folder = Path(output_underscore)
underscore_folder.mkdir(exist_ok=True)

output_trigram_underscore = f'{output_folder}underscore_trigrams'
trigram_underscore_folder = Path(output_trigram_underscore)
trigram_underscore_folder.mkdir(exist_ok=True)

print("Text files in the spellchecked directory:", text_files)

Text files in the spellchecked directory: ["final/Memoires d'État Refuge_corrected.txt", 'final/Memoire - Namur_corrected.txt', "final/Memoires d'état_corrected.txt", 'final/Memoire au roi_corrected.txt', 'final/Memoire - le but_corrected.txt']


In [4]:
# Open stopwords CSV file and list the contents
with open('./stop_words.csv', 'r') as f:
    stopwords = f.read().strip().split(",")
stopwords[-10:]

['aaa', 'lzs', 'ar', 'ara', 'bl', 'ua', 'rar', 'rr', 'iir', 'tett']

In [5]:
tokenized_texts = {}
for txt in text_files:
    with open(txt, 'r') as f:
        content = f.read()
        file_name = txt.split('\\')[-1]
        #key = file_name.split('.')[0]
        key = os.path.splitext(os.path.basename(file_name))[0]
        tokenized_texts[key] = content
print("Raw texts:", list(tokenized_texts.keys()))       

Raw texts: ["Memoires d'État Refuge_corrected", 'Memoire - Namur_corrected', "Memoires d'état_corrected", 'Memoire au roi_corrected', 'Memoire - le but_corrected']


In [6]:
unigrams = {}

for key, value in tokenized_texts.items():
    unigram_list = wordpunct_tokenize(value)
    cleanwords = [wordcleaner(w) for w in unigram_list]
    unigrams[key] = cleanwords

for key, value in unigrams.items():
    filename = f"./tokenized/{key}.txt"
    write_words_to_file(value, filename, words_per_line=20)
    print(f"Saved content for '{key}' to {filename}")

Saved content for 'Memoires d'État Refuge_corrected' to ./tokenized/Memoires d'État Refuge_corrected.txt
Saved content for 'Memoire - Namur_corrected' to ./tokenized/Memoire - Namur_corrected.txt
Saved content for 'Memoires d'état_corrected' to ./tokenized/Memoires d'état_corrected.txt
Saved content for 'Memoire au roi_corrected' to ./tokenized/Memoire au roi_corrected.txt
Saved content for 'Memoire - le but_corrected' to ./tokenized/Memoire - le but_corrected.txt


In [7]:
print("Unigram texts:")
for key in unigrams:
    print(key)

Unigram texts:
Memoires d'État Refuge_corrected
Memoire - Namur_corrected
Memoires d'état_corrected
Memoire au roi_corrected
Memoire - le but_corrected


In [8]:
# Count up the tokens using a Counter() object
unigram_counts = {}
for key, value in unigrams.items():
    unigram_counts_dict = Counter(value)
    unigram_counts[key] = unigram_counts_dict

print("Unigram Counts:")
for key in unigram_counts:
    print(key)

print_first_n_items(unigram_counts, 25)

Unigram Counts:
Memoires d'État Refuge_corrected
Memoire - Namur_corrected
Memoires d'état_corrected
Memoire au roi_corrected
Memoire - le but_corrected
First 25 items in Memoires d'État Refuge_corrected:
: 582
de: 114
et: 93
d: 61
que: 56
l: 50
la: 48
en: 45
les: 41
qui: 40
à: 39
qu: 39
il: 36
le: 34
des: 33
ne: 31
n: 29
est: 27
plus: 23
estat: 21
ung: 21
conseil: 20
pour: 19
du: 18
estre: 18

First 25 items in Memoire - Namur_corrected:
: 243
de: 88
et: 35
le: 24
à: 22
la: 18
qui: 16
comte: 16
ledict: 16
d: 14
duché: 14
en: 13
duc: 13
lorraine: 11
les: 11
du: 10
par: 10
l: 9
son: 9
henry: 9
dudict: 9
feut: 9
brabant: 9
namur: 8
empereur: 8

First 25 items in Memoires d'état_corrected:
: 9040
de: 2169
et: 1559
le: 1024
d: 678
roy: 640
à: 575
la: 574
les: 527
l: 509
en: 505
du: 473
que: 432
par: 361
au: 355
cent: 280
pour: 279
des: 250
il: 245
duc: 239
mil: 237
qu: 222
an: 213
quatre: 198
état: 197

First 25 items in Memoire au roi_corrected:
: 405
et: 59
de: 59
le: 28
que: 28
en: 26
q

In [9]:
# Remove specified keys from the dictionary
stripped_unigrams = remove_keys_from_nested_dict(unigram_counts, stopwords)

print_first_n_items(stripped_unigrams, 30)

First 30 items in Memoires d'État Refuge_corrected:
plus: 23
estat: 21
conseil: 20
estre: 18
fault: 14
affaires: 14
bien: 14
faire: 12
si: 12
leurs: 10
ceulx: 9
aultres: 9
raisons: 9
sans: 8
aussy: 8
conseiller: 8
comme: 8
moins: 7
moyen: 7
poinct: 7
adviz: 7
mémoires: 6
doibvent: 6
expérience: 6
peult: 6
nécessaire: 6
aultre: 6
estant: 6
fortune: 6
quelques: 6

First 30 items in Memoire - Namur_corrected:
comte: 16
ledict: 16
duché: 14
duc: 13
lorraine: 11
henry: 9
dudict: 9
brabant: 9
namur: 8
empereur: 8
othon: 8
bauldoin: 7
comté: 6
depuis: 6
roy: 6
fils: 6
france: 6
frére: 6
louvain: 6
cause: 5
lequel: 5
charles: 5
flandres: 4
mémoires: 4
état: 4
fille: 4
godefroy: 4
force: 3
luxembourg: 3
sœur: 3

First 30 items in Memoires d'état_corrected:
roy: 640
cent: 280
duc: 239
mil: 237
an: 213
quatre: 198
état: 197
mémoires: 194
ledict: 173
comte: 144
charles: 122
vingt: 120
trois: 116
cinq: 105
comté: 101
deux: 99
traicté: 94
mille: 94
faict: 93
dudict: 90
audict: 90
hommaige: 85
france

In [10]:
dictionary_to_file(stripped_unigrams, unigram_folder, 'unigram_counts')

Saved Memoires d'État Refuge_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Memoire - Namur_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Memoires d'état_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Memoire au roi_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Memoire - le but_corrected_unigram_counts.csv in tokenized/unigram_counts


In [11]:
bigrams = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopwords]
    #unigram_list = [word for word in value]
    bigrams_list = list(nltk.bigrams(unigram_list))
    bigrams[key] = bigrams_list

print("Bigrams:")
for key in bigrams:
    print(key)

Bigrams:
Memoires d'État Refuge_corrected
Memoire - Namur_corrected
Memoires d'état_corrected
Memoire au roi_corrected
Memoire - le but_corrected


In [12]:
bigram_counts = {}

for key, value in bigrams.items():
    string_bigrams = convert_tuple_bigrams(value)
    bigramCount = convert_strings_to_counts(string_bigrams)
    bigram_counts[key] = bigramCount

print("Bigram Counts:")
for key in bigram_counts:
    print(key)

print_first_n_items(bigram_counts, 30)

Bigram Counts:
Memoires d'État Refuge_corrected
Memoire - Namur_corrected
Memoires d'état_corrected
Memoire au roi_corrected
Memoire - le but_corrected
First 30 items in Memoires d'État Refuge_corrected:
conseiller estat: 5
conseil estat: 4
doibvent estre: 3
quelques ungs: 3
celles conseillent: 2
peult estre: 2
estre homme: 2
peult avoir: 2
ceulx seroient: 2
plus saiges: 2
grandes affaires: 2
ceulx cy: 2
si bien: 2
telz genz: 2
estat doibt: 2
quelques fois: 2
mal propoz: 2
affaires où: 2
envers aultres: 2
ailleurs estat: 2
estat leurs: 2
moyen vivre: 2
vivre ailleurs: 2
fault conseiller: 2
fault estre: 2
estre si: 2
raisons ceulx: 2
faire entendre: 2
entendre raisons: 2
plus grande: 2

First 30 items in Memoire - Namur_corrected:
ledict duché: 8
ledict comté: 5
comté namur: 4
mémoires état: 4
duc lorraine: 3
duché lorraine: 3
guerres cause: 2
comte henry: 2
bauldoin comte: 2
comte flandres: 2
empereur constantinople: 2
bauldoin ii: 2
ii empereur: 2
empereur henry: 2
hugues capet: 2
pre

In [13]:
dictionary_to_file(bigram_counts, bigram_folder, 'bigram_counts')

Saved Memoires d'État Refuge_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Memoire - Namur_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Memoires d'état_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Memoire au roi_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Memoire - le but_corrected_bigram_counts.csv in tokenized/bigram_counts


In [14]:
trigrams = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopwords]
    #unigram_list = [word for word in value]
    trigrams_list = list(nltk.trigrams(unigram_list))
    trigrams[key] = trigrams_list

print("Trigrams:")
for key in bigrams:
    print(key)

Trigrams:
Memoires d'État Refuge_corrected
Memoire - Namur_corrected
Memoires d'état_corrected
Memoire au roi_corrected
Memoire - le but_corrected


In [15]:
trigram_counts = {}

for key, value in trigrams.items():
    string_trigrams = convert_tuple_trigrams(value)
    trigramCount = convert_strings_to_counts(string_trigrams)
    trigram_counts[key] = trigramCount

print("Trigram Counts:")
for key in trigram_counts:
    print(key)
    
print_first_n_items(trigram_counts, 30)

Trigram Counts:
Memoires d'État Refuge_corrected
Memoire - Namur_corrected
Memoires d'état_corrected
Memoire au roi_corrected
Memoire - le but_corrected
First 30 items in Memoires d'État Refuge_corrected:
moyen vivre ailleurs: 2
fault conseiller estat: 2
faire entendre raisons: 2
mémoires etat monsieur: 1
etat monsieur chancelier: 1
monsieur chancelier hospital: 1
chancelier hospital mis: 1
hospital mis ordre: 1
mis ordre refugf: 1
ordre refugf établissement: 1
refugf établissement conseil: 1
établissement conseil estat: 1
conseil estat qualités: 1
estat qualités ct: 1
qualités ct nombre: 1
ct nombre conscillers: 1
nombre conscillers divx: 1
conscillers divx sortes: 1
divx sortes personnes: 1
sortes personnes considérer: 1
personnes considérer establissement: 1
considérer establissement conseil: 1
establissement conseil estat: 1
conseil estat scavoir: 1
estat scavoir celles: 1
scavoir celles conseillent: 1
celles conseillent celles: 1
conseillent celles conseil: 1
celles conseil lées: 

In [16]:
dictionary_to_file(trigram_counts, trigram_folder, 'trigram_counts')

Saved Memoires d'État Refuge_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Memoire - Namur_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Memoires d'état_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Memoire au roi_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Memoire - le but_corrected_trigram_counts.csv in tokenized/trigram_counts


In [17]:
colloc_dict = {}
colloc_counts = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopwords]
    bigram_finder = BigramCollocationFinder.from_words(unigram_list)
    bigram_finder.apply_freq_filter(3)  # Make sure all collocations have occurred at least 5 times
    collocations = bigram_finder.nbest(BigramAssocMeasures.pmi, 500)
    colloc_dict[key] = collocations
    
    # Initialize Counter for colloc_counts
    bigram_count_dict = Counter()

    # Count the occurrences of each bigram in the text
    bigram_finder = BigramCollocationFinder.from_words(unigram_list)
    bigram_freqs = bigram_finder.ngram_fd.items()
    
    # Filter bigram counts based on collocations
    for bigram, count in bigram_freqs:
        if bigram in collocations:
            bigram_count_dict[bigram] = count

    colloc_counts[key] = bigram_count_dict

print("Collocations:")
for key, value in colloc_dict.items():
    print(key)
    # for w1, w2 in value:
    #     print(' ', w1, w2)

print("Collocation Counts:")
for key in colloc_counts:
    print(key)
    # Print first n items, assuming print_first_n_items function is defined elsewhere
    for item, count in colloc_counts[key].most_common(30):
        bigram = " ".join(item)
        print(f"{bigram} {count}")
    print()

dictionary_to_file(colloc_counts, collocation_folder, 'collocation_counts')

Collocations:
Memoires d'État Refuge_corrected
Memoire - Namur_corrected
Memoires d'état_corrected
Memoire au roi_corrected
Memoire - le but_corrected
Collocation Counts:
Memoires d'État Refuge_corrected
conseiller estat 5
conseil estat 4
doibvent estre 3
quelques ungs 3

Memoire - Namur_corrected
ledict duché 8
ledict comté 5
comté namur 4
mémoires état 4
duc lorraine 3
duché lorraine 3

Memoires d'état_corrected
mémoires état 192
an mil 170
quatre cent 104
mil quatre 101
cinq cent 58
trois cent 57
mil cinq 56
cent quatre 51
mille livres 49
mil trois 48
quatre vingt 48
jour mois 47
foy hommaige 43
cent soixante 42
cent trente 29
deux cent 29
cent quarante 28
ledict duc 27
roy loys 26
mil deux 24
cent mille 23
cent vingt 22
roy angleterre 22
audict duc 21
loys xi 20
état an 19
cent cinquante 18
court parlement 18
mille escus 16
livres rente 16

Memoire au roi_corrected

Memoire - le but_corrected
aujourd huy 9
aultre chose 8
long temps 5
peult estre 4
toutes choses 4
tous ceulx 4
tout 

In [18]:
trigram_colloc_dict = {}
trigram_colloc_counts = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopwords]
    trigram_finder = TrigramCollocationFinder.from_words(unigram_list)
    trigram_finder.apply_freq_filter(3)  # Ensure all collocations have occurred at least 5 times
    collocations = trigram_finder.nbest(TrigramAssocMeasures.pmi, 500)
    trigram_colloc_dict[key] = collocations
    
    # Initialize Counter for trigram_colloc_counts
    trigram_count_dict = Counter()

    # Count the occurrences of each trigram in the text
    trigram_freqs = trigram_finder.ngram_fd.items()
    
    # Filter trigram counts based on collocations
    for trigram, count in trigram_freqs:
        if trigram in collocations:
            trigram_count_dict[trigram] = count

    trigram_colloc_counts[key] = trigram_count_dict

print("Trigram Collocations:")
for key, value in trigram_colloc_dict.items():
    print(key)
    #for w1, w2, w3 in value:
    #    print(' ', w1, w2, w3)

print("Trigram Collocation Counts:")
for key in trigram_colloc_counts:
    print(key)
    # Print first n items, assuming print_first_n_items function is defined elsewhere
    for item, count in trigram_colloc_counts[key].most_common(30):
        trigram = " ".join(item)
        print(f"{trigram} {count}")
    print()

dictionary_to_file(trigram_colloc_counts, trigram_collocation_folder, 'trigram_collocation_counts')

Trigram Collocations:
Memoires d'État Refuge_corrected
Memoire - Namur_corrected
Memoires d'état_corrected
Memoire au roi_corrected
Memoire - le but_corrected
Trigram Collocation Counts:
Memoires d'État Refuge_corrected

Memoire - Namur_corrected
ledict comté namur 4

Memoires d'état_corrected
mil quatre cent 100
an mil quatre 66
mil cinq cent 56
mil trois cent 48
cent quatre vingt 48
an mil cinq 46
an mil trois 36
quatre cent quatre 29
quatre cent soixante 24
mil deux cent 24
mémoires état an 19
état an mil 19
an mil deux 19
trois cent soixante 15
roy loys xi 14
cinq cent quarante 13
quatre vingt dix 13
mille livres rente 13
quatre cent trente 12
cent soixante dix 10
cent mille livres 10
deux cent quatre 10
cinq cent cinquante 9
quatre cent quarante 9
feit foy hommaige 9
jour mois juin 8
cent mille escus 8
trois cent quatre 8
trois cent trente 8
cinq cent vingt 7

Memoire au roi_corrected

Memoire - le but_corrected

Saved Memoires d'État Refuge_corrected_trigram_collocation_counts.cs

In [19]:
underscore_dict = {}
for key, value in unigrams.items():

    tokenized_words = unigrams.get(key)
    collocations = colloc_dict.get(key)
    
    colloc_words = []
    
    # Iterate through the words making new versions combining collocations
    i = 0
    while i < len(tokenized_words) - 1:
        # If we find a collocation, add and advance by two words
        if (tokenized_words[i], tokenized_words[i + 1]) in collocations:
            colloc_words.append('_'.join((tokenized_words[i], tokenized_words[i + 1])))
            i += 2
        # Otherwise, advance by one word
        else:
            colloc_words.append(tokenized_words[i])
            i += 1

    # Add the last word (if any)
    if i == len(tokenized_words) - 1:
        colloc_words.append(tokenized_words[i])
    underscore_dict[key] = colloc_words

print("Underscore Dictionary:")
for key in underscore_dict:
    print(key)

write_dict_to_files_with_suffix(underscore_dict, underscore_folder, 'underscore_bigrams')

Underscore Dictionary:
Memoires d'État Refuge_corrected
Memoire - Namur_corrected
Memoires d'état_corrected
Memoire au roi_corrected
Memoire - le but_corrected
Memoires d'État Refuge_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Memoire - Namur_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Memoires d'état_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Memoire au roi_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Memoire - le but_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams


In [20]:
trigram_underscore_dict = {}

for key, tokenized_words in unigrams.items():
    collocations = trigram_colloc_dict.get(key, [])
    colloc_words = []
    i = 0
    while i < len(tokenized_words) - 2:
        # If we find a trigram collocation, add and advance by three words
        trigram = (tokenized_words[i], tokenized_words[i + 1], tokenized_words[i + 2])
        if trigram in collocations:
            colloc_words.append('_'.join(trigram))
            i += 3
        else:
            colloc_words.append(tokenized_words[i])
            i += 1
    # Add the last words (if any)
    while i < len(tokenized_words):
        colloc_words.append(tokenized_words[i])
        i += 1
    trigram_underscore_dict[key] = colloc_words

print("Trigram underscore Dictionary:")
for key in trigram_underscore_dict:
    print(key)

write_dict_to_files_with_suffix(trigram_underscore_dict, trigram_underscore_folder, 'underscore_trigrams')

Trigram underscore Dictionary:
Memoires d'État Refuge_corrected
Memoire - Namur_corrected
Memoires d'état_corrected
Memoire au roi_corrected
Memoire - le but_corrected
Memoires d'État Refuge_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Memoire - Namur_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Memoires d'état_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Memoire au roi_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Memoire - le but_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
