In [1]:
from pathlib import Path
import glob
import os
import sys
import csv
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize,wordpunct_tokenize
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder, TrigramAssocMeasures

In [2]:
sys.path.append("/home/lucas-jerusalimiec/Documents/OCR Text/Notebooks")
from tokenizer_func  import (wordcleaner, write_words_to_file, dictionary_to_file, convert_tuple_bigrams,
convert_tuple_trigrams)

from extra_token_func import print_first_n_items, remove_keys_from_nested_dict

from additional_token_func import convert_strings_to_counts

from dict_write import write_dict_to_files_with_suffix

In [3]:
text_loc = Path("./final")
text_files = glob.glob(f"{text_loc}/*.txt")
output_folder = './tokenized/'
tokenized_folder = Path(output_folder)
tokenized_folder.mkdir(exist_ok=True)

output_unigram = f'{output_folder}unigram_counts'
unigram_folder = Path(output_unigram)
unigram_folder.mkdir(exist_ok=True)

output_bigram = f'{output_folder}bigram_counts'
bigram_folder = Path(output_bigram)
bigram_folder.mkdir(exist_ok=True)

output_trigram = f'{output_folder}trigram_counts'
trigram_folder = Path(output_trigram)
trigram_folder.mkdir(exist_ok=True)

output_collocation = f'{output_folder}collocation_counts'
collocation_folder = Path(output_collocation)
collocation_folder.mkdir(exist_ok=True)

output_trigram_collocation = f'{output_folder}trigram_collocation_counts'
trigram_collocation_folder = Path(output_trigram_collocation)
trigram_collocation_folder.mkdir(exist_ok=True)

output_underscore = f'{output_folder}underscore_bigrams'
underscore_folder = Path(output_underscore)
underscore_folder.mkdir(exist_ok=True)

output_trigram_underscore = f'{output_folder}underscore_trigrams'
trigram_underscore_folder = Path(output_trigram_underscore)
trigram_underscore_folder.mkdir(exist_ok=True)

print("Text files in the spellchecked directory:", text_files)

Text files in the spellchecked directory: ['final/Traité Justice II_corrected.txt', 'final/Traité Justice IV_corrected.txt', 'final/Traité Justice III_corrected.txt', 'final/Traite Justice V_corrected.txt', 'final/Traité Justice I_corrected.txt', 'final/Traite Justice VI_corrected.txt', 'final/Traite Justice VII_corrected.txt']


In [4]:
# Open stopwords CSV file and list the contents
with open('./stop_words.csv', 'r') as f:
    stopwords = f.read().strip().split(",")
stopwords[-10:]

['ar', 'ara', 'bl', 'ua', 'rar', 'rr', 'iir', 'tett', 'nw', 'b']

In [5]:
tokenized_texts = {}
for txt in text_files:
    with open(txt, 'r') as f:
        content = f.read()
        file_name = txt.split('\\')[-1]
        #key = file_name.split('.')[0]
        key = os.path.splitext(os.path.basename(file_name))[0]
        tokenized_texts[key] = content
print("Raw texts:", list(tokenized_texts.keys()))       

Raw texts: ['Traité Justice II_corrected', 'Traité Justice IV_corrected', 'Traité Justice III_corrected', 'Traite Justice V_corrected', 'Traité Justice I_corrected', 'Traite Justice VI_corrected', 'Traite Justice VII_corrected']


In [6]:
unigrams = {}

for key, value in tokenized_texts.items():
    unigram_list = wordpunct_tokenize(value)
    cleanwords = [wordcleaner(w) for w in unigram_list]
    unigrams[key] = cleanwords

for key, value in unigrams.items():
    filename = f"./tokenized/{key}.txt"
    write_words_to_file(value, filename, words_per_line=20)
    print(f"Saved content for '{key}' to {filename}")

Saved content for 'Traité Justice II_corrected' to ./tokenized/Traité Justice II_corrected.txt
Saved content for 'Traité Justice IV_corrected' to ./tokenized/Traité Justice IV_corrected.txt
Saved content for 'Traité Justice III_corrected' to ./tokenized/Traité Justice III_corrected.txt
Saved content for 'Traite Justice V_corrected' to ./tokenized/Traite Justice V_corrected.txt
Saved content for 'Traité Justice I_corrected' to ./tokenized/Traité Justice I_corrected.txt
Saved content for 'Traite Justice VI_corrected' to ./tokenized/Traite Justice VI_corrected.txt
Saved content for 'Traite Justice VII_corrected' to ./tokenized/Traite Justice VII_corrected.txt


In [7]:
print("Unigram texts:")
for key in unigrams:
    print(key)

Unigram texts:
Traité Justice II_corrected
Traité Justice IV_corrected
Traité Justice III_corrected
Traite Justice V_corrected
Traité Justice I_corrected
Traite Justice VI_corrected
Traite Justice VII_corrected


In [8]:
# Count up the tokens using a Counter() object
unigram_counts = {}
for key, value in unigrams.items():
    unigram_counts_dict = Counter(value)
    unigram_counts[key] = unigram_counts_dict

print("Unigram Counts:")
for key in unigram_counts:
    print(key)

print_first_n_items(unigram_counts, 25)

Unigram Counts:
Traité Justice II_corrected
Traité Justice IV_corrected
Traité Justice III_corrected
Traite Justice V_corrected
Traité Justice I_corrected
Traite Justice VI_corrected
Traite Justice VII_corrected
First 25 items in Traité Justice II_corrected:
: 2778
et: 659
de: 636
la: 353
que: 236
l: 230
les: 226
le: 202
à: 194
en: 192
qui: 171
qu: 156
il: 154
est: 150
d: 147
pour: 128
justice: 114
ne: 113
des: 110
par: 99
plus: 92
ung: 87
bien: 85
se: 83
ce: 83

First 25 items in Traité Justice IV_corrected:
: 9316
de: 2253
et: 2120
la: 982
les: 805
que: 707
en: 680
l: 641
à: 610
qui: 600
le: 573
il: 485
d: 481
qu: 403
des: 401
par: 367
pour: 355
ce: 349
est: 330
plus: 313
ne: 312
ung: 288
n: 272
se: 266
justice: 235

First 25 items in Traité Justice III_corrected:
: 5001
et: 1153
de: 1134
la: 567
à: 405
il: 382
en: 377
le: 369
que: 360
les: 357
l: 336
qui: 322
d: 264
qu: 263
par: 220
des: 207
ne: 199
ce: 195
ung: 193
pour: 180
justice: 166
est: 166
se: 160
son: 140
luy: 140

First 25

In [9]:
# Remove specified keys from the dictionary
stripped_unigrams = remove_keys_from_nested_dict(unigram_counts, stopwords)

print_first_n_items(stripped_unigrams, 30)

First 30 items in Traité Justice II_corrected:
justice: 114
plus: 92
bien: 85
si: 59
tout: 53
faire: 50
faict: 44
aultre: 38
leurs: 37
dieu: 36
comme: 35
mesme: 34
tant: 33
estre: 32
homme: 31
eulx: 30
ceste: 30
réformation: 29
ceulx: 29
grand: 29
sans: 29
poinct: 28
jamais: 28
aultres: 27
non: 25
tous: 25
dict: 24
traité: 23
rien: 23
force: 23

First 30 items in Traité Justice IV_corrected:
plus: 313
justice: 235
si: 207
bien: 182
comme: 160
ceste: 152
tout: 144
leurs: 143
tant: 124
faire: 104
aultres: 100
grand: 97
temps: 95
toutes: 91
réformation: 90
tous: 89
mesme: 87
faict: 86
traité: 84
ceulx: 83
sans: 79
estoient: 78
non: 77
aultre: 74
dict: 70
jamais: 70
bon: 68
poinct: 68
peu: 67
soubs: 66

First 30 items in Traité Justice III_corrected:
justice: 166
bien: 126
plus: 115
faict: 101
tout: 94
si: 93
faire: 84
comme: 75
dieu: 74
leurs: 67
grand: 59
tant: 54
peuple: 52
réformation: 51
jamais: 49
tous: 48
homme: 48
ceulx: 45
toutes: 43
aultre: 43
cause: 42
traité: 42
sans: 42
ceste:

In [10]:
dictionary_to_file(stripped_unigrams, unigram_folder, 'unigram_counts')

Saved Traité Justice II_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Traité Justice IV_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Traité Justice III_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Traite Justice V_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Traité Justice I_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Traite Justice VI_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Traite Justice VII_corrected_unigram_counts.csv in tokenized/unigram_counts


In [11]:
bigrams = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopwords]
    #unigram_list = [word for word in value]
    bigrams_list = list(nltk.bigrams(unigram_list))
    bigrams[key] = bigrams_list

print("Bigrams:")
for key in bigrams:
    print(key)

Bigrams:
Traité Justice II_corrected
Traité Justice IV_corrected
Traité Justice III_corrected
Traite Justice V_corrected
Traité Justice I_corrected
Traite Justice VI_corrected
Traite Justice VII_corrected


In [12]:
bigram_counts = {}

for key, value in bigrams.items():
    string_bigrams = convert_tuple_bigrams(value)
    bigramCount = convert_strings_to_counts(string_bigrams)
    bigram_counts[key] = bigramCount

print("Bigram Counts:")
for key in bigram_counts:
    print(key)

print_first_n_items(bigram_counts, 30)

Bigram Counts:
Traité Justice II_corrected
Traité Justice IV_corrected
Traité Justice III_corrected
Traite Justice V_corrected
Traité Justice I_corrected
Traite Justice VI_corrected
Traite Justice VII_corrected
First 30 items in Traité Justice II_corrected:
réformation justice: 28
tout ainsy: 7
eulx mesmes: 7
soy mesme: 7
entre eulx: 6
plus grand: 6
gens bien: 6
non seulement: 5
bien faire: 5
homme bien: 5
plus grande: 5
non plus: 4
peult estre: 4
bien estre: 4
toutes sortes: 4
peu temps: 4
tout monde: 4
plusieurs aultres: 4
leurs subjects: 4
tous ceulx: 4
plus grands: 4
justice divine: 3
tout homme: 3
nul aultre: 3
bonne vie: 3
tous jours: 3
tout ensemble: 3
rien plus: 3
entre mains: 3
toutes choses: 3

First 30 items in Traité Justice IV_corrected:
réformation justice: 85
gens bien: 21
toutes sortes: 20
plus grands: 20
aujourd huy: 20
tous ceulx: 17
plus grand: 16
non seulement: 15
grand nombre: 15
temps là: 13
comme dict: 12
long temps: 11
si grand: 11
plus grandes: 11
tant plus: 11

In [13]:
dictionary_to_file(bigram_counts, bigram_folder, 'bigram_counts')

Saved Traité Justice II_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Traité Justice IV_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Traité Justice III_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Traite Justice V_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Traité Justice I_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Traite Justice VI_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Traite Justice VII_corrected_bigram_counts.csv in tokenized/bigram_counts


In [14]:
trigrams = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopwords]
    #unigram_list = [word for word in value]
    trigrams_list = list(nltk.trigrams(unigram_list))
    trigrams[key] = trigrams_list

print("Trigrams:")
for key in bigrams:
    print(key)

Trigrams:
Traité Justice II_corrected
Traité Justice IV_corrected
Traité Justice III_corrected
Traite Justice V_corrected
Traité Justice I_corrected
Traite Justice VI_corrected
Traite Justice VII_corrected


In [15]:
trigram_counts = {}

for key, value in trigrams.items():
    string_trigrams = convert_tuple_trigrams(value)
    trigramCount = convert_strings_to_counts(string_trigrams)
    trigram_counts[key] = trigramCount

print("Trigram Counts:")
for key in trigram_counts:
    print(key)
    
print_first_n_items(trigram_counts, 30)

Trigram Counts:
Traité Justice II_corrected
Traité Justice IV_corrected
Traité Justice III_corrected
Traite Justice V_corrected
Traité Justice I_corrected
Traite Justice VI_corrected
Traite Justice VII_corrected
First 30 items in Traité Justice II_corrected:
traite réformation justice: 2
mieulx tout aultre: 2
grand dommaige honte: 2
dommaige honte confu: 2
honte confu sion: 2
juste auroit aultre: 2
donc sans cause: 2
amour eulx mesmes: 2
réformation justice oi: 2
bien faire proficter: 2
dict plus grand: 2
toutes aultres vertus: 2
plus grande celle: 2
charge thrésorier général: 2
mary parmy prisonniers: 2
réformation justice seconde: 1
justice seconde partie: 1
seconde partie sommaire: 1
partie sommaire cette: 1
sommaire cette partie: 1
cette partie manque: 1
partie manque manuscrits: 1
manque manuscrits bibliothèque: 1
manuscrits bibliothèque roi: 1
bibliothèque roi bon: 1
roi bon hésiode: 1
bon hésiode poëte: 1
hésiode poëte grec: 1
poëte grec vivoit: 1
grec vivoit temps: 1

First 30 

In [16]:
dictionary_to_file(trigram_counts, trigram_folder, 'trigram_counts')

Saved Traité Justice II_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Traité Justice IV_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Traité Justice III_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Traite Justice V_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Traité Justice I_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Traite Justice VI_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Traite Justice VII_corrected_trigram_counts.csv in tokenized/trigram_counts


In [17]:
colloc_dict = {}
colloc_counts = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopwords]
    bigram_finder = BigramCollocationFinder.from_words(unigram_list)
    bigram_finder.apply_freq_filter(3)  # Make sure all collocations have occurred at least 5 times
    collocations = bigram_finder.nbest(BigramAssocMeasures.pmi, 500)
    colloc_dict[key] = collocations
    
    # Initialize Counter for colloc_counts
    bigram_count_dict = Counter()

    # Count the occurrences of each bigram in the text
    bigram_finder = BigramCollocationFinder.from_words(unigram_list)
    bigram_freqs = bigram_finder.ngram_fd.items()
    
    # Filter bigram counts based on collocations
    for bigram, count in bigram_freqs:
        if bigram in collocations:
            bigram_count_dict[bigram] = count

    colloc_counts[key] = bigram_count_dict

print("Collocations:")
for key, value in colloc_dict.items():
    print(key)
    # for w1, w2 in value:
    #     print(' ', w1, w2)

print("Collocation Counts:")
for key in colloc_counts:
    print(key)
    # Print first n items, assuming print_first_n_items function is defined elsewhere
    for item, count in colloc_counts[key].most_common(30):
        bigram = " ".join(item)
        print(f"{bigram} {count}")
    print()

dictionary_to_file(colloc_counts, collocation_folder, 'collocation_counts')

Collocations:
Traité Justice II_corrected
Traité Justice IV_corrected
Traité Justice III_corrected
Traite Justice V_corrected
Traité Justice I_corrected
Traite Justice VI_corrected
Traite Justice VII_corrected
Collocation Counts:
Traité Justice II_corrected
réformation justice 28
tout ainsy 7
eulx mesmes 7
soy mesme 7
entre eulx 6
plus grand 6
gens bien 6
non seulement 5
bien faire 5
homme bien 5
plus grande 5
non plus 4
peult estre 4
bien estre 4
toutes sortes 4
peu temps 4
tout monde 4
plusieurs aultres 4
leurs subjects 4
tous ceulx 4
plus grands 4
justice divine 3
tout homme 3
nul aultre 3
bonne vie 3
tous jours 3
tout ensemble 3
rien plus 3
entre mains 3
toutes choses 3

Traité Justice IV_corrected
réformation justice 85
gens bien 21
toutes sortes 20
plus grands 20
aujourd huy 20
tous ceulx 17
plus grand 16
non seulement 15
grand nombre 15
temps là 13
comme dict 12
long temps 11
si grand 11
plus grandes 11
tant plus 11
cy dessus 10
entre eulx 10
si grande 10
homme bien 9
tous aultr

In [18]:
trigram_colloc_dict = {}
trigram_colloc_counts = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopwords]
    trigram_finder = TrigramCollocationFinder.from_words(unigram_list)
    trigram_finder.apply_freq_filter(3)  # Ensure all collocations have occurred at least 5 times
    collocations = trigram_finder.nbest(TrigramAssocMeasures.pmi, 500)
    trigram_colloc_dict[key] = collocations
    
    # Initialize Counter for trigram_colloc_counts
    trigram_count_dict = Counter()

    # Count the occurrences of each trigram in the text
    trigram_freqs = trigram_finder.ngram_fd.items()
    
    # Filter trigram counts based on collocations
    for trigram, count in trigram_freqs:
        if trigram in collocations:
            trigram_count_dict[trigram] = count

    trigram_colloc_counts[key] = trigram_count_dict

print("Trigram Collocations:")
for key, value in trigram_colloc_dict.items():
    print(key)
    #for w1, w2, w3 in value:
    #    print(' ', w1, w2, w3)

print("Trigram Collocation Counts:")
for key in trigram_colloc_counts:
    print(key)
    # Print first n items, assuming print_first_n_items function is defined elsewhere
    for item, count in trigram_colloc_counts[key].most_common(30):
        trigram = " ".join(item)
        print(f"{trigram} {count}")
    print()

dictionary_to_file(trigram_colloc_counts, trigram_collocation_folder, 'trigram_collocation_counts')

Trigram Collocations:
Traité Justice II_corrected
Traité Justice IV_corrected
Traité Justice III_corrected
Traite Justice V_corrected
Traité Justice I_corrected
Traite Justice VI_corrected
Traite Justice VII_corrected
Trigram Collocation Counts:
Traité Justice II_corrected

Traité Justice IV_corrected
si grand nombre 5
an mil trois 4
mil trois cent 4
mil quatre cent 4
plus grandes dignitez 4
comme dict cy 4
dict cy dessus 4
bien sou vent 3
cent quatre vingt 3
quatre vingt dix 3
an mil quatre 3
plus meschans hommes 3
plein sé nat 3
beaulx deniers comptans 3
fault bien croire 3
vie sans reproche 3
gens bien honneur 3
soubs telz gouvernemens 3
chose tant petite 3

Traité Justice III_corrected
servyce vray dieu 3
plus homme bien 3

Traite Justice V_corrected
laquelle doibt jamais 3
leurs propres privez 3
propres privez noms 3

Traité Justice I_corrected
sans faveur sans 3

Traite Justice VI_corrected
leurs simples gaiges 5
plus gens bien 4
ez courts soubveraines 4
si long temps 3
réformati

In [19]:
underscore_dict = {}
for key, value in unigrams.items():

    tokenized_words = unigrams.get(key)
    collocations = colloc_dict.get(key)
    
    colloc_words = []
    
    # Iterate through the words making new versions combining collocations
    i = 0
    while i < len(tokenized_words) - 1:
        # If we find a collocation, add and advance by two words
        if (tokenized_words[i], tokenized_words[i + 1]) in collocations:
            colloc_words.append('_'.join((tokenized_words[i], tokenized_words[i + 1])))
            i += 2
        # Otherwise, advance by one word
        else:
            colloc_words.append(tokenized_words[i])
            i += 1

    # Add the last word (if any)
    if i == len(tokenized_words) - 1:
        colloc_words.append(tokenized_words[i])
    underscore_dict[key] = colloc_words

print("Underscore Dictionary:")
for key in underscore_dict:
    print(key)

write_dict_to_files_with_suffix(underscore_dict, underscore_folder, 'underscore_bigrams')

Underscore Dictionary:
Traité Justice II_corrected
Traité Justice IV_corrected
Traité Justice III_corrected
Traite Justice V_corrected
Traité Justice I_corrected
Traite Justice VI_corrected
Traite Justice VII_corrected
Traité Justice II_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Traité Justice IV_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Traité Justice III_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Traite Justice V_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Traité Justice I_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Traite Justice VI_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Traite Justice VII_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams


In [20]:
trigram_underscore_dict = {}

for key, tokenized_words in unigrams.items():
    collocations = trigram_colloc_dict.get(key, [])
    colloc_words = []
    i = 0
    while i < len(tokenized_words) - 2:
        # If we find a trigram collocation, add and advance by three words
        trigram = (tokenized_words[i], tokenized_words[i + 1], tokenized_words[i + 2])
        if trigram in collocations:
            colloc_words.append('_'.join(trigram))
            i += 3
        else:
            colloc_words.append(tokenized_words[i])
            i += 1
    # Add the last words (if any)
    while i < len(tokenized_words):
        colloc_words.append(tokenized_words[i])
        i += 1
    trigram_underscore_dict[key] = colloc_words

print("Trigram underscore Dictionary:")
for key in trigram_underscore_dict:
    print(key)

write_dict_to_files_with_suffix(trigram_underscore_dict, trigram_underscore_folder, 'underscore_trigrams')

Trigram underscore Dictionary:
Traité Justice II_corrected
Traité Justice IV_corrected
Traité Justice III_corrected
Traite Justice V_corrected
Traité Justice I_corrected
Traite Justice VI_corrected
Traite Justice VII_corrected
Traité Justice II_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Traité Justice IV_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Traité Justice III_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Traite Justice V_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Traité Justice I_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Traite Justice VI_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Traite Justice VII_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
