In [1]:
from pathlib import Path
import glob
import os
import sys
import csv
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize,wordpunct_tokenize
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder, TrigramAssocMeasures

import locale
# Set the locale to your desired setting (e.g., 'fr_FR.UTF-8' for French)
locale.setlocale(locale.LC_ALL, 'fr_FR.UTF-8')

'fr_FR.UTF-8'

In [2]:
sys.path.append("/home/lucas-jerusalimiec/Documents/OCR Text/Notebooks")
from tokenizer_func  import (wordcleaner, write_words_to_file, dictionary_to_file, convert_tuple_bigrams,
convert_tuple_trigrams)

from extra_token_func import print_first_n_items, remove_keys_from_nested_dict

from additional_token_func import convert_strings_to_counts

from dict_write import write_dict_to_files_with_suffix

In [3]:
text_loc = Path("./final")
text_files = sorted(glob.glob(f"{text_loc}/*.txt"), key=locale.strxfrm)
output_folder = './tokenized/'
tokenized_folder = Path(output_folder)
tokenized_folder.mkdir(exist_ok=True)

output_unigram = f'{output_folder}unigram_counts'
unigram_folder = Path(output_unigram)
unigram_folder.mkdir(exist_ok=True)

output_bigram = f'{output_folder}bigram_counts'
bigram_folder = Path(output_bigram)
bigram_folder.mkdir(exist_ok=True)

output_trigram = f'{output_folder}trigram_counts'
trigram_folder = Path(output_trigram)
trigram_folder.mkdir(exist_ok=True)

output_collocation = f'{output_folder}collocation_counts'
collocation_folder = Path(output_collocation)
collocation_folder.mkdir(exist_ok=True)

output_trigram_collocation = f'{output_folder}trigram_collocation_counts'
trigram_collocation_folder = Path(output_trigram_collocation)
trigram_collocation_folder.mkdir(exist_ok=True)

output_underscore = f'{output_folder}underscore_bigrams'
underscore_folder = Path(output_underscore)
underscore_folder.mkdir(exist_ok=True)

output_trigram_underscore = f'{output_folder}underscore_trigrams'
trigram_underscore_folder = Path(output_trigram_underscore)
trigram_underscore_folder.mkdir(exist_ok=True)

print("Text files in the spellchecked directory:", text_files)

Text files in the spellchecked directory: ['final/République I_corrected.txt', 'final/République II_corrected.txt', 'final/République III_corrected.txt', 'final/République IV_corrected.txt', 'final/République V_corrected.txt', 'final/République VI_corrected.txt', 'final/Réublique Preface_corrected.txt']


In [4]:
# Open stopwords CSV file and list the contents
with open('./stop_words.csv', 'r') as f:
    stopwords = f.read().strip().split(",")
stopwords[-10:]

['v', 'vn', 'vne', 'vnes', 'w', 'x', 'ya', 'encores', 'quele', 'queles']

In [5]:
tokenized_texts = {}
for txt in text_files:
    with open(txt, 'r') as f:
        content = f.read()
        file_name = txt.split('\\')[-1]
        #key = file_name.split('.')[0]
        key = os.path.splitext(os.path.basename(file_name))[0]
        tokenized_texts[key] = content
print("Raw texts:", list(tokenized_texts.keys()))       

Raw texts: ['République I_corrected', 'République II_corrected', 'République III_corrected', 'République IV_corrected', 'République V_corrected', 'République VI_corrected', 'Réublique Preface_corrected']


In [6]:
unigrams = {}

for key, value in tokenized_texts.items():
    unigram_list = wordpunct_tokenize(value)
    cleanwords = [wordcleaner(w) for w in unigram_list]
    unigrams[key] = cleanwords

for key, value in unigrams.items():
    filename = f"./tokenized/{key}.txt"
    write_words_to_file(value, filename, words_per_line=20)
    print(f"Saved content for '{key}' to {filename}")

Saved content for 'République I_corrected' to ./tokenized/République I_corrected.txt
Saved content for 'République II_corrected' to ./tokenized/République II_corrected.txt
Saved content for 'République III_corrected' to ./tokenized/République III_corrected.txt
Saved content for 'République IV_corrected' to ./tokenized/République IV_corrected.txt
Saved content for 'République V_corrected' to ./tokenized/République V_corrected.txt
Saved content for 'République VI_corrected' to ./tokenized/République VI_corrected.txt
Saved content for 'Réublique Preface_corrected' to ./tokenized/Réublique Preface_corrected.txt


In [7]:
print("Unigram texts:")
for key in unigrams:
    print(key)

Unigram texts:
République I_corrected
République II_corrected
République III_corrected
République IV_corrected
République V_corrected
République VI_corrected
Réublique Preface_corrected


In [8]:
# Count up the tokens using a Counter() object
unigram_counts = {}
for key, value in unigrams.items():
    unigram_counts_dict = Counter(value)
    unigram_counts[key] = unigram_counts_dict

print("Unigram Counts:")
for key in unigram_counts:
    print(key)

print_first_n_items(unigram_counts, 25)

Unigram Counts:
République I_corrected
République II_corrected
République III_corrected
République IV_corrected
République V_corrected
République VI_corrected
Réublique Preface_corrected
First 25 items in République I_corrected:
: 30923
de: 4020
la: 1922
les: 1775
que: 1676
en: 1590
le: 1428
d: 1365
l: 1350
qui: 1314
il: 1285
qu: 1241
à: 1146
des: 1043
eft: 865
du: 841
n: 821
pour: 720
ne: 707
par: 695
au: 622
ce: 572
roy: 524
a: 488
ou: 482

First 25 items in République II_corrected:
: 8723
de: 1105
la: 558
les: 527
que: 503
en: 471
l: 431
il: 428
qui: 384
qu: 369
d: 363
à: 359
des: 355
le: 347
eft: 248
n: 225
du: 213
par: 211
plus: 204
ou: 198
pour: 190
a: 183
vn: 180
au: 164
comme: 161

First 25 items in République III_corrected:
: 16751
de: 1830
les: 996
la: 990
que: 917
en: 880
il: 784
le: 763
qu: 750
qui: 727
des: 710
l: 645
d: 620
à: 610
n: 490
eft: 457
du: 455
par: 449
pour: 435
ou: 433
ce: 420
ne: 389
au: 372
a: 310
on: 308

First 25 items in République IV_corrected:
: 19161
d

In [9]:
# Remove specified keys from the dictionary
stripped_unigrams = remove_keys_from_nested_dict(unigram_counts, stopwords)

print_first_n_items(stripped_unigrams, 30)

First 30 items in République I_corrected:
: 30923
roy: 524
bien: 332
prince: 281
puissance: 262
empereur: 213
faire: 198
peuple: 198
sujets: 192
princes: 191
republique: 190
droit: 184
france: 184
celuy: 157
an: 154
pape: 152
loy: 151
foit: 150
contre: 147
entre: 137
empire: 135
premier: 134
souverain: 134
romains: 134
loix: 128
foy: 127
sujet: 123
enfans: 121
roys: 118
plusieurs: 116

First 30 items in République II_corrected:
: 8723
estat: 143
bien: 129
roy: 128
peuple: 102
puissance: 84
prince: 82
republique: 80
faire: 62
empire: 62
sujets: 59
princes: 59
estats: 57
populaire: 57
foit: 56
roys: 55
tyran: 53
empereur: 53
trois: 45
monarchie: 44
mort: 42
celuy: 42
souveraineté: 41
senat: 40
grand: 39
dire: 39
republiques: 38
contre: 37
vie: 37
moins: 36

First 30 items in République III_corrected:
: 16751
magistrats: 233
puissance: 206
bien: 187
senat: 155
faire: 151
prince: 139
magistrat: 134
peuple: 128
roy: 101
republique: 97
corps: 97
loy: 95
estat: 93
foit: 90
non: 88
college: 76

In [10]:
dictionary_to_file(stripped_unigrams, unigram_folder, 'unigram_counts')

Saved République I_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved République II_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved République III_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved République IV_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved République V_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved République VI_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Réublique Preface_corrected_unigram_counts.csv in tokenized/unigram_counts


In [11]:
bigrams = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopwords]
    #unigram_list = [word for word in value]
    bigrams_list = list(nltk.bigrams(unigram_list))
    bigrams[key] = bigrams_list

print("Bigrams:")
for key in bigrams:
    print(key)

Bigrams:
République I_corrected
République II_corrected
République III_corrected
République IV_corrected
République V_corrected
République VI_corrected
Réublique Preface_corrected


In [12]:
bigram_counts = {}

for key, value in bigrams.items():
    string_bigrams = convert_tuple_bigrams(value)
    bigramCount = convert_strings_to_counts(string_bigrams)
    bigram_counts[key] = bigramCount

print("Bigram Counts:")
for key in bigram_counts:
    print(key)

print_first_n_items(bigram_counts, 30)

Bigram Counts:
République I_corrected
République II_corrected
République III_corrected
République IV_corrected
République V_corrected
République VI_corrected
Réublique Preface_corrected
First 30 items in République I_corrected:
 : 7954
 empereur: 184
roy : 170
an : 139
 an: 137
france : 136
 bien: 125
 roy: 125
 empire: 124
sujets : 117
peuple : 109
prince : 104
empire : 101
romains : 97
empereur : 96
princes : 95
foy : 89
 foit: 88
souverain : 88
republique : 85
 prince: 85
 combien: 83
premier : 83
 non: 82
puissance : 80
 quoy: 80
pape : 78
 quand: 77
 depuis: 76
 republique: 74

First 30 items in République II_corrected:
 : 2038
 estat: 131
 empire: 62
roy : 57
estat : 55
peuple : 53
 bien: 53
 empereur: 48
populaire : 47
empire : 45
sujets : 36
puissance : 34
republique : 33
bien : 33
estats : 32
roys : 31
princes : 31
biens : 31
tyran : 31
 ains: 30
prince : 29
souveraineté : 28
 foit: 28
mort : 28
 non: 26
 celuy: 26
vie : 25
ariftocratique : 24
 republique: 23
senat : 23

First

In [13]:
dictionary_to_file(bigram_counts, bigram_folder, 'bigram_counts')

Saved République I_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved République II_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved République III_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved République IV_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved République V_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved République VI_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Réublique Preface_corrected_bigram_counts.csv in tokenized/bigram_counts


In [14]:
trigrams = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopwords]
    #unigram_list = [word for word in value]
    trigrams_list = list(nltk.trigrams(unigram_list))
    trigrams[key] = trigrams_list

print("Trigrams:")
for key in bigrams:
    print(key)

Trigrams:
République I_corrected
République II_corrected
République III_corrected
République IV_corrected
République V_corrected
République VI_corrected
Réublique Preface_corrected


In [15]:
trigram_counts = {}

for key, value in trigrams.items():
    string_trigrams = convert_tuple_trigrams(value)
    trigramCount = convert_strings_to_counts(string_trigrams)
    trigram_counts[key] = trigramCount

print("Trigram Counts:")
for key in trigram_counts:
    print(key)
    
print_first_n_items(trigram_counts, 30)

Trigram Counts:
République I_corrected
République II_corrected
République III_corrected
République IV_corrected
République V_corrected
République VI_corrected
Réublique Preface_corrected
First 30 items in République I_corrected:
  : 2026
 an : 128
 empire : 93
 empereur : 80
an  : 63
 autruy : 55
 premier : 48
  empereur: 47
 roy : 43
france  : 43
 honneur : 41
 angleterre : 41
 p : 40
  autant: 38
prince souverain : 38
 prince : 37
sujets  : 36
roy france : 35
 republique : 34
 hommage : 33
p  : 32
romains  : 32
  roy: 31
roy  : 31
  republique: 30
  neantmoins: 30
  rien: 30
  depuis: 30
  prince: 30
  pourquoy: 29

First 30 items in République II_corrected:
  : 303
 estat : 49
 empire : 45
  estat: 32
roy  : 22
 empereur : 21
 estat populaire: 20
estat populaire : 17
 an : 16
peuple  : 14
puissance  : 14
 com : 12
estat  : 12
 honneur : 12
  autant: 11
  empereur: 11
sujets  : 11
 bien : 11
princes  : 11
an  : 11
 ariftocratie : 10
populaire  : 10
prince  : 10
 vertu : 10
souveraine

In [16]:
dictionary_to_file(trigram_counts, trigram_folder, 'trigram_counts')

Saved République I_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved République II_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved République III_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved République IV_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved République V_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved République VI_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Réublique Preface_corrected_trigram_counts.csv in tokenized/trigram_counts


In [17]:
colloc_dict = {}
colloc_counts = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopwords]
    bigram_finder = BigramCollocationFinder.from_words(unigram_list)
    bigram_finder.apply_freq_filter(3)  # Make sure all collocations have occurred at least 5 times
    collocations = bigram_finder.nbest(BigramAssocMeasures.pmi, 500)
    colloc_dict[key] = collocations
    
    # Initialize Counter for colloc_counts
    bigram_count_dict = Counter()

    # Count the occurrences of each bigram in the text
    bigram_finder = BigramCollocationFinder.from_words(unigram_list)
    bigram_freqs = bigram_finder.ngram_fd.items()
    
    # Filter bigram counts based on collocations
    for bigram, count in bigram_freqs:
        if bigram in collocations:
            bigram_count_dict[bigram] = count

    colloc_counts[key] = bigram_count_dict

print("Collocations:")
for key, value in colloc_dict.items():
    print(key)
    # for w1, w2 in value:
    #     print(' ', w1, w2)

print("Collocation Counts:")
for key in colloc_counts:
    print(key)
    # Print first n items, assuming print_first_n_items function is defined elsewhere
    for item, count in colloc_counts[key].most_common(30):
        bigram = " ".join(item)
        print(f"{bigram} {count}")
    print()

dictionary_to_file(colloc_counts, collocation_folder, 'collocation_counts')

Collocations:
République I_corrected
République II_corrected
République III_corrected
République IV_corrected
République V_corrected
République VI_corrected
Réublique Preface_corrected
Collocation Counts:
République I_corrected
prince souverain 62
roy france 48
empereur charle 22
roys france 18
bien certain 17
sujet naturel 17
non feulement 16
menu peuple 16
roy françois 16
bien fouuent 15
cas pareil 15
homme lige 15
princes souverains 15
faire mourir 13
roy charle 13
toute republique 12
trois cens 12
roy loüys 12
entre princes 11
puissance paternelle 11
peu peu 11
droit bourgeoifie 11
tite liue 11
prince fouucrain 11
hommage lige 11
toute puissance 10
philippe valois 10
celuy quia 10
roy macedoine 10
droit gouuernement 9

République II_corrected
 estat 131
 empire 62
 empereur 48
populaire  47
biens  31
 ains 30
 non 26
ariftocratique  24
estat populaire 21
 combien 21
 honneur 21
 toutesfois 20
 neantmoins 20
an  20
 an 18
 où 17
 autant 16
 faut 16
foy  16
particulier  16
vertu  16


In [18]:
trigram_colloc_dict = {}
trigram_colloc_counts = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopwords]
    trigram_finder = TrigramCollocationFinder.from_words(unigram_list)
    trigram_finder.apply_freq_filter(3)  # Ensure all collocations have occurred at least 5 times
    collocations = trigram_finder.nbest(TrigramAssocMeasures.pmi, 500)
    trigram_colloc_dict[key] = collocations
    
    # Initialize Counter for trigram_colloc_counts
    trigram_count_dict = Counter()

    # Count the occurrences of each trigram in the text
    trigram_freqs = trigram_finder.ngram_fd.items()
    
    # Filter trigram counts based on collocations
    for trigram, count in trigram_freqs:
        if trigram in collocations:
            trigram_count_dict[trigram] = count

    trigram_colloc_counts[key] = trigram_count_dict

print("Trigram Collocations:")
for key, value in trigram_colloc_dict.items():
    print(key)
    #for w1, w2, w3 in value:
    #    print(' ', w1, w2, w3)

print("Trigram Collocation Counts:")
for key in trigram_colloc_counts:
    print(key)
    # Print first n items, assuming print_first_n_items function is defined elsewhere
    for item, count in trigram_colloc_counts[key].most_common(30):
        trigram = " ".join(item)
        print(f"{trigram} {count}")
    print()

dictionary_to_file(trigram_colloc_counts, trigram_collocation_folder, 'trigram_collocation_counts')

Trigram Collocations:
République I_corrected
République II_corrected
République III_corrected
République IV_corrected
République V_corrected
République VI_corrected
Réublique Preface_corrected
Trigram Collocation Counts:
République I_corrected
prince souverain  38
roy france  35
foy  hommage 27
an  p 25
roy  angleterre 25
empereur charle  22
 empereur charle 21
 prince souverain 19
 fil  18
 fieft  17
 roy france 15
roys france  14
pere  mere 13
vie  mort 12
roy françois  12
 non feulement 11
 cas pareil 11
 auftriche  11
roy charle  11
 hommage lige 11
neant  moins 10
 iln  10
faire mourir  9
naples  sicile 9
 sujet naturel 9
traitez  alliance 9
princes  empire 9
roy loüys  9
princes souverains  9
 homme lige 9

République II_corrected
   303
 estat  49
 empire  45
  estat 32
roy   22
 empereur  21
 estat populaire 20
estat populaire  17
 an  16
peuple   14
puissance   14
 com  12
estat   12
 honneur  12
  autant 11
  empereur 11
sujets   11
 bien  11
princes   11
an   11
 ariftocrati

In [19]:
underscore_dict = {}
for key, value in unigrams.items():

    tokenized_words = unigrams.get(key)
    collocations = colloc_dict.get(key)
    
    colloc_words = []
    
    # Iterate through the words making new versions combining collocations
    i = 0
    while i < len(tokenized_words) - 1:
        # If we find a collocation, add and advance by two words
        if (tokenized_words[i], tokenized_words[i + 1]) in collocations:
            colloc_words.append('_'.join((tokenized_words[i], tokenized_words[i + 1])))
            i += 2
        # Otherwise, advance by one word
        else:
            colloc_words.append(tokenized_words[i])
            i += 1

    # Add the last word (if any)
    if i == len(tokenized_words) - 1:
        colloc_words.append(tokenized_words[i])
    underscore_dict[key] = colloc_words

print("Underscore Dictionary:")
for key in underscore_dict:
    print(key)

write_dict_to_files_with_suffix(underscore_dict, underscore_folder, 'underscore_bigrams')

Underscore Dictionary:
République I_corrected
République II_corrected
République III_corrected
République IV_corrected
République V_corrected
République VI_corrected
Réublique Preface_corrected
République I_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
République II_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
République III_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
République IV_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
République V_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
République VI_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Réublique Preface_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams


In [20]:
trigram_underscore_dict = {}

for key, tokenized_words in unigrams.items():
    collocations = trigram_colloc_dict.get(key, [])
    colloc_words = []
    i = 0
    while i < len(tokenized_words) - 2:
        # If we find a trigram collocation, add and advance by three words
        trigram = (tokenized_words[i], tokenized_words[i + 1], tokenized_words[i + 2])
        if trigram in collocations:
            colloc_words.append('_'.join(trigram))
            i += 3
        else:
            colloc_words.append(tokenized_words[i])
            i += 1
    # Add the last words (if any)
    while i < len(tokenized_words):
        colloc_words.append(tokenized_words[i])
        i += 1
    trigram_underscore_dict[key] = colloc_words

print("Trigram underscore Dictionary:")
for key in trigram_underscore_dict:
    print(key)

write_dict_to_files_with_suffix(trigram_underscore_dict, trigram_underscore_folder, 'underscore_trigrams')

Trigram underscore Dictionary:
République I_corrected
République II_corrected
République III_corrected
République IV_corrected
République V_corrected
République VI_corrected
Réublique Preface_corrected
République I_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
République II_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
République III_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
République IV_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
République V_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
République VI_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Réublique Preface_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
