In [1]:
from pathlib import Path
import glob
import os
import sys
import csv
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize,wordpunct_tokenize
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder, TrigramAssocMeasures

import locale
# Switch the process locale to French (UTF-8). The file list built later is
# sorted with key=locale.strxfrm, whose collation order follows this setting.
# NOTE(review): setlocale raises locale.Error when 'fr_FR.UTF-8' is not
# installed on the host — confirm it is available wherever this notebook runs.
locale.setlocale(locale.LC_ALL, 'fr_FR.UTF-8')

'fr_FR.UTF-8'

In [2]:
# Add the directory holding the project's helper modules to the import path.
# NOTE(review): this is an absolute path on one developer's machine; on any
# other machine the imports below only resolve if the modules are reachable
# some other way. Consider a relative path or an environment variable.
sys.path.append("/home/lucas-jerusalimiec/Documents/OCR Text/Notebooks")
# Project-local helpers (their implementations are not part of this notebook).
from tokenizer_func  import (wordcleaner, write_words_to_file, dictionary_to_file, convert_tuple_bigrams,
convert_tuple_trigrams)

from extra_token_func import print_first_n_items, remove_keys_from_nested_dict

from additional_token_func import convert_strings_to_counts

from dict_write import write_dict_to_files_with_suffix

In [3]:
def _make_dir(location):
    """Create the directory `location` if it is missing and return it as a Path."""
    folder = Path(location)
    folder.mkdir(exist_ok=True)
    return folder


# Input corpus: every .txt file directly under ./final, ordered with the
# locale-aware collation key so accented names sort per the active locale.
text_loc = Path("./final")
text_files = sorted(glob.glob(f"{text_loc}/*.txt"), key=locale.strxfrm)

# Root of all generated output; it is created first so the sub-folders below
# always have an existing parent.
output_folder = './tokenized/'
tokenized_folder = _make_dir(output_folder)

# One sub-folder per kind of artefact produced further down the notebook.
output_unigram = f'{output_folder}unigram_counts'
unigram_folder = _make_dir(output_unigram)

output_bigram = f'{output_folder}bigram_counts'
bigram_folder = _make_dir(output_bigram)

output_trigram = f'{output_folder}trigram_counts'
trigram_folder = _make_dir(output_trigram)

output_collocation = f'{output_folder}collocation_counts'
collocation_folder = _make_dir(output_collocation)

output_trigram_collocation = f'{output_folder}trigram_collocation_counts'
trigram_collocation_folder = _make_dir(output_trigram_collocation)

output_underscore = f'{output_folder}underscore_bigrams'
underscore_folder = _make_dir(output_underscore)

output_trigram_underscore = f'{output_folder}underscore_trigrams'
trigram_underscore_folder = _make_dir(output_trigram_underscore)

print("Text files in the spellchecked directory:", text_files)

Text files in the spellchecked directory: ['final/Théatre I_corrected.txt', 'final/Théatre II_corrected.txt', 'final/Théatre III_corrected.txt', 'final/Théatre IV_corrected.txt', 'final/Théatre summary_corrected.txt', 'final/Théatre V_corrected.txt']


In [4]:
# Load the stop-word list: a single comma-separated record in ./stop_words.csv.
# The encoding is given explicitly so the accented French entries decode the
# same way on every platform instead of depending on the OS default.
with open('./stop_words.csv', 'r', encoding='utf-8') as f:
    stopwords = f.read().strip().split(",")
# Peek at the tail of the list as a quick sanity check.
stopwords[-10:]

['v', 'vn', 'vne', 'vnes', 'w', 'x', 'ya', 'encores', 'quele', 'queles']

In [5]:
# Read every corpus file into memory, keyed by its file name without the
# directory and without the extension (e.g. 'Théatre I_corrected').
# Despite the name, the values are the raw, untokenized file contents.
tokenized_texts = {}
for txt in text_files:
    # Explicit UTF-8 so the accented text decodes identically on every platform.
    with open(txt, 'r', encoding='utf-8') as f:
        # Path.stem handles the platform's path separator itself, so no manual
        # splitting on backslashes is needed.
        tokenized_texts[Path(txt).stem] = f.read()
print("Raw texts:", list(tokenized_texts.keys()))

Raw texts: ['Théatre I_corrected', 'Théatre II_corrected', 'Théatre III_corrected', 'Théatre IV_corrected', 'Théatre summary_corrected', 'Théatre V_corrected']


In [6]:
# Tokenize each raw text with NLTK's wordpunct tokenizer and pass every token
# through the project's wordcleaner helper; the result maps text name -> list
# of cleaned tokens.
unigrams = {
    name: [wordcleaner(token) for token in wordpunct_tokenize(raw_text)]
    for name, raw_text in tokenized_texts.items()
}

# Write each cleaned token list to its own file, 20 words per line.
for key in unigrams:
    filename = f"./tokenized/{key}.txt"
    write_words_to_file(unigrams[key], filename, words_per_line=20)
    print(f"Saved content for '{key}' to {filename}")

Saved content for 'Théatre I_corrected' to ./tokenized/Théatre I_corrected.txt
Saved content for 'Théatre II_corrected' to ./tokenized/Théatre II_corrected.txt
Saved content for 'Théatre III_corrected' to ./tokenized/Théatre III_corrected.txt
Saved content for 'Théatre IV_corrected' to ./tokenized/Théatre IV_corrected.txt
Saved content for 'Théatre summary_corrected' to ./tokenized/Théatre summary_corrected.txt
Saved content for 'Théatre V_corrected' to ./tokenized/Théatre V_corrected.txt


In [7]:
# List the texts that were tokenized: a header line followed by one name per line.
print("\n".join(["Unigram texts:", *unigrams]))

Unigram texts:
Théatre I_corrected
Théatre II_corrected
Théatre III_corrected
Théatre IV_corrected
Théatre summary_corrected
Théatre V_corrected


In [8]:
# Tally token frequencies per text: text name -> Counter of cleaned tokens.
unigram_counts = {name: Counter(tokens) for name, tokens in unigrams.items()}

print("Unigram Counts:")
for name in unigram_counts.keys():
    print(name)

# Preview the first 25 entries of every text's counter.
print_first_n_items(unigram_counts, 25)

Unigram Counts:
Théatre I_corrected
Théatre II_corrected
Théatre III_corrected
Théatre IV_corrected
Théatre summary_corrected
Théatre V_corrected
First 25 items in Théatre I_corrected:
: 7698
de: 729
la: 606
que: 557
l: 366
il: 363
en: 354
qu: 346
d: 316
le: 313
à: 308
qui: 298
les: 278
a: 274
ne: 272
ce: 231
n: 229
par: 224
des: 220
eft: 218
fe: 189
vn: 175
pas: 158
au: 153
du: 147

First 25 items in Théatre II_corrected:
: 6769
de: 812
l: 591
la: 568
que: 505
les: 394
qui: 365
en: 330
le: 320
à: 318
il: 272
des: 264
qu: 262
eft: 250
d: 226
plus: 202
a: 196
par: 192
ne: 182
ce: 163
s: 161
n: 150
m: 144
du: 143
ou: 141

First 25 items in Théatre III_corrected:
: 9144
de: 953
les: 733
la: 585
que: 553
le: 505
l: 498
qu: 432
des: 423
en: 408
à: 374
d: 324
qui: 321
ne: 261
a: 245
plus: 239
il: 234
par: 206
s: 203
on: 195
ce: 187
font: 185
n: 183
du: 180
fe: 169

First 25 items in Théatre IV_corrected:
: 8553
de: 718
l: 587
que: 555
la: 453
les: 383
qu: 347
en: 333
le: 299
qui: 289
d: 287


In [9]:
# Drop the stop words from each text's unigram counts.
# remove_keys_from_nested_dict is a project helper; presumably it returns a new
# nested dict without the listed keys — TODO confirm it does not mutate
# unigram_counts in place.
# NOTE(review): the saved output of this cell still starts each text with a
# blank-looking token (": 7698" for Théatre I), so that token is apparently not
# covered by the stop list — confirm whether it should be filtered out.
stripped_unigrams = remove_keys_from_nested_dict(unigram_counts, stopwords)

# Preview the first 30 remaining entries per text.
print_first_n_items(stripped_unigrams, 30)

First 30 items in Théatre I_corrected:
: 7698
corps: 106
choses: 93
chose: 87
nature: 83
cause: 82
monde: 81
forme: 79
foit: 64
matiere: 61
rien: 58
mouvement: 49
quelque: 48
aussi: 44
dieu: 43
causes: 40
lieu: 39
quand: 39
principe: 39
dire: 38
non: 38
premiere: 37
faut: 36
donc: 36
efté: 36
fin: 35
bien: 35
grand: 34
faire: 34
autant: 34

First 30 items in Théatre II_corrected:
: 6769
eau: 104
air: 99
terre: 92
corps: 72
feu: 70
nature: 63
combien: 57
grand: 45
entre: 45
pourquoy: 45
fortes: 44
foit: 40
moins: 36
choses: 35
chose: 35
bien: 33
aussi: 31
fois: 29
fin: 29
là: 28
froid: 28
rien: 28
quand: 27
autant: 26
faut: 26
dire: 25
fort: 25
argent: 25
quelque: 24

First 30 items in Théatre III_corrected:
: 9144
animaux: 86
pourquoy: 76
elles: 72
grand: 71
nature: 68
plantes: 52
fois: 52
moins: 49
aussi: 44
fort: 42
où: 41
homme: 41
fin: 38
là: 36
force: 36
hommes: 36
vient: 35
chose: 34
ans: 34
choses: 33
fortes: 33
vie: 32
quelques: 32
terre: 32
efté: 31
foit: 31
quelque: 31
combie

In [10]:
# Persist the stop-word-filtered unigram counts. Per the saved output, the helper
# reports one "<text>_unigram_counts.csv" per text in tokenized/unigram_counts.
dictionary_to_file(stripped_unigrams, unigram_folder, 'unigram_counts')

Saved Théatre I_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Théatre II_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Théatre III_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Théatre IV_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Théatre summary_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Théatre V_corrected_unigram_counts.csv in tokenized/unigram_counts


In [11]:
# Build the list of bigrams for every text: text name -> list of (w1, w2) tuples.
bigrams = {}

# Set membership is O(1); testing every token against the stop-word *list*
# costs a full scan of the list per token.
stopword_set = set(stopwords)

for key, value in unigrams.items():
    # Stop words are removed *before* pairing, so a bigram may join two words
    # that were separated by stop words in the original text.
    unigram_list = [word for word in value if word.lower() not in stopword_set]
    bigrams[key] = list(nltk.bigrams(unigram_list))

print("Bigrams:")
for key in bigrams:
    print(key)

Bigrams:
Théatre I_corrected
Théatre II_corrected
Théatre III_corrected
Théatre IV_corrected
Théatre summary_corrected
Théatre V_corrected


In [12]:
# Per text: turn the bigram tuples into strings with convert_tuple_bigrams, then
# tally them with convert_strings_to_counts (both are project helpers).
bigram_counts = {
    name: convert_strings_to_counts(convert_tuple_bigrams(pairs))
    for name, pairs in bigrams.items()
}

print("Bigram Counts:")
for name in bigram_counts.keys():
    print(name)

# Preview the first 30 entries per text.
print_first_n_items(bigram_counts, 30)

Bigram Counts:
Théatre I_corrected
Théatre II_corrected
Théatre III_corrected
Théatre IV_corrected
Théatre summary_corrected
Théatre V_corrected
First 30 items in Théatre I_corrected:
corps naturel: 27
quelque chose: 21
premiere cause: 15
cause efficiente: 9
peur faire: 5
pourquoy non: 5
trois principes: 5
principes nature: 5
corps physicien: 5
aucune chose: 4
bien souvent: 4
faut donc: 4
terme dont: 4
premiere matiere: 4
corps naturels: 4
quelle fin: 3
naturel estant: 3
chose absurde: 3
routes choses: 3
aucun temps: 3
pourquoy donc: 3
moyen entre: 3
beaucoup mieux: 3
plusieurs causes: 3
volonté foit: 3
premier principe: 3
beaucoup moins: 3
chose contraire: 3
toute chose: 3
commune choses: 3

First 30 items in Théatre II_corrected:
combien fortes: 21
argent vif: 12
voilà pourquoy: 8
quelque chose: 6
long temps: 5
participe nature: 4
corps elementaires: 4
eau douce: 4
là où: 4
corps naturels: 3
corps naturel: 3
chacune chose: 3
là vient: 3
chose monde: 3
quelques uns: 3
toute nature: 3


In [13]:
# Persist the bigram counts. Per the saved output, the helper reports one
# "<text>_bigram_counts.csv" per text in tokenized/bigram_counts.
dictionary_to_file(bigram_counts, bigram_folder, 'bigram_counts')

Saved Théatre I_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Théatre II_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Théatre III_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Théatre IV_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Théatre summary_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Théatre V_corrected_bigram_counts.csv in tokenized/bigram_counts


In [14]:
# Build the list of trigrams for every text: text name -> list of (w1, w2, w3) tuples.
trigrams = {}

# Set membership is O(1); testing every token against the stop-word *list*
# costs a full scan of the list per token.
stopword_set = set(stopwords)

for key, value in unigrams.items():
    # Stop words are removed *before* grouping, so a trigram may span words
    # that were separated by stop words in the original text.
    unigram_list = [word for word in value if word.lower() not in stopword_set]
    trigrams[key] = list(nltk.trigrams(unigram_list))

print("Trigrams:")
# Report the keys of the dict built in this cell (previously this iterated over
# `bigrams`, which only printed the right names because the keys coincide).
for key in trigrams:
    print(key)

Trigrams:
Théatre I_corrected
Théatre II_corrected
Théatre III_corrected
Théatre IV_corrected
Théatre summary_corrected
Théatre V_corrected


In [15]:
# Per text: turn the trigram tuples into strings with convert_tuple_trigrams,
# then tally them with convert_strings_to_counts (both are project helpers).
trigram_counts = {
    name: convert_strings_to_counts(convert_tuple_trigrams(triples))
    for name, triples in trigrams.items()
}

print("Trigram Counts:")
for name in trigram_counts.keys():
    print(name)

# Preview the first 30 entries per text.
print_first_n_items(trigram_counts, 30)

Trigram Counts:
Théatre I_corrected
Théatre II_corrected
Théatre III_corrected
Théatre IV_corrected
Théatre summary_corrected
Théatre V_corrected
First 30 items in Théatre I_corrected:
premiere cause foit: 2
cause efficiente monde: 2
corps naturel estant: 2
dv éheatre gg: 1
p ax we: 1
ax we ul: 1
mc pen ve: 1
pen ve mas: 1
jn het lies: 1
het lies dite: 1
lies dite cnm: 1
dite cnm sector: 1
moyen snslementles causes: 1
snslementles causes choses: 1
causes choses fins: 1
certes demandes chose: 1
eftimerois grande felicité: 1
pouuois obtenir quelque: 1
obtenir quelque homme: 1
quelque homme zou: 1
homme zou quelque: 1
zou quelque dieu: 1
nus fin theatre: 1
fin theatre monde: 1
puitlance grand ouurier: 1
grand ouurier choses: 1
ràuis ardente affeétion: 1
ardente affeétion celebrer: 1
affeétion celebrer louanges: 1
celebrer louanges contéplationde: 1

First 30 items in Théatre II_corrected:
autant pouuons dire: 2
contiennent chacune chose: 2
chacune chose monde: 2
presse second dv: 1
second

In [16]:
# Persist the trigram counts. Per the saved output, the helper reports one
# "<text>_trigram_counts.csv" per text in tokenized/trigram_counts.
dictionary_to_file(trigram_counts, trigram_folder, 'trigram_counts')

Saved Théatre I_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Théatre II_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Théatre III_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Théatre IV_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Théatre summary_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Théatre V_corrected_trigram_counts.csv in tokenized/trigram_counts


In [17]:
# Find bigram collocations per text and record how often each one occurs.
#   colloc_dict   : text name -> list of collocation tuples (PMI-ranked, max 500)
#   colloc_counts : text name -> Counter {collocation tuple: frequency}
colloc_dict = {}
colloc_counts = {}

# Set membership is O(1); the stop-word list would be scanned once per token.
stopword_set = set(stopwords)

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopword_set]
    bigram_finder = BigramCollocationFinder.from_words(unigram_list)
    bigram_finder.apply_freq_filter(3)  # Keep only bigrams that occur at least 3 times
    collocations = bigram_finder.nbest(BigramAssocMeasures.pmi, 500)
    # Discard pairs containing an empty token.
    filtered_collocations = [bigram for bigram in collocations if '' not in bigram]
    colloc_dict[key] = filtered_collocations

    # Look the frequencies up in the finder we already have: the frequency
    # filter only drops rare bigrams, it does not alter the counts of the ones
    # that remain, so a second pass over the tokens is unnecessary.
    collocation_set = set(filtered_collocations)
    bigram_count_dict = Counter()
    for bigram, count in bigram_finder.ngram_fd.items():
        if bigram in collocation_set:
            bigram_count_dict[bigram] = count

    colloc_counts[key] = bigram_count_dict

print("Collocations:")
for key in colloc_dict:
    print(key)

print("Collocation Counts:")
for key in colloc_counts:
    print(key)
    # Show the 30 most frequent collocations of this text.
    for item, count in colloc_counts[key].most_common(30):
        bigram = " ".join(item)
        print(f"{bigram} {count}")
    print()

dictionary_to_file(colloc_counts, collocation_folder, 'collocation_counts')

Collocations:
Théatre I_corrected
Théatre II_corrected
Théatre III_corrected
Théatre IV_corrected
Théatre summary_corrected
Théatre V_corrected
Collocation Counts:
Théatre I_corrected
corps naturel 27
quelque chose 21
premiere cause 15
cause efficiente 9
peur faire 5
pourquoy non 5
trois principes 5
principes nature 5
corps physicien 5
aucune chose 4
bien souvent 4
faut donc 4
terme dont 4
premiere matiere 4
corps naturels 4
quelle fin 3
naturel estant 3
chose absurde 3
routes choses 3
aucun temps 3
pourquoy donc 3
moyen entre 3
beaucoup mieux 3
plusieurs causes 3
volonté foit 3
premier principe 3
beaucoup moins 3
chose contraire 3
toute chose 3
commune choses 3

Théatre II_corrected
combien fortes 21
argent vif 12
voilà pourquoy 8
quelque chose 6
long temps 5
participe nature 4
corps elementaires 4
eau douce 4
là où 4
corps naturels 3
corps naturel 3
chacune chose 3
là vient 3
chose monde 3
quelques uns 3
toute nature 3
va fond 3
faut icy 3
loix nature 3
quelque peu 3
quelle chose 3
e

In [18]:
# Find trigram collocations per text and record how often each one occurs.
#   trigram_colloc_dict   : text name -> list of collocation tuples (PMI-ranked, max 500)
#   trigram_colloc_counts : text name -> Counter {collocation tuple: frequency}
trigram_colloc_dict = {}
trigram_colloc_counts = {}

# Set membership is O(1); the stop-word list would be scanned once per token.
stopword_set = set(stopwords)

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopword_set]
    trigram_finder = TrigramCollocationFinder.from_words(unigram_list)
    trigram_finder.apply_freq_filter(3)  # Keep only trigrams that occur at least 3 times
    collocations = trigram_finder.nbest(TrigramAssocMeasures.pmi, 500)
    # Discard triples containing an empty token.
    filtered_collocations = [trigram for trigram in collocations if '' not in trigram]
    trigram_colloc_dict[key] = filtered_collocations

    # Copy the frequency of every retained collocation out of the finder.
    collocation_set = set(filtered_collocations)
    trigram_count_dict = Counter()
    for trigram, count in trigram_finder.ngram_fd.items():
        if trigram in collocation_set:
            trigram_count_dict[trigram] = count

    trigram_colloc_counts[key] = trigram_count_dict

print("Trigram Collocations:")
for key in trigram_colloc_dict:
    print(key)

print("Trigram Collocation Counts:")
for key in trigram_colloc_counts:
    print(key)
    # Show the 30 most frequent trigram collocations of this text.
    for item, count in trigram_colloc_counts[key].most_common(30):
        trigram = " ".join(item)
        print(f"{trigram} {count}")
    print()

dictionary_to_file(trigram_colloc_counts, trigram_collocation_folder, 'trigram_collocation_counts')

Trigram Collocations:
Théatre I_corrected
Théatre II_corrected
Théatre III_corrected
Théatre IV_corrected
Théatre summary_corrected
Théatre V_corrected
Trigram Collocation Counts:
Théatre I_corrected

Théatre II_corrected

Théatre III_corrected
forme pomme pin 3
fuit pres nature 3
où vient chaftrez 3

Théatre IV_corrected

Théatre summary_corrected

Théatre V_corrected
neuf mille ans 3

Saved Théatre I_corrected_trigram_collocation_counts.csv in tokenized/trigram_collocation_counts
Saved Théatre II_corrected_trigram_collocation_counts.csv in tokenized/trigram_collocation_counts
Saved Théatre III_corrected_trigram_collocation_counts.csv in tokenized/trigram_collocation_counts
Saved Théatre IV_corrected_trigram_collocation_counts.csv in tokenized/trigram_collocation_counts
Saved Théatre summary_corrected_trigram_collocation_counts.csv in tokenized/trigram_collocation_counts
Saved Théatre V_corrected_trigram_collocation_counts.csv in tokenized/trigram_collocation_counts


In [19]:
# Rewrite each token list so that every adjacent pair that is a known bigram
# collocation becomes a single "w1_w2" token: text name -> list of tokens.
# Note: colloc_dict was built from stop-word-filtered tokens, whereas this pass
# walks the unfiltered token list, so only pairs that are literally adjacent
# here get joined.
underscore_dict = {}
for key, tokenized_words in unigrams.items():
    # A set gives O(1) lookups inside the scan below; the [] default keeps a
    # text without a collocation entry from crashing the membership test.
    collocations = set(colloc_dict.get(key, []))

    colloc_words = []

    # Greedy left-to-right scan over adjacent pairs.
    i = 0
    while i < len(tokenized_words) - 1:
        pair = (tokenized_words[i], tokenized_words[i + 1])
        if pair in collocations:
            # Collocation found: emit the joined token and skip both words.
            colloc_words.append('_'.join(pair))
            i += 2
        else:
            # Otherwise keep the word as is and advance by one.
            colloc_words.append(tokenized_words[i])
            i += 1

    # The final word is still pending unless it was consumed by a collocation.
    if i == len(tokenized_words) - 1:
        colloc_words.append(tokenized_words[i])
    underscore_dict[key] = colloc_words

print("Underscore Dictionary:")
for key in underscore_dict:
    print(key)

write_dict_to_files_with_suffix(underscore_dict, underscore_folder, 'underscore_bigrams')

Underscore Dictionary:
Théatre I_corrected
Théatre II_corrected
Théatre III_corrected
Théatre IV_corrected
Théatre summary_corrected
Théatre V_corrected
Théatre I_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Théatre II_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Théatre III_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Théatre IV_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Théatre summary_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Théatre V_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams


In [20]:
# Rewrite each token list so that every run of three adjacent tokens that is a
# known trigram collocation becomes a single "w1_w2_w3" token:
# text name -> list of tokens.
trigram_underscore_dict = {}

for key, tokenized_words in unigrams.items():
    # A set gives O(1) lookups; the original list was scanned once per token.
    collocations = set(trigram_colloc_dict.get(key, []))
    colloc_words = []
    i = 0
    # Greedy left-to-right scan over adjacent triples.
    while i < len(tokenized_words) - 2:
        trigram = (tokenized_words[i], tokenized_words[i + 1], tokenized_words[i + 2])
        if trigram in collocations:
            # Collocation found: emit the joined token and skip all three words.
            colloc_words.append('_'.join(trigram))
            i += 3
        else:
            colloc_words.append(tokenized_words[i])
            i += 1
    # Flush whatever is left at the tail (at most two words).
    colloc_words.extend(tokenized_words[i:])
    trigram_underscore_dict[key] = colloc_words

print("Trigram underscore Dictionary:")
for key in trigram_underscore_dict:
    print(key)

write_dict_to_files_with_suffix(trigram_underscore_dict, trigram_underscore_folder, 'underscore_trigrams')

Trigram underscore Dictionary:
Théatre I_corrected
Théatre II_corrected
Théatre III_corrected
Théatre IV_corrected
Théatre summary_corrected
Théatre V_corrected
Théatre I_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Théatre II_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Théatre III_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Théatre IV_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Théatre summary_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Théatre V_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
