In [1]:
from pathlib import Path
import glob
import os
import sys
import csv
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize,wordpunct_tokenize
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder, TrigramAssocMeasures

In [2]:
# Extend the module search path so the project-local helper modules imported below
# can be found.
# NOTE(review): this is an absolute path on one specific machine; on any other machine
# the imports below will fail unless the helpers are importable some other way.
# Consider a path relative to the notebook or a single configurable constant.
sys.path.append("/home/lucas-jerusalimiec/Documents/OCR Text/Notebooks")
# Helpers used below for cleaning tokens, writing outputs and converting n-gram tuples.
from tokenizer_func  import (wordcleaner, write_words_to_file, dictionary_to_file, convert_tuple_bigrams,
convert_tuple_trigrams)

# Helpers used below to preview and filter the per-text count dictionaries.
from extra_token_func import print_first_n_items, remove_keys_from_nested_dict

# Helper used below to turn a list of n-gram strings into counts.
from additional_token_func import convert_strings_to_counts

# Helper used below to write one output file per text with a common suffix.
from dict_write import write_dict_to_files_with_suffix

In [3]:
# Input and output locations for this run.
text_loc = Path("./final")
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes the
# processing order (and therefore every printed listing) reproducible across runs.
# The entries stay plain strings because later cells treat them as str paths.
text_files = sorted(glob.glob(f"{text_loc}/*.txt"))
output_folder = './tokenized/'
tokenized_folder = Path(output_folder)
# parents=True keeps this cell working if output_folder is later pointed at a nested path.
tokenized_folder.mkdir(parents=True, exist_ok=True)
print(output_folder)
print("Text files in the spellchecked directory:", text_files)

./tokenized/
Text files in the spellchecked directory: ['final/Théatre_corrected.txt']


In [4]:
# Load the stop-word list from a single comma-separated file.
# The encoding is stated explicitly because the list contains accented entries
# (e.g. 'dé'); with the platform default they would be misread on systems whose
# default encoding is not UTF-8. Assumes the file is UTF-8 encoded.
# The parsing itself is unchanged: entries are kept exactly as they appear between
# commas, since later cells compare tokens against these exact strings.
with open('./stop_words.csv', 'r', encoding='utf-8') as f:
    stopwords = f.read().strip().split(",")
# Display the tail of the list as a quick sanity check.
stopwords[-10:]

['ie', 'fc', 'del', 'o', 'dé', 'dela', 'ay', 'w', 'iij', 'enl']

In [5]:
# Read every input file into memory, keyed by its file name without the extension.
# NOTE: despite its name, tokenized_texts holds the raw (not yet tokenized) text;
# the name is kept because the tokenizing cell reads this dictionary.
tokenized_texts = {}
for txt in text_files:
    # Explicit encoding: the texts contain accented characters, and the platform
    # default encoding is not UTF-8 everywhere. Assumes the inputs are UTF-8.
    with open(txt, 'r', encoding='utf-8') as f:
        content = f.read()
    # os.path.basename already understands the platform's path separators, so no
    # manual splitting on backslashes is needed (and on POSIX such a split would
    # truncate a file name that legitimately contains a backslash).
    key = os.path.splitext(os.path.basename(txt))[0]
    tokenized_texts[key] = content
print("Raw texts:", list(tokenized_texts.keys()))

Raw texts: ['Théatre_corrected']


In [6]:
# Tokenize each raw text with wordpunct_tokenize and pass every token through
# wordcleaner, keeping one token list per text.
# NOTE(review): the unigram counts printed further down list the empty string as the
# most frequent token, so wordcleaner presumably returns '' for some tokens
# (punctuation?). Those empty tokens are kept here on purpose: the positions in this
# list are what the underscore-joining cells iterate over.
unigrams = {}

for key, value in tokenized_texts.items():
    unigrams[key] = [wordcleaner(w) for w in wordpunct_tokenize(value)]

# Save each cleaned token stream. The path is built from the configured
# output_folder rather than a second hard-coded "./tokenized/", so changing the
# configuration cell redirects this file together with all the other outputs.
for key, value in unigrams.items():
    filename = os.path.join(output_folder, f"{key}.txt")
    write_words_to_file(value, filename, words_per_line=20)
    print(f"Saved content for '{key}' to {filename}")

Saved content for 'Théatre_corrected' to ./tokenized/Théatre_corrected.txt


In [7]:
# List the texts for which a unigram token stream exists.
print("Unigram texts:")
for text_name in unigrams.keys():
    print(text_name)

Unigram texts:
Théatre_corrected


In [8]:
# Tally token frequencies per text: each value is a Counter mapping token -> count.
unigram_counts = {
    text_name: Counter(tokens)
    for text_name, tokens in unigrams.items()
}

print("Unigram Counts:")
for text_name in unigram_counts.keys():
    print(text_name)

# Preview the first 25 entries of each text's counts.
print_first_n_items(unigram_counts, 25)

Unigram Counts:
Théatre_corrected
First 25 items in Théatre_corrected:
: 74568
de: 8277
la: 5498
que: 5297
l: 4758
les: 4233
le: 3630
en: 3486
qu: 3209
à: 3171
qui: 2883
il: 2809
d: 2702
des: 2657
ne: 2377
par: 2172
a: 2167
plus: 1824
eft: 1778
s: 1705
n: 1700
ce: 1583
du: 1572
fe: 1466
au: 1431



In [9]:
# Remove every stop word from each text's unigram counts.
# NOTE(review): remove_keys_from_nested_dict lives in extra_token_func; whether it
# returns a new dictionary or modifies unigram_counts in place is not visible here —
# confirm before relying on unigram_counts still containing the stop words.
stripped_unigrams = remove_keys_from_nested_dict(unigram_counts, stopwords)

# Preview the first 30 entries per text after stop-word removal.
print_first_n_items(stripped_unigrams, 30)

First 30 items in Théatre_corrected:
plus: 1824
comme: 910
corps: 688
autre: 672
autres: 665
peut: 625
foit: 552
toutes: 546
nature: 545
tout: 535
deux: 501
mefme: 496
laquelle: 495
grand: 469
fois: 435
terre: 432
tant: 428
ni: 425
puis: 385
pourquoy: 383
fans: 376
eau: 367
ame: 364
leurs: 350
auec: 349
fes: 345
ainfi: 344
elles: 334
tous: 330
rien: 325



In [10]:
# Write the stop-word-filtered unigram counts to output_folder; the helper reports one
# "<text>_unigram_counts.csv" file per text.
dictionary_to_file(stripped_unigrams, output_folder, 'unigram_counts')

Saved Théatre_corrected_unigram_counts.csv in ./tokenized/


In [11]:
# Build the list of bigrams for each text from its stop-word-filtered token stream.
# Because stop words are removed before pairing, a bigram may join two words that
# were separated by stop words in the original text.
#
# The stop words are copied into a set so the per-token membership test is a hash
# lookup instead of a scan of the whole list; the filtering result is unchanged.
stopword_set = set(stopwords)
bigrams = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopword_set]
    bigrams_list = list(nltk.bigrams(unigram_list))
    bigrams[key] = bigrams_list

print("Bigrams:")
for key in bigrams:
    print(key)

Bigrams:
Théatre_corrected


In [12]:
# Turn each text's bigram tuples into strings, then count the strings.
bigram_counts = {
    text_name: convert_strings_to_counts(convert_tuple_bigrams(bigram_tuples))
    for text_name, bigram_tuples in bigrams.items()
}

print("Bigram Counts:")
for text_name in bigram_counts.keys():
    print(text_name)

# Preview the first 30 entries of each text's bigram counts.
print_first_n_items(bigram_counts, 30)

Bigram Counts:
Théatre_corrected
First 30 items in Théatre_corrected:
toutes fois: 165
plus grand: 159
où vient: 87
peut faire: 79
plus moins: 78
beaucoup plus: 62
tous autres: 60
tant plus: 60
corps naturel: 59
voilà pourquoy: 55
plus grande: 49
long temps: 48
autre chofe: 45
tout ainfi: 43
autres animaux: 42
peut entendre: 41
toutes fortes: 41
foit plus: 39
fois plus: 39
plus petit: 37
toutes choses: 36
vns autres: 36
là où: 36
toutes chofes: 34
combien fortes: 34
puis apres: 34
quelque chofe: 33
autre chose: 32
fes parties: 31
non plus: 30



In [13]:
# Write the bigram counts to output_folder; the helper reports one
# "<text>_bigram_counts.csv" file per text.
dictionary_to_file(bigram_counts, output_folder, 'bigram_counts')

Saved Théatre_corrected_bigram_counts.csv in ./tokenized/


In [14]:
# Build the list of trigrams for each text from its stop-word-filtered token stream.
# As with the bigrams, stop words are removed before grouping, so a trigram may span
# words that were not strictly adjacent in the original text.
#
# The stop words are copied into a set so the per-token membership test is a hash
# lookup instead of a scan of the whole list; the filtering result is unchanged.
stopword_set = set(stopwords)
trigrams = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopword_set]
    trigrams_list = list(nltk.trigrams(unigram_list))
    trigrams[key] = trigrams_list

print("Trigrams:")
# Iterate the trigram dictionary itself (this loop previously listed the keys of
# `bigrams`, which only worked if the bigram cell had been run first).
for key in trigrams:
    print(key)

Trigrams:
Théatre_corrected


In [15]:
# Turn each text's trigram tuples into strings, then count the strings.
trigram_counts = {
    text_name: convert_strings_to_counts(convert_tuple_trigrams(trigram_tuples))
    for text_name, trigram_tuples in trigrams.items()
}

print("Trigram Counts:")
for text_name in trigram_counts.keys():
    print(text_name)

# Preview the first 30 entries of each text's trigram counts.
print_first_n_items(trigram_counts, 30)

Trigram Counts:
Théatre_corrected
First 30 items in Théatre_corrected:
comment peut faire: 18
plus grand partie: 16
auec plus grand: 12
tout enfemble fois: 12
plus grand plus: 11
plus vray femblable: 11
fois plus grand: 10
tout ensemble fois: 9
là peut entendre: 9
tous autres animaux: 9
differentes vnes autres: 8
long temps fans: 8
deux fois plus: 8
rien foit plus: 7
foit plus propre: 7
plus longue durée: 7
long temps apres: 7
fois plus grande: 7
mille cinq cents: 7
plus grand nombre: 6
tres bon tres: 6
quel ineonuenient auroit: 6
pourquoy non pource: 6
fur tous autres: 6
vaut autant dire: 6
ainfi certes efcript: 6
tue où vient: 6
vingt quatre heures: 6
plus haut ciel: 5
ineonuenient auroit difions: 5



In [16]:
# Write the trigram counts to output_folder; the helper reports one
# "<text>_trigram_counts.csv" file per text.
dictionary_to_file(trigram_counts, output_folder, 'trigram_counts')

Saved Théatre_corrected_trigram_counts.csv in ./tokenized/


In [17]:
# Bigram collocations: for each text, rank the bigrams of the stop-word-filtered token
# stream by PMI, keep the best ones, and record how often each kept bigram occurs.
BIGRAM_MIN_FREQ = 3    # bigrams seen fewer times than this are discarded before ranking
BIGRAM_N_BEST = 500    # number of top-PMI bigrams kept per text

# Set copy of the stop words: hash lookup per token instead of a list scan.
stopword_set = set(stopwords)

colloc_dict = {}      # text name -> list of collocation tuples, best PMI first
colloc_counts = {}    # text name -> Counter mapping collocation tuple -> frequency

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopword_set]
    bigram_finder = BigramCollocationFinder.from_words(unigram_list)
    # Keep only bigrams that occur at least BIGRAM_MIN_FREQ (3) times.
    bigram_finder.apply_freq_filter(BIGRAM_MIN_FREQ)
    collocations = bigram_finder.nbest(BigramAssocMeasures.pmi, BIGRAM_N_BEST)
    colloc_dict[key] = collocations

    # Look up the frequency of each selected collocation. The finder's own frequency
    # table is reused (the frequency filter only drops rarer bigrams, none of which
    # can be among the selected ones), so the text is not scanned a second time.
    # Iterating the table rather than the ranked list keeps the original insertion
    # order, which decides how most_common() orders ties.
    colloc_set = set(collocations)
    bigram_count_dict = Counter()
    for bigram, count in bigram_finder.ngram_fd.items():
        if bigram in colloc_set:
            bigram_count_dict[bigram] = count

    colloc_counts[key] = bigram_count_dict

print("Collocations:")
for key, value in colloc_dict.items():
    print(key)

print("Collocation Counts:")
for key in colloc_counts:
    print(key)
    # Show the 30 most frequent collocations of this text.
    for item, count in colloc_counts[key].most_common(30):
        bigram = " ".join(item)
        print(f"{bigram} {count}")
    print()

dictionary_to_file(colloc_counts, output_folder, 'collocation_counts')

Collocations:
Théatre_corrected
Collocation Counts:
Théatre_corrected
où vient 87
voilà pourquoy 55
long temps 48
grand peine 27
ceftuy cy 25
auons defia 24
quelques vns 24
entendement agent 23
longue durée 22
cinq cens 22
neant moins 19
hors mis 19
vray femblable 19
pierres metaux 16
dont aduient 16
argent vif 16
seroit autant 14
quatriesme livre 14
orient occident 13
chaleur naturelle 12
nombre infiny 12
cause efficiente 11
differentes vnes 11
anges demons 11
mille cinq 11
faut douter 10
auons monftré 10
bon gré 10
loy diuine 10
cinq cents 10

Saved Théatre_corrected_collocation_counts.csv in ./tokenized/


In [18]:
# Trigram collocations: for each text, rank the trigrams of the stop-word-filtered
# token stream by PMI, keep the best ones, and record how often each kept trigram occurs.
TRIGRAM_MIN_FREQ = 3    # trigrams seen fewer times than this are discarded before ranking
TRIGRAM_N_BEST = 500    # number of top-PMI trigrams kept per text

# Set copy of the stop words: hash lookup per token instead of a list scan.
stopword_set = set(stopwords)

trigram_colloc_dict = {}      # text name -> list of collocation tuples, best PMI first
trigram_colloc_counts = {}    # text name -> Counter mapping collocation tuple -> frequency

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopword_set]
    trigram_finder = TrigramCollocationFinder.from_words(unigram_list)
    # Keep only trigrams that occur at least TRIGRAM_MIN_FREQ (3) times.
    trigram_finder.apply_freq_filter(TRIGRAM_MIN_FREQ)
    collocations = trigram_finder.nbest(TrigramAssocMeasures.pmi, TRIGRAM_N_BEST)
    trigram_colloc_dict[key] = collocations

    # Look up the frequency of each selected collocation from the finder's frequency
    # table. Membership is tested against a set (hash lookup) rather than the ranked
    # list; iterating the table keeps the original insertion order, which decides how
    # most_common() orders ties.
    colloc_set = set(collocations)
    trigram_count_dict = Counter()
    for trigram, count in trigram_finder.ngram_fd.items():
        if trigram in colloc_set:
            trigram_count_dict[trigram] = count

    trigram_colloc_counts[key] = trigram_count_dict

print("Trigram Collocations:")
for key, value in trigram_colloc_dict.items():
    print(key)

print("Trigram Collocation Counts:")
for key in trigram_colloc_counts:
    print(key)
    # Show the 30 most frequent trigram collocations of this text.
    for item, count in trigram_colloc_counts[key].most_common(30):
        trigram = " ".join(item)
        print(f"{trigram} {count}")
    print()

dictionary_to_file(trigram_colloc_counts, output_folder, 'trigram_collocation_counts')

Trigram Collocations:
Théatre_corrected
Trigram Collocation Counts:
Théatre_corrected
comment peut faire 18
plus grand partie 16
auec plus grand 12
tout enfemble fois 12
plus grand plus 11
plus vray femblable 11
fois plus grand 10
tout ensemble fois 9
là peut entendre 9
tous autres animaux 9
differentes vnes autres 8
long temps fans 8
deux fois plus 8
rien foit plus 7
foit plus propre 7
plus longue durée 7
long temps apres 7
fois plus grande 7
mille cinq cents 7
plus grand nombre 6
tres bon tres 6
quel ineonuenient auroit 6
pourquoy non pource 6
fur tous autres 6
vaut autant dire 6
ainfi certes efcript 6
tue où vient 6
vingt quatre heures 6
plus haut ciel 5
ineonuenient auroit difions 5

Saved Théatre_corrected_trigram_collocation_counts.csv in ./tokenized/


In [19]:
# Rewrite each text's token stream so that every adjacent pair of tokens that is a
# known bigram collocation becomes one "first_second" token.
#
# NOTE(review): the collocations in colloc_dict were found on the stop-word-filtered
# token stream, while the scan below runs over the unfiltered `unigrams` stream. A
# collocation whose two words are separated by a stop word (or any other dropped
# token) in the text will therefore never match here — confirm this is intended.
underscore_dict = {}
for key, tokenized_words in unigrams.items():
    # Default to "no collocations" for a text missing from colloc_dict instead of
    # failing on None; the set makes each membership test a hash lookup rather than a
    # scan of the whole collocation list.
    colloc_set = set(colloc_dict.get(key, []))

    colloc_words = []

    # Walk the tokens left to right, greedily merging collocation pairs.
    i = 0
    while i < len(tokenized_words) - 1:
        pair = (tokenized_words[i], tokenized_words[i + 1])
        if pair in colloc_set:
            # Collocation found: emit the joined token and skip both words.
            colloc_words.append('_'.join(pair))
            i += 2
        else:
            # No collocation starts here: emit the word unchanged and move on by one.
            colloc_words.append(tokenized_words[i])
            i += 1

    # The loop stops one short of the end; emit the final word unless it was already
    # consumed as the second half of a merged pair.
    if i == len(tokenized_words) - 1:
        colloc_words.append(tokenized_words[i])
    underscore_dict[key] = colloc_words

print("Underscore Dictionary:")
for key in underscore_dict:
    print(key)

write_dict_to_files_with_suffix(underscore_dict, output_folder, 'underscore_bigrams')

Underscore Dictionary:
Théatre_corrected
Théatre_corrected_underscore_bigrams.txt in ./tokenized/


In [20]:
# Rewrite each text's token stream so that every run of three adjacent tokens that is
# a known trigram collocation becomes one "first_second_third" token.
#
# NOTE(review): the collocations in trigram_colloc_dict were found on the
# stop-word-filtered token stream, while the scan below runs over the unfiltered
# `unigrams` stream. A collocation whose words are separated by a stop word (or any
# other dropped token) in the text will therefore never match here — confirm this is
# intended.
trigram_underscore_dict = {}

for key, tokenized_words in unigrams.items():
    # A set makes each membership test a hash lookup rather than a scan of the whole
    # collocation list; the matches found are the same.
    colloc_set = set(trigram_colloc_dict.get(key, []))
    colloc_words = []
    i = 0
    while i < len(tokenized_words) - 2:
        trigram = (tokenized_words[i], tokenized_words[i + 1], tokenized_words[i + 2])
        if trigram in colloc_set:
            # Collocation found: emit the joined token and skip all three words.
            colloc_words.append('_'.join(trigram))
            i += 3
        else:
            # No collocation starts here: emit the word unchanged and move on by one.
            colloc_words.append(tokenized_words[i])
            i += 1
    # Emit whatever is left at the tail (at most two words), which is too short to
    # form a trigram.
    while i < len(tokenized_words):
        colloc_words.append(tokenized_words[i])
        i += 1
    trigram_underscore_dict[key] = colloc_words

print("Trigram underscore Dictionary:")
for key in trigram_underscore_dict:
    print(key)

write_dict_to_files_with_suffix(trigram_underscore_dict, output_folder, 'underscore_trigrams')

Trigram underscore Dictionary:
Théatre_corrected
Théatre_corrected_underscore_trigrams.txt in ./tokenized/
