In [1]:
from pathlib import Path
import glob
import os
import sys
import csv
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize,wordpunct_tokenize
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder, TrigramAssocMeasures

In [2]:
# Make the project's helper modules importable before importing from them.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# import its helpers on another machine unless this directory exists there.
sys.path.append("/home/lucas-jerusalimiec/Documents/OCR Text/Notebooks")
# Token cleaning, file writing and n-gram tuple -> string conversion helpers.
from tokenizer_func  import (wordcleaner, write_words_to_file, dictionary_to_file, convert_tuple_bigrams,
convert_tuple_trigrams)

# Helpers for previewing and pruning the per-document count dictionaries.
from extra_token_func import print_first_n_items, remove_keys_from_nested_dict

# Helper that turns a list of n-gram strings into counts.
from additional_token_func import convert_strings_to_counts

# Helper that writes one output file per document key with a given suffix.
from dict_write import write_dict_to_files_with_suffix

In [3]:
# Input/output locations for the whole notebook.
text_loc = Path("./final")
# glob returns matches in arbitrary (filesystem) order; sorting makes the order
# of documents -- and therefore of every dictionary built from them -- reproducible.
text_files = sorted(glob.glob(f"{text_loc}/*.txt"))
output_folder = './tokenized/'
tokenized_folder = Path(output_folder)
# parents=True so a nested output_folder can be configured without a manual mkdir.
tokenized_folder.mkdir(parents=True, exist_ok=True)
print(output_folder)
print("Text files in the spellchecked directory:", text_files)

./tokenized/
Text files in the spellchecked directory: ['final/Discours des raisons_corrected.txt']


In [4]:
# Read the comma-separated stopword list and show its last ten entries.
# The encoding is explicit because the list contains accented French words
# (e.g. 'esté'); without it, decoding depends on the platform's default locale.
with open('./stop_words.csv', 'r', encoding='utf-8') as f:
    stopwords = f.read().strip().split(",")
# Kept as a list (not a set) so it can be sliced here for a quick preview.
stopwords[-10:]

['w', 'iij', 'enl', 'ilz', 'ung', 'esté', 'seroit', 'mp', 'am', 'lx']

In [5]:
# Load every input text into memory, keyed by its file name without extension.
# NOTE: despite the name, `tokenized_texts` holds the raw, untokenized file contents.
tokenized_texts = {}
for txt in text_files:
    # Explicit UTF-8 so accented characters decode the same way on every platform.
    with open(txt, 'r', encoding='utf-8') as f:
        content = f.read()
    # os.path.basename already handles the platform's path separator, so no
    # manual splitting on backslashes is needed.
    key = os.path.splitext(os.path.basename(txt))[0]
    tokenized_texts[key] = content
print("Raw texts:", list(tokenized_texts.keys()))

Raw texts: ['Discours des raisons_corrected']


In [6]:
# Split each raw text into tokens and pass every token through `wordcleaner`.
# NOTE(review): `wordcleaner` is a project helper; the counts printed later show an
# empty-string token, so it presumably maps punctuation to '' -- confirm.
unigrams = {}

for key, value in tokenized_texts.items():
    unigram_list = wordpunct_tokenize(value)
    cleanwords = [wordcleaner(w) for w in unigram_list]
    unigrams[key] = cleanwords

# Save the cleaned tokens of each document, 20 words per line.
for key, value in unigrams.items():
    # Build the path from the configured output folder rather than repeating the
    # literal "./tokenized/", so changing `output_folder` moves these files too.
    filename = os.path.join(output_folder, f"{key}.txt")
    write_words_to_file(value, filename, words_per_line=20)
    print(f"Saved content for '{key}' to {filename}")

Saved content for 'Discours des raisons_corrected' to ./tokenized/Discours des raisons_corrected.txt


In [7]:
# List the document keys that now have a cleaned token list, one per line.
print("\n".join(["Unigram texts:", *unigrams]))

Unigram texts:
Discours des raisons_corrected


In [8]:
# Tally token frequencies per document: document key -> Counter(token -> occurrences).
unigram_counts = {
    doc_name: Counter(tokens)
    for doc_name, tokens in unigrams.items()
}

# Show which documents were counted, one key per line.
print("\n".join(["Unigram Counts:", *unigram_counts]))

# Preview the first 25 entries of each document's counts.
print_first_n_items(unigram_counts, 25)

Unigram Counts:
Discours des raisons_corrected
First 25 items in Discours des raisons_corrected:
: 1828
et: 441
de: 344
la: 233
à: 167
le: 163
les: 159
que: 156
l: 123
est: 120
qui: 98
en: 97
ne: 87
d: 81
qu: 80
ce: 76
plus: 74
ilz: 67
il: 64
du: 64
des: 62
par: 60
leur: 60
si: 53
pour: 52



In [9]:
# Drop the stopword entries from each document's unigram counts.
# NOTE(review): `remove_keys_from_nested_dict` is a project helper; whether it returns
# a new dictionary or mutates `unigram_counts` in place is not visible here -- confirm
# before relying on `unigram_counts` still containing stopwords after this cell.
stripped_unigrams = remove_keys_from_nested_dict(unigram_counts, stopwords)

# Preview the first 30 remaining entries per document.
print_first_n_items(stripped_unigrams, 30)

First 30 items in Discours des raisons_corrected:
plus: 74
si: 53
leurs: 52
roy: 47
mémoires: 36
sans: 25
comme: 24
hommes: 22
tous: 22
tant: 22
ceulx: 21
ceste: 21
aultres: 20
bien: 20
tout: 19
paix: 18
mal: 18
dieu: 18
guerre: 17
contre: 17
estat: 17
faire: 16
aultre: 16
toutes: 15
eulx: 15
prince: 15
ainsy: 15
subjects: 15
estre: 14
liberté: 14



In [10]:
# Write the stopword-filtered unigram counts to `output_folder`, using the
# 'unigram_counts' suffix (the recorded run saved "<document>_unigram_counts.csv").
dictionary_to_file(stripped_unigrams, output_folder, 'unigram_counts')

Saved Discours des raisons_corrected_unigram_counts.csv in ./tokenized/


In [11]:
# Build the list of adjacent word pairs for each document, ignoring stopwords.
bigrams = {}

# `stopwords` is a list; a set gives constant-time membership tests instead of
# scanning the whole list for every token.
stopword_set = set(stopwords)

for key, value in unigrams.items():
    # Stopwords are removed *before* pairing, so a bigram may join two words that
    # were separated by one or more stopwords in the original text.
    unigram_list = [word for word in value if word.lower() not in stopword_set]
    bigrams_list = list(nltk.bigrams(unigram_list))
    bigrams[key] = bigrams_list

print("Bigrams:")
for key in bigrams:
    print(key)

Bigrams:
Discours des raisons_corrected


In [12]:
# For each document, turn the bigram tuples into strings and count them.
bigram_counts = {
    doc_name: convert_strings_to_counts(convert_tuple_bigrams(pairs))
    for doc_name, pairs in bigrams.items()
}

# Show which documents were counted, one key per line.
print("\n".join(["Bigram Counts:", *bigram_counts]))

# Preview the first 30 entries of each document's bigram counts.
print_first_n_items(bigram_counts, 30)

Bigram Counts:
Discours des raisons_corrected
First 30 items in Discours des raisons_corrected:
toutes choses: 5
peult estre: 4
princes peuples: 4
bon droict: 4
tout ainsy: 4
ainsy disoit: 4
donner loy: 4
peuple romain: 4
roy plus: 3
plus grand: 3
tous ceulx: 3
cy devant: 3
aultre costé: 3
aujourd huy: 3
plus fort: 3
si roy: 3
aultre chose: 3
non seulement: 3
guerre paix: 2
point suivi: 2
toutes aultres: 2
aultres choses: 2
tous princes: 2
plus juste: 2
leurs entreprinses: 2
sans discipline: 2
avecque eulx: 2
leurs vies: 2
maisons femmes: 2
femmes enfans: 2



In [13]:
# Write the bigram counts to `output_folder`, using the 'bigram_counts' suffix
# (the recorded run saved "<document>_bigram_counts.csv").
dictionary_to_file(bigram_counts, output_folder, 'bigram_counts')

Saved Discours des raisons_corrected_bigram_counts.csv in ./tokenized/


In [14]:
# Build the list of adjacent word triples for each document, ignoring stopwords.
trigrams = {}

# `stopwords` is a list; a set gives constant-time membership tests instead of
# scanning the whole list for every token.
stopword_set = set(stopwords)

for key, value in unigrams.items():
    # Stopwords are removed *before* grouping, so a trigram may join words that
    # were separated by one or more stopwords in the original text.
    unigram_list = [word for word in value if word.lower() not in stopword_set]
    trigrams_list = list(nltk.trigrams(unigram_list))
    trigrams[key] = trigrams_list

print("Trigrams:")
# List the keys of `trigrams` itself (the original iterated `bigrams` here, which
# reported the wrong dictionary and fails if the bigram cell has not been run).
for key in trigrams:
    print(key)

Trigrams:
Discours des raisons_corrected


In [15]:
# For each document, turn the trigram tuples into strings and count them.
trigram_counts = {
    doc_name: convert_strings_to_counts(convert_tuple_trigrams(triples))
    for doc_name, triples in trigrams.items()
}

# Show which documents were counted, one key per line.
print("\n".join(["Trigram Counts:", *trigram_counts]))

# Preview the first 30 entries of each document's trigram counts.
print_first_n_items(trigram_counts, 30)

Trigram Counts:
Discours des raisons_corrected
First 30 items in Discours des raisons_corrected:
tous princes peuples: 2
maisons femmes enfans: 2
roy peult estre: 2
sorte si guerre: 2
donner loy subjects: 2
discours raisons persuasions: 1
raisons persuasions paix: 1
persuasions paix an: 1
paix an but: 1
an but guerre: 1
but guerre paix: 1
guerre paix laquelle: 1
paix laquelle sacquiert: 1
laquelle sacquiert composition: 1
sacquiert composition pleine: 1
composition pleine entiére: 1
pleine entiére victoire: 1
entiére victoire voix: 1
victoire voix composition: 1
voix composition semble: 1
composition semble mal: 1
semble mal séante: 1
mal séante deffiance: 1
séante deffiance réciproque: 1
deffiance réciproque mutitelles: 1
réciproque mutitelles haines: 1
mutitelles haines injures: 1
haines injures sub: 1
injures sub sistance: 1
sub sistance deux: 1



In [16]:
# Write the trigram counts to `output_folder`, using the 'trigram_counts' suffix
# (the recorded run saved "<document>_trigram_counts.csv").
dictionary_to_file(trigram_counts, output_folder, 'trigram_counts')

Saved Discours des raisons_corrected_trigram_counts.csv in ./tokenized/


In [17]:
# Find bigram collocations per document (ranked by PMI) and record how often each occurs.
colloc_dict = {}      # document key -> list of collocation bigram tuples, best PMI first
colloc_counts = {}    # document key -> Counter(bigram tuple -> occurrences)

# A bigram must occur at least this many times to be considered a collocation.
MIN_BIGRAM_FREQ = 3
# Upper bound on how many top-scoring bigrams are kept per document.
MAX_BIGRAM_COLLOCATIONS = 500

# `stopwords` is a list; a set gives constant-time membership tests.
stopword_set = set(stopwords)

for key, value in unigrams.items():
    # Collocations are searched in the stopword-free token sequence.
    unigram_list = [word for word in value if word.lower() not in stopword_set]
    bigram_finder = BigramCollocationFinder.from_words(unigram_list)
    bigram_finder.apply_freq_filter(MIN_BIGRAM_FREQ)
    collocations = bigram_finder.nbest(BigramAssocMeasures.pmi, MAX_BIGRAM_COLLOCATIONS)
    colloc_dict[key] = collocations

    # The frequency filter only drops bigrams below the threshold and leaves the
    # remaining counts untouched, so every collocation's count can be read from the
    # same finder; building a second, unfiltered finder is unnecessary.
    collocation_set = set(collocations)
    bigram_count_dict = Counter()
    for bigram, count in bigram_finder.ngram_fd.items():
        if bigram in collocation_set:
            bigram_count_dict[bigram] = count

    colloc_counts[key] = bigram_count_dict

print("Collocations:")
for key in colloc_dict:
    print(key)

print("Collocation Counts:")
for key in colloc_counts:
    print(key)
    # Show the 30 most frequent collocations as "word1 word2 count".
    for item, count in colloc_counts[key].most_common(30):
        bigram = " ".join(item)
        print(f"{bigram} {count}")
    print()

dictionary_to_file(colloc_counts, output_folder, 'collocation_counts')

Collocations:
Discours des raisons_corrected
Collocation Counts:
Discours des raisons_corrected
toutes choses 5
peult estre 4
princes peuples 4
bon droict 4
tout ainsy 4
ainsy disoit 4
donner loy 4
peuple romain 4
roy plus 3
plus grand 3
tous ceulx 3
cy devant 3
aultre costé 3
aujourd huy 3
plus fort 3
si roy 3
aultre chose 3
non seulement 3

Saved Discours des raisons_corrected_collocation_counts.csv in ./tokenized/


In [18]:
# Find trigram collocations per document (ranked by PMI) and record how often each occurs.
trigram_colloc_dict = {}      # document key -> list of collocation trigram tuples, best PMI first
trigram_colloc_counts = {}    # document key -> Counter(trigram tuple -> occurrences)

# A trigram must occur at least this many times to be considered a collocation.
MIN_TRIGRAM_FREQ = 3
# Upper bound on how many top-scoring trigrams are kept per document.
MAX_TRIGRAM_COLLOCATIONS = 500

# `stopwords` is a list; a set gives constant-time membership tests.
stopword_set = set(stopwords)

for key, value in unigrams.items():
    # Collocations are searched in the stopword-free token sequence.
    unigram_list = [word for word in value if word.lower() not in stopword_set]
    trigram_finder = TrigramCollocationFinder.from_words(unigram_list)
    trigram_finder.apply_freq_filter(MIN_TRIGRAM_FREQ)
    collocations = trigram_finder.nbest(TrigramAssocMeasures.pmi, MAX_TRIGRAM_COLLOCATIONS)
    trigram_colloc_dict[key] = collocations

    # Copy the counts of the selected collocations out of the (filtered) finder.
    collocation_set = set(collocations)
    trigram_count_dict = Counter()
    for trigram, count in trigram_finder.ngram_fd.items():
        if trigram in collocation_set:
            trigram_count_dict[trigram] = count

    trigram_colloc_counts[key] = trigram_count_dict

print("Trigram Collocations:")
for key in trigram_colloc_dict:
    print(key)

print("Trigram Collocation Counts:")
for key in trigram_colloc_counts:
    print(key)
    # Show the 30 most frequent collocations as "word1 word2 word3 count".
    for item, count in trigram_colloc_counts[key].most_common(30):
        trigram = " ".join(item)
        print(f"{trigram} {count}")
    print()

dictionary_to_file(trigram_colloc_counts, output_folder, 'trigram_collocation_counts')

Trigram Collocations:
Discours des raisons_corrected
Trigram Collocation Counts:
Discours des raisons_corrected

Saved Discours des raisons_corrected_trigram_collocation_counts.csv in ./tokenized/


In [19]:
# Rewrite each document's token list so that every bigram collocation becomes a
# single "word1_word2" token.
# NOTE(review): collocations were found in the stopword-free sequence, but matching
# here runs over the full token list, so only pairs that are directly adjacent in
# the full list are merged -- confirm this is the intended behaviour.
underscore_dict = {}
for key, tokenized_words in unigrams.items():
    # Default to no collocations when a document has none recorded (instead of
    # failing on `in None`); a set makes each membership test constant-time.
    collocation_set = set(colloc_dict.get(key, []))

    colloc_words = []
    last_index = len(tokenized_words) - 1

    # Greedy left-to-right scan: merge a matching pair and skip both words,
    # otherwise keep the current word and move on by one.
    i = 0
    while i < last_index:
        pair = (tokenized_words[i], tokenized_words[i + 1])
        if pair in collocation_set:
            colloc_words.append('_'.join(pair))
            i += 2
        else:
            colloc_words.append(tokenized_words[i])
            i += 1

    # The final word is still pending unless it was consumed by a merged pair.
    if i == last_index:
        colloc_words.append(tokenized_words[i])
    underscore_dict[key] = colloc_words

print("Underscore Dictionary:")
for key in underscore_dict:
    print(key)

write_dict_to_files_with_suffix(underscore_dict, output_folder, 'underscore_bigrams')

Underscore Dictionary:
Discours des raisons_corrected
Discours des raisons_corrected_underscore_bigrams.txt in ./tokenized/


In [20]:
# Rewrite each document's token list so that every trigram collocation becomes a
# single "word1_word2_word3" token.
trigram_underscore_dict = {}

for key, tokenized_words in unigrams.items():
    # A set makes each membership test constant-time instead of scanning the
    # collocation list once per token position.
    collocation_set = set(trigram_colloc_dict.get(key, []))
    colloc_words = []
    i = 0
    # Greedy left-to-right scan: merge a matching triple and skip all three words,
    # otherwise keep the current word and move on by one.
    while i < len(tokenized_words) - 2:
        trigram = (tokenized_words[i], tokenized_words[i + 1], tokenized_words[i + 2])
        if trigram in collocation_set:
            colloc_words.append('_'.join(trigram))
            i += 3
        else:
            colloc_words.append(tokenized_words[i])
            i += 1
    # Copy over whatever words remain at the end (at most two).
    colloc_words.extend(tokenized_words[i:])
    trigram_underscore_dict[key] = colloc_words

print("Trigram underscore Dictionary:")
for key in trigram_underscore_dict:
    print(key)

write_dict_to_files_with_suffix(trigram_underscore_dict, output_folder, 'underscore_trigrams')

Trigram underscore Dictionary:
Discours des raisons_corrected
Discours des raisons_corrected_underscore_trigrams.txt in ./tokenized/
