In [1]:
from pathlib import Path
import glob
import os
import sys
import csv
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize,wordpunct_tokenize
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder, TrigramAssocMeasures

In [2]:
sys.path.append("/home/lucas-jerusalimiec/Documents/OCR Text/Notebooks")
from tokenizer_func  import (wordcleaner, write_words_to_file, dictionary_to_file, convert_tuple_bigrams,
convert_tuple_trigrams)

from extra_token_func import print_first_n_items, remove_keys_from_nested_dict

from additional_token_func import convert_strings_to_counts

from dict_write import write_dict_to_files_with_suffix

In [3]:
text_loc = Path("./final")
text_files = glob.glob(f"{text_loc}/*.txt")
output_folder = './tokenized/'
tokenized_folder = Path(output_folder)
tokenized_folder.mkdir(exist_ok=True)

output_unigram = f'{output_folder}unigram_counts'
unigram_folder = Path(output_unigram)
unigram_folder.mkdir(exist_ok=True)

output_bigram = f'{output_folder}bigram_counts'
bigram_folder = Path(output_bigram)
bigram_folder.mkdir(exist_ok=True)

output_trigram = f'{output_folder}trigram_counts'
trigram_folder = Path(output_trigram)
trigram_folder.mkdir(exist_ok=True)

output_collocation = f'{output_folder}collocation_counts'
collocation_folder = Path(output_collocation)
collocation_folder.mkdir(exist_ok=True)

output_trigram_collocation = f'{output_folder}trigram_collocation_counts'
trigram_collocation_folder = Path(output_trigram_collocation)
trigram_collocation_folder.mkdir(exist_ok=True)

output_underscore = f'{output_folder}underscore_bigrams'
underscore_folder = Path(output_underscore)
underscore_folder.mkdir(exist_ok=True)

output_trigram_underscore = f'{output_folder}underscore_trigrams'
trigram_underscore_folder = Path(output_trigram_underscore)
trigram_underscore_folder.mkdir(exist_ok=True)

print("Text files in the spellchecked directory:", text_files)

Text files in the spellchecked directory: ['final/Harangue - Saint Germain_corrected.txt', 'final/Harangue - Orléans 2_corrected.txt', 'final/Harangue - religion_corrected.txt', 'final/Harangue - ouverture de parlement_corrected.txt', 'final/Lit de justice_corrected.txt', 'final/Harangue - parlement 3_corrected.txt', 'final/Harangue - lit de justice_corrected.txt', 'final/Harangue - Fontainebleau_corrected.txt', 'final/Harangue - septembre_corrected.txt', 'final/Harangue - parlement 2_corrected.txt', 'final/Harangue - parlement_corrected.txt', 'final/Harangue - Orléans_corrected.txt', 'final/Harangue - Poissy_corrected.txt', 'final/Harangue - Rouen_corrected.txt']


In [4]:
# Open stopwords CSV file and list the contents
with open('./stop_words.csv', 'r') as f:
    stopwords = f.read().strip().split(",")
stopwords[-10:]

['avecque', 'étoit', 'eust', 'feut', 're', 'tr', 'aa', 'tt', 'aaa', 'lzs']

In [5]:
tokenized_texts = {}
for txt in text_files:
    with open(txt, 'r') as f:
        content = f.read()
        file_name = txt.split('\\')[-1]
        #key = file_name.split('.')[0]
        key = os.path.splitext(os.path.basename(file_name))[0]
        tokenized_texts[key] = content
print("Raw texts:", list(tokenized_texts.keys()))       

Raw texts: ['Harangue - Saint Germain_corrected', 'Harangue - Orléans 2_corrected', 'Harangue - religion_corrected', 'Harangue - ouverture de parlement_corrected', 'Lit de justice_corrected', 'Harangue - parlement 3_corrected', 'Harangue - lit de justice_corrected', 'Harangue - Fontainebleau_corrected', 'Harangue - septembre_corrected', 'Harangue - parlement 2_corrected', 'Harangue - parlement_corrected', 'Harangue - Orléans_corrected', 'Harangue - Poissy_corrected', 'Harangue - Rouen_corrected']


In [6]:
unigrams = {}

for key, value in tokenized_texts.items():
    unigram_list = wordpunct_tokenize(value)
    cleanwords = [wordcleaner(w) for w in unigram_list]
    unigrams[key] = cleanwords

for key, value in unigrams.items():
    filename = f"./tokenized/{key}.txt"
    write_words_to_file(value, filename, words_per_line=20)
    print(f"Saved content for '{key}' to {filename}")

Saved content for 'Harangue - Saint Germain_corrected' to ./tokenized/Harangue - Saint Germain_corrected.txt
Saved content for 'Harangue - Orléans 2_corrected' to ./tokenized/Harangue - Orléans 2_corrected.txt
Saved content for 'Harangue - religion_corrected' to ./tokenized/Harangue - religion_corrected.txt
Saved content for 'Harangue - ouverture de parlement_corrected' to ./tokenized/Harangue - ouverture de parlement_corrected.txt
Saved content for 'Lit de justice_corrected' to ./tokenized/Lit de justice_corrected.txt
Saved content for 'Harangue - parlement 3_corrected' to ./tokenized/Harangue - parlement 3_corrected.txt
Saved content for 'Harangue - lit de justice_corrected' to ./tokenized/Harangue - lit de justice_corrected.txt
Saved content for 'Harangue - Fontainebleau_corrected' to ./tokenized/Harangue - Fontainebleau_corrected.txt
Saved content for 'Harangue - septembre_corrected' to ./tokenized/Harangue - septembre_corrected.txt
Saved content for 'Harangue - parlement 2_correct

In [7]:
print("Unigram texts:")
for key in unigrams:
    print(key)

Unigram texts:
Harangue - Saint Germain_corrected
Harangue - Orléans 2_corrected
Harangue - religion_corrected
Harangue - ouverture de parlement_corrected
Lit de justice_corrected
Harangue - parlement 3_corrected
Harangue - lit de justice_corrected
Harangue - Fontainebleau_corrected
Harangue - septembre_corrected
Harangue - parlement 2_corrected
Harangue - parlement_corrected
Harangue - Orléans_corrected
Harangue - Poissy_corrected
Harangue - Rouen_corrected


In [8]:
# Count up the tokens using a Counter() object
unigram_counts = {}
for key, value in unigrams.items():
    unigram_counts_dict = Counter(value)
    unigram_counts[key] = unigram_counts_dict

print("Unigram Counts:")
for key in unigram_counts:
    print(key)

print_first_n_items(unigram_counts, 25)

Unigram Counts:
Harangue - Saint Germain_corrected
Harangue - Orléans 2_corrected
Harangue - religion_corrected
Harangue - ouverture de parlement_corrected
Lit de justice_corrected
Harangue - parlement 3_corrected
Harangue - lit de justice_corrected
Harangue - Fontainebleau_corrected
Harangue - septembre_corrected
Harangue - parlement 2_corrected
Harangue - parlement_corrected
Harangue - Orléans_corrected
Harangue - Poissy_corrected
Harangue - Rouen_corrected
First 25 items in Harangue - Saint Germain_corrected:
: 668
de: 124
et: 102
que: 79
la: 61
en: 59
l: 58
à: 58
le: 48
qui: 43
les: 42
il: 39
a: 38
on: 33
du: 32
ne: 31
d: 30
est: 30
par: 29
roy: 29
nous: 28
pour: 24
ce: 24
vous: 24
qu: 23

First 25 items in Harangue - Orléans 2_corrected:
: 268
et: 89
de: 44
le: 35
qu: 29
que: 29
en: 23
les: 23
roy: 22
la: 19
pour: 18
il: 18
à: 18
du: 16
qui: 15
son: 14
l: 12
des: 12
avoient: 12
on: 11
ou: 11
se: 10
ilz: 10
n: 9
ce: 9

First 25 items in Harangue - religion_corrected:
: 496
et: 88
d

In [9]:
# Remove specified keys from the dictionary
stripped_unigrams = remove_keys_from_nested_dict(unigram_counts, stopwords)

print_first_n_items(stripped_unigrams, 30)

First 30 items in Harangue - Saint Germain_corrected:
roy: 29
dieu: 17
estre: 14
temps: 13
harangues: 12
comme: 12
nostre: 11
plus: 10
si: 10
faict: 9
leurs: 9
tous: 9
ceulx: 8
tout: 8
dire: 8
aultre: 8
peu: 7
religion: 7
bien: 7
mesme: 7
peult: 7
estant: 6
costé: 6
non: 6
tant: 6
église: 6
plusieurs: 6
quand: 6
ainsy: 6
loyx: 6

First 30 items in Harangue - Orléans 2_corrected:
roy: 22
leurs: 8
estats: 8
faire: 8
aultres: 7
estat: 7
estoient: 6
trois: 6
royne: 5
royaulme: 5
ordre: 5
harangues: 5
ceulx: 5
temps: 5
sans: 5
despenses: 5
plus: 5
dict: 5
où: 5
gabelles: 5
affaires: 4
peu: 4
avoir: 4
tel: 4
chascung: 4
prince: 4
millions: 4
chose: 4
six: 4
lesdicts: 4

First 30 items in Harangue - religion_corrected:
roy: 18
mal: 16
faict: 13
fault: 11
dieu: 11
faire: 11
si: 11
conseil: 10
ceste: 10
plus: 9
harangues: 9
comme: 9
religion: 8
court: 8
non: 8
princes: 8
sans: 8
leurs: 8
bonne: 7
ainsi: 7
temps: 7
édictz: 7
concile: 7
avoir: 6
bien: 6
roys: 6
cela: 6
tout: 6
advis: 5
aultres: 5

In [10]:
dictionary_to_file(stripped_unigrams, unigram_folder, 'unigram_counts')

Saved Harangue - Saint Germain_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Harangue - Orléans 2_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Harangue - religion_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Harangue - ouverture de parlement_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Lit de justice_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Harangue - parlement 3_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Harangue - lit de justice_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Harangue - Fontainebleau_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Harangue - septembre_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Harangue - parlement 2_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Harangue - parlement_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Harangue - Orléans_corrected_unigram_counts.csv

In [11]:
bigrams = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopwords]
    #unigram_list = [word for word in value]
    bigrams_list = list(nltk.bigrams(unigram_list))
    bigrams[key] = bigrams_list

print("Bigrams:")
for key in bigrams:
    print(key)

Bigrams:
Harangue - Saint Germain_corrected
Harangue - Orléans 2_corrected
Harangue - religion_corrected
Harangue - ouverture de parlement_corrected
Lit de justice_corrected
Harangue - parlement 3_corrected
Harangue - lit de justice_corrected
Harangue - Fontainebleau_corrected
Harangue - septembre_corrected
Harangue - parlement 2_corrected
Harangue - parlement_corrected
Harangue - Orléans_corrected
Harangue - Poissy_corrected
Harangue - Rouen_corrected


In [12]:
bigram_counts = {}

for key, value in bigrams.items():
    string_bigrams = convert_tuple_bigrams(value)
    bigramCount = convert_strings_to_counts(string_bigrams)
    bigram_counts[key] = bigramCount

print("Bigram Counts:")
for key in bigram_counts:
    print(key)

print_first_n_items(bigram_counts, 30)

Bigram Counts:
Harangue - Saint Germain_corrected
Harangue - Orléans 2_corrected
Harangue - religion_corrected
Harangue - ouverture de parlement_corrected
Lit de justice_corrected
Harangue - parlement 3_corrected
Harangue - lit de justice_corrected
Harangue - Fontainebleau_corrected
Harangue - septembre_corrected
Harangue - parlement 2_corrected
Harangue - parlement_corrected
Harangue - Orléans_corrected
Harangue - Poissy_corrected
Harangue - Rouen_corrected
First 30 items in Harangue - Saint Germain_corrected:
peult estre: 4
court parlement: 3
temps roy: 3
auparavant dieu: 2
amen dement: 2
tous jours: 2
faire combattre: 2
gens guerre: 2
roy nostre: 2
nostre souverain: 2
souverain seigneur: 2
dieu face: 2
non seulement: 2
encore mesme: 2
cause quoy: 2
princes sang: 2
présence roy: 2
assemblée évesques: 2
contenir peuple: 2
bien dira: 2
estre direz: 2
roy henry: 2
roy francois: 2
personnes comme: 2
ceste heure: 2
nostre roy: 2
tout ainsy: 2
adviz court: 2
celles là: 2
tout coup: 2

Firs

In [13]:
dictionary_to_file(bigram_counts, bigram_folder, 'bigram_counts')

Saved Harangue - Saint Germain_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Harangue - Orléans 2_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Harangue - religion_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Harangue - ouverture de parlement_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Lit de justice_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Harangue - parlement 3_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Harangue - lit de justice_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Harangue - Fontainebleau_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Harangue - septembre_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Harangue - parlement 2_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Harangue - parlement_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Harangue - Orléans_corrected_bigram_counts.csv in tokenized/bigram_co

In [14]:
trigrams = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopwords]
    #unigram_list = [word for word in value]
    trigrams_list = list(nltk.trigrams(unigram_list))
    trigrams[key] = trigrams_list

print("Trigrams:")
for key in bigrams:
    print(key)

Trigrams:
Harangue - Saint Germain_corrected
Harangue - Orléans 2_corrected
Harangue - religion_corrected
Harangue - ouverture de parlement_corrected
Lit de justice_corrected
Harangue - parlement 3_corrected
Harangue - lit de justice_corrected
Harangue - Fontainebleau_corrected
Harangue - septembre_corrected
Harangue - parlement 2_corrected
Harangue - parlement_corrected
Harangue - Orléans_corrected
Harangue - Poissy_corrected
Harangue - Rouen_corrected


In [15]:
trigram_counts = {}

for key, value in trigrams.items():
    string_trigrams = convert_tuple_trigrams(value)
    trigramCount = convert_strings_to_counts(string_trigrams)
    trigram_counts[key] = trigramCount

print("Trigram Counts:")
for key in trigram_counts:
    print(key)
    
print_first_n_items(trigram_counts, 30)

Trigram Counts:
Harangue - Saint Germain_corrected
Harangue - Orléans 2_corrected
Harangue - religion_corrected
Harangue - ouverture de parlement_corrected
Lit de justice_corrected
Harangue - parlement 3_corrected
Harangue - lit de justice_corrected
Harangue - Fontainebleau_corrected
Harangue - septembre_corrected
Harangue - parlement 2_corrected
Harangue - parlement_corrected
Harangue - Orléans_corrected
Harangue - Poissy_corrected
Harangue - Rouen_corrected
First 30 items in Harangue - Saint Germain_corrected:
roy nostre souverain: 2
nostre souverain seigneur: 2
peult estre direz: 2
temps roy francois: 2
adviz court parlement: 2
expérience monstré impossible: 2
harangue michel hospital: 1
michel hospital chancelier: 1
hospital chancelier france: 1
chancelier france assemblée: 1
france assemblée états: 1
assemblée états génenaux: 1
états génenaux assemblés: 1
génenaux assemblés saint: 1
assemblés saint germain: 1
saint germain laye: 1
germain laye aout: 1
laye aout sssisons: 1
aout ss

In [16]:
dictionary_to_file(trigram_counts, trigram_folder, 'trigram_counts')

Saved Harangue - Saint Germain_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Harangue - Orléans 2_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Harangue - religion_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Harangue - ouverture de parlement_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Lit de justice_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Harangue - parlement 3_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Harangue - lit de justice_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Harangue - Fontainebleau_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Harangue - septembre_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Harangue - parlement 2_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Harangue - parlement_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Harangue - Orléans_corrected_trigram_counts.csv

In [17]:
colloc_dict = {}
colloc_counts = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopwords]
    bigram_finder = BigramCollocationFinder.from_words(unigram_list)
    bigram_finder.apply_freq_filter(3)  # Make sure all collocations have occurred at least 5 times
    collocations = bigram_finder.nbest(BigramAssocMeasures.pmi, 500)
    colloc_dict[key] = collocations
    
    # Initialize Counter for colloc_counts
    bigram_count_dict = Counter()

    # Count the occurrences of each bigram in the text
    bigram_finder = BigramCollocationFinder.from_words(unigram_list)
    bigram_freqs = bigram_finder.ngram_fd.items()
    
    # Filter bigram counts based on collocations
    for bigram, count in bigram_freqs:
        if bigram in collocations:
            bigram_count_dict[bigram] = count

    colloc_counts[key] = bigram_count_dict

print("Collocations:")
for key, value in colloc_dict.items():
    print(key)
    # for w1, w2 in value:
    #     print(' ', w1, w2)

print("Collocation Counts:")
for key in colloc_counts:
    print(key)
    # Print first n items, assuming print_first_n_items function is defined elsewhere
    for item, count in colloc_counts[key].most_common(30):
        bigram = " ".join(item)
        print(f"{bigram} {count}")
    print()

dictionary_to_file(colloc_counts, collocation_folder, 'collocation_counts')

Collocations:
Harangue - Saint Germain_corrected
Harangue - Orléans 2_corrected
Harangue - religion_corrected
Harangue - ouverture de parlement_corrected
Lit de justice_corrected
Harangue - parlement 3_corrected
Harangue - lit de justice_corrected
Harangue - Fontainebleau_corrected
Harangue - septembre_corrected
Harangue - parlement 2_corrected
Harangue - parlement_corrected
Harangue - Orléans_corrected
Harangue - Poissy_corrected
Harangue - Rouen_corrected
Collocation Counts:
Harangue - Saint Germain_corrected
peult estre 4
court parlement 3
temps roy 3

Harangue - Orléans 2_corrected
roy royne 3
trois estats 3
louis xii 3

Harangue - religion_corrected
tout coup 3
bonne volonté 3

Harangue - ouverture de parlement_corrected
ceste compaignie 4
ceste court 4
comme dict 3
si ceulx 3
ceulx ceste 3
ladicte court 3

Lit de justice_corrected
trop grande 4
adressant parole 3
ordon nances 3
non poinct 3

Harangue - parlement 3_corrected
chef guerre 3

Harangue - lit de justice_corrected
mille

In [18]:
trigram_colloc_dict = {}
trigram_colloc_counts = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopwords]
    trigram_finder = TrigramCollocationFinder.from_words(unigram_list)
    trigram_finder.apply_freq_filter(3)  # Ensure all collocations have occurred at least 5 times
    collocations = trigram_finder.nbest(TrigramAssocMeasures.pmi, 500)
    trigram_colloc_dict[key] = collocations
    
    # Initialize Counter for trigram_colloc_counts
    trigram_count_dict = Counter()

    # Count the occurrences of each trigram in the text
    trigram_freqs = trigram_finder.ngram_fd.items()
    
    # Filter trigram counts based on collocations
    for trigram, count in trigram_freqs:
        if trigram in collocations:
            trigram_count_dict[trigram] = count

    trigram_colloc_counts[key] = trigram_count_dict

print("Trigram Collocations:")
for key, value in trigram_colloc_dict.items():
    print(key)
    #for w1, w2, w3 in value:
    #    print(' ', w1, w2, w3)

print("Trigram Collocation Counts:")
for key in trigram_colloc_counts:
    print(key)
    # Print first n items, assuming print_first_n_items function is defined elsewhere
    for item, count in trigram_colloc_counts[key].most_common(30):
        trigram = " ".join(item)
        print(f"{trigram} {count}")
    print()

dictionary_to_file(trigram_colloc_counts, trigram_collocation_folder, 'trigram_collocation_counts')

Trigram Collocations:
Harangue - Saint Germain_corrected
Harangue - Orléans 2_corrected
Harangue - religion_corrected
Harangue - ouverture de parlement_corrected
Lit de justice_corrected
Harangue - parlement 3_corrected
Harangue - lit de justice_corrected
Harangue - Fontainebleau_corrected
Harangue - septembre_corrected
Harangue - parlement 2_corrected
Harangue - parlement_corrected
Harangue - Orléans_corrected
Harangue - Poissy_corrected
Harangue - Rouen_corrected
Trigram Collocation Counts:
Harangue - Saint Germain_corrected

Harangue - Orléans 2_corrected

Harangue - religion_corrected

Harangue - ouverture de parlement_corrected

Lit de justice_corrected

Harangue - parlement 3_corrected

Harangue - lit de justice_corrected
mille livres tournoys 5

Harangue - Fontainebleau_corrected

Harangue - septembre_corrected

Harangue - parlement 2_corrected

Harangue - parlement_corrected

Harangue - Orléans_corrected
nostre jeune roy 3

Harangue - Poissy_corrected

Harangue - Rouen_correcte

In [19]:
underscore_dict = {}
for key, value in unigrams.items():

    tokenized_words = unigrams.get(key)
    collocations = colloc_dict.get(key)
    
    colloc_words = []
    
    # Iterate through the words making new versions combining collocations
    i = 0
    while i < len(tokenized_words) - 1:
        # If we find a collocation, add and advance by two words
        if (tokenized_words[i], tokenized_words[i + 1]) in collocations:
            colloc_words.append('_'.join((tokenized_words[i], tokenized_words[i + 1])))
            i += 2
        # Otherwise, advance by one word
        else:
            colloc_words.append(tokenized_words[i])
            i += 1

    # Add the last word (if any)
    if i == len(tokenized_words) - 1:
        colloc_words.append(tokenized_words[i])
    underscore_dict[key] = colloc_words

print("Underscore Dictionary:")
for key in underscore_dict:
    print(key)

write_dict_to_files_with_suffix(underscore_dict, underscore_folder, 'underscore_bigrams')

Underscore Dictionary:
Harangue - Saint Germain_corrected
Harangue - Orléans 2_corrected
Harangue - religion_corrected
Harangue - ouverture de parlement_corrected
Lit de justice_corrected
Harangue - parlement 3_corrected
Harangue - lit de justice_corrected
Harangue - Fontainebleau_corrected
Harangue - septembre_corrected
Harangue - parlement 2_corrected
Harangue - parlement_corrected
Harangue - Orléans_corrected
Harangue - Poissy_corrected
Harangue - Rouen_corrected
Harangue - Saint Germain_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Harangue - Orléans 2_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Harangue - religion_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Harangue - ouverture de parlement_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Lit de justice_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Harangue - parlement 3_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
H

In [20]:
trigram_underscore_dict = {}

for key, tokenized_words in unigrams.items():
    collocations = trigram_colloc_dict.get(key, [])
    colloc_words = []
    i = 0
    while i < len(tokenized_words) - 2:
        # If we find a trigram collocation, add and advance by three words
        trigram = (tokenized_words[i], tokenized_words[i + 1], tokenized_words[i + 2])
        if trigram in collocations:
            colloc_words.append('_'.join(trigram))
            i += 3
        else:
            colloc_words.append(tokenized_words[i])
            i += 1
    # Add the last words (if any)
    while i < len(tokenized_words):
        colloc_words.append(tokenized_words[i])
        i += 1
    trigram_underscore_dict[key] = colloc_words

print("Trigram underscore Dictionary:")
for key in trigram_underscore_dict:
    print(key)

write_dict_to_files_with_suffix(trigram_underscore_dict, trigram_underscore_folder, 'underscore_trigrams')

Trigram underscore Dictionary:
Harangue - Saint Germain_corrected
Harangue - Orléans 2_corrected
Harangue - religion_corrected
Harangue - ouverture de parlement_corrected
Lit de justice_corrected
Harangue - parlement 3_corrected
Harangue - lit de justice_corrected
Harangue - Fontainebleau_corrected
Harangue - septembre_corrected
Harangue - parlement 2_corrected
Harangue - parlement_corrected
Harangue - Orléans_corrected
Harangue - Poissy_corrected
Harangue - Rouen_corrected
Harangue - Saint Germain_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Harangue - Orléans 2_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Harangue - religion_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Harangue - ouverture de parlement_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Lit de justice_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Harangue - parlement 3_corrected_underscore_trigrams.txt in tokenized/u