In [1]:
from pathlib import Path
import glob
import os
import sys
import csv
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize,wordpunct_tokenize
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder, TrigramAssocMeasures

In [2]:
# Extend the module search path so the helper modules imported just below can be found.
# NOTE(review): this is a hard-coded absolute path into one user's home directory; unless
# the helper modules are otherwise importable, the notebook will not run on another
# machine. Consider a path relative to the notebook or a configurable constant.
sys.path.append("/home/lucas-jerusalimiec/Documents/OCR Text/Notebooks")
from tokenizer_func  import (wordcleaner, write_words_to_file, dictionary_to_file, convert_tuple_bigrams,
convert_tuple_trigrams)

from extra_token_func import print_first_n_items, remove_keys_from_nested_dict

from additional_token_func import convert_strings_to_counts

from dict_write import write_dict_to_files_with_suffix

In [3]:
# Input/output locations: every .txt file directly under ./final is an input text,
# and all derived files are written under ./tokenized/.
text_loc = Path("./final")
# glob returns matches in arbitrary (filesystem-dependent) order; sorting makes the
# processing order, and therefore the printed listings, reproducible across runs.
text_files = sorted(glob.glob(f"{text_loc}/*.txt"))
output_folder = './tokenized/'
tokenized_folder = Path(output_folder)
# Create the output folder on first run; exist_ok keeps the cell re-runnable.
tokenized_folder.mkdir(exist_ok=True)
print(output_folder)
print("Text files in the spellchecked directory:", text_files)

./tokenized/
Text files in the spellchecked directory: ['final/République_corrected.txt']


In [4]:
# Load the stop-word list: the whole file is read, surrounding whitespace is stripped,
# and the remainder is split on commas.
# The encoding is pinned to UTF-8 because the list contains accented entries (e.g. 'dé');
# relying on the platform default would mis-decode them on non-UTF-8 systems, and the
# affected stop words would then silently fail to match any token.
with open('./stop_words.csv', 'r', encoding='utf-8') as f:
    stopwords = f.read().strip().split(",")
# Display the last ten entries as a quick sanity check.
stopwords[-10:]

['ie', 'fc', 'del', 'o', 'dé', 'dela', 'ay', 'w', 'iij', 'enl']

In [5]:
# Read each input file fully into memory, keyed by its base name without extension.
# NOTE: despite its name, `tokenized_texts` holds the raw, untokenized text of each file.
tokenized_texts = {}
for txt in text_files:
    # Decode explicitly as UTF-8: the texts contain accented characters, and the
    # platform default encoding is not UTF-8 everywhere.
    with open(txt, 'r', encoding='utf-8') as f:
        content = f.read()
    # Take whatever follows the last backslash (Windows-style separator), then let
    # os.path strip any remaining directory part and the extension.
    file_name = txt.split('\\')[-1]
    key = os.path.splitext(os.path.basename(file_name))[0]
    tokenized_texts[key] = content
print("Raw texts:", list(tokenized_texts.keys()))

Raw texts: ['République_corrected']


In [6]:
# Tokenize every raw text with wordpunct_tokenize and pass each token through
# wordcleaner (a helper from the local tokenizer_func module).
unigrams = {
    key: [wordcleaner(token) for token in wordpunct_tokenize(text)]
    for key, text in tokenized_texts.items()
}

# Save each cleaned token list, 20 words per line. The path is built from
# `output_folder` rather than a second hard-coded './tokenized/' literal, so that
# changing the configured output location moves these files as well.
for key, words in unigrams.items():
    filename = f"{output_folder}{key}.txt"
    write_words_to_file(words, filename, words_per_line=20)
    print(f"Saved content for '{key}' to {filename}")

Saved content for 'République_corrected' to ./tokenized/République_corrected.txt


In [7]:
# List the name of every text that has a unigram token list: header first,
# then one key per line.
print("Unigram texts:", *unigrams, sep="\n")

Unigram texts:
République_corrected


In [8]:
# Tally how often each token occurs in each text, using one Counter per text.
unigram_counts = {
    text_name: Counter(tokens)
    for text_name, tokens in unigrams.items()
}

# Header followed by one text name per line.
print("Unigram Counts:", *unigram_counts, sep="\n")

# Preview the start of each text's counts.
print_first_n_items(unigram_counts, 25)

Unigram Counts:
République_corrected
First 25 items in République_corrected:
: 110281
de: 13322
les: 7127
la: 6794
que: 6167
en: 5978
il: 4959
d: 4959
qui: 4934
l: 4929
le: 4928
qu: 4723
des: 4337
à: 4133
eft: 3062
n: 2867
pour: 2771
du: 2747
par: 2453
ne: 2368
au: 2331
plus: 2294
ce: 2109
ou: 1971
vn: 1932



In [9]:
# Filter the per-text unigram counts against the stop-word list
# (remove_keys_from_nested_dict is a helper from the local extra_token_func module).
stripped_unigrams = remove_keys_from_nested_dict(
    unigram_counts,
    stopwords,
)

# Preview what is left after filtering.
print_first_n_items(
    stripped_unigrams,
    30,
)

First 25 items in République_corrected:
plus: 2294
comme: 1521
roy: 1372
bien: 1333
peuple: 895
dit: 831
eftat: 829
peut: 803
prince: 790
fait: 753
faire: 747
autres: 739
fans: 719
point: 711
auffi: 658
tous: 621
tout: 616
autre: 616
foit: 583
fugets: 579
deux: 557
princes: 545
fes: 532
apres: 509
republique: 503



In [10]:
# Write the stop-word-filtered unigram counts to `output_folder`, tagged 'unigram_counts'.
# dictionary_to_file comes from the local tokenizer_func module; judging by the recorded
# output it saves one "<text>_unigram_counts.csv" per text — confirm against the helper.
dictionary_to_file(stripped_unigrams, output_folder, 'unigram_counts')

Saved République_corrected_unigram_counts.csv in ./tokenized/


In [11]:
# Build the list of adjacent word pairs for each text, computed over the token
# stream with stop words removed (so pairs may bridge a dropped stop word).
bigrams = {}

# Membership is tested once per token; a set makes each test O(1) instead of a
# linear scan of the whole stop-word list.
stopword_set = set(stopwords)

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopword_set]
    bigrams[key] = list(nltk.bigrams(unigram_list))

print("Bigrams:")
for key in bigrams:
    print(key)

Bigrams:
République_corrected


In [12]:
# For each text, turn the bigram tuples into strings (convert_tuple_bigrams) and
# then into counts (convert_strings_to_counts); both are local helper functions.
bigram_counts = {
    text_name: convert_strings_to_counts(convert_tuple_bigrams(pairs))
    for text_name, pairs in bigrams.items()
}

# Header followed by one text name per line.
print("Bigram Counts:", *bigram_counts, sep="\n")

# Preview the start of each text's bigram counts.
print_first_n_items(bigram_counts, 30)

Bigram Counts:
République_corrected
First 30 items in République_corrected:
eftat populaire: 163
comme dit: 132
prince fouuerain: 98
plus grand: 90
plus grands: 90
roy france: 86
cy deffus: 83
peut faire: 80
menu peuple: 80
peut voir: 77
livre premier: 71
ains auffi: 70
peut dire: 69
bien fouuent: 68
tout ainfi: 63
beaucoup plus: 63
fes fugets: 61
non feulement: 59
non plus: 58
comme fift: 58
corps colleges: 56
peu peu: 55
plus grande: 53
quele roy: 50
foy hommage: 49
cens mil: 48
deux cens: 47
livre sixiesme: 47
comme peut: 46
autres princes: 45



In [13]:
# Write the bigram counts to `output_folder`, tagged 'bigram_counts'.
# dictionary_to_file comes from the local tokenizer_func module; judging by the recorded
# output it saves one "<text>_bigram_counts.csv" per text — confirm against the helper.
dictionary_to_file(bigram_counts, output_folder, 'bigram_counts')

Saved République_corrected_bigram_counts.csv in ./tokenized/


In [14]:
# Build the list of adjacent word triples for each text, computed over the token
# stream with stop words removed (so triples may bridge dropped stop words).
trigrams = {}

# Membership is tested once per token; a set makes each test O(1) instead of a
# linear scan of the whole stop-word list.
stopword_set = set(stopwords)

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopword_set]
    trigrams[key] = list(nltk.trigrams(unigram_list))

print("Trigrams:")
# List the keys of `trigrams` itself; the previous version iterated `bigrams` here,
# a copy-paste slip that would hide any mismatch between the two dictionaries.
for key in trigrams:
    print(key)

Trigrams:
République_corrected


In [15]:
# For each text, turn the trigram tuples into strings (convert_tuple_trigrams) and
# then into counts (convert_strings_to_counts); both are local helper functions.
trigram_counts = {
    text_name: convert_strings_to_counts(convert_tuple_trigrams(triples))
    for text_name, triples in trigrams.items()
}

# Header followed by one text name per line.
print("Trigram Counts:", *trigram_counts, sep="\n")

# Preview the start of each text's trigram counts.
print_first_n_items(trigram_counts, 30)

Trigram Counts:
République_corrected
First 30 items in République_corrected:
comme peut voir: 33
dit cy deffus: 24
monftré cy deffus: 21
dit tite liue: 18
tire apres foy: 17
comme auons dit: 14
comme cas pareil: 13
cens mil efcus: 13
cens mil liures: 13
auons monftré cy: 13
mil cinq cens: 13
fept cens ans: 11
comme dit plutarque: 10
quatre cens mil: 10
plus grand nombre: 10
trois hautes planettes: 10
quel eftat populaire: 9
quelque forte foit: 8
feditions guerres ciuiles: 8
loüys roy france: 8
foy hommage lige: 8
tant foit peu: 7
plus haut point: 7
ferons mefme iugement: 7
ainfi peut voir: 7
traité fait entre: 7
fans aller plus: 7
deux cens mil: 7
deux cens ans: 7
trois cens mil: 7



In [16]:
# Write the trigram counts to `output_folder`, tagged 'trigram_counts'.
# dictionary_to_file comes from the local tokenizer_func module; judging by the recorded
# output it saves one "<text>_trigram_counts.csv" per text — confirm against the helper.
dictionary_to_file(trigram_counts, output_folder, 'trigram_counts')

Saved République_corrected_trigram_counts.csv in ./tokenized/


In [17]:
# Find bigram collocations per text (top-N by PMI among sufficiently frequent
# bigrams of the stop-word-filtered token stream) and record how often each occurs.
colloc_dict = {}    # text name -> list of collocation bigrams, best PMI first
colloc_counts = {}  # text name -> Counter mapping each collocation bigram to its frequency

BIGRAM_MIN_FREQ = 3    # bigrams seen fewer times than this are discarded before scoring
BIGRAM_N_BEST = 500    # number of top-PMI bigrams kept per text
stopword_set = set(stopwords)  # O(1) stop-word lookups instead of scanning the list

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopword_set]
    bigram_finder = BigramCollocationFinder.from_words(unigram_list)
    # Make sure all collocations have occurred at least BIGRAM_MIN_FREQ times.
    bigram_finder.apply_freq_filter(BIGRAM_MIN_FREQ)
    collocations = bigram_finder.nbest(BigramAssocMeasures.pmi, BIGRAM_N_BEST)
    colloc_dict[key] = collocations

    # Every collocation survived the frequency filter, so its count is still present
    # in this finder's ngram_fd; a second, unfiltered finder is unnecessary.
    # Iterating ngram_fd (rather than `collocations`) keeps the original insertion
    # order, and the set makes each membership test O(1).
    collocation_set = set(collocations)
    bigram_count_dict = Counter()
    for bigram, count in bigram_finder.ngram_fd.items():
        if bigram in collocation_set:
            bigram_count_dict[bigram] = count

    colloc_counts[key] = bigram_count_dict

print("Collocations:")
for key in colloc_dict:
    print(key)

print("Collocation Counts:")
for key in colloc_counts:
    print(key)
    # Show the 30 most frequent collocations for this text.
    for item, count in colloc_counts[key].most_common(30):
        bigram = " ".join(item)
        print(f"{bigram} {count}")
    print()

dictionary_to_file(colloc_counts, output_folder, 'collocation_counts')

Collocations:
République_corrected
Collocation Counts:
République_corrected
tite liue 45
guerres ciuiles 43
lettres patentes 25
deflors auant 18
grecs latins 16
marc antoine 16
hautes planettes 16
voix deliberatiue 15
philippe valois 14
dira quelqu 13
proportion geometrique 13
marc varron 11
offenfiue defenfiue 11
chambre comptes 11
places fortes 10
naples sicile 10
leon afrique 10
di ateur 10
difcipline militaire 10
cours fouueraines 9
attaint conuaincu 9
caton cenfeur 8
thomas more 8
diuines humaines 8
nom collectif 8
prefenter requefte 8
venir bout 8
scipion africain 7
bonnes meurs 7
don ner 7

Saved République_corrected_collocation_counts.csv in ./tokenized/


In [18]:
# Find trigram collocations per text (top-N by PMI among sufficiently frequent
# trigrams of the stop-word-filtered token stream) and record how often each occurs.
trigram_colloc_dict = {}    # text name -> list of collocation trigrams, best PMI first
trigram_colloc_counts = {}  # text name -> Counter mapping each collocation trigram to its frequency

TRIGRAM_MIN_FREQ = 3    # trigrams seen fewer times than this are discarded before scoring
TRIGRAM_N_BEST = 500    # number of top-PMI trigrams kept per text
stopword_set = set(stopwords)  # O(1) stop-word lookups instead of scanning the list

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopword_set]
    trigram_finder = TrigramCollocationFinder.from_words(unigram_list)
    # Ensure all collocations have occurred at least TRIGRAM_MIN_FREQ times.
    trigram_finder.apply_freq_filter(TRIGRAM_MIN_FREQ)
    collocations = trigram_finder.nbest(TrigramAssocMeasures.pmi, TRIGRAM_N_BEST)
    trigram_colloc_dict[key] = collocations

    # Keep only the counts of the selected collocations. Iterating ngram_fd keeps
    # its insertion order; the set makes each membership test O(1) instead of a
    # scan of up to TRIGRAM_N_BEST tuples.
    collocation_set = set(collocations)
    trigram_count_dict = Counter()
    for trigram, count in trigram_finder.ngram_fd.items():
        if trigram in collocation_set:
            trigram_count_dict[trigram] = count

    trigram_colloc_counts[key] = trigram_count_dict

print("Trigram Collocations:")
for key in trigram_colloc_dict:
    print(key)

print("Trigram Collocation Counts:")
for key in trigram_colloc_counts:
    print(key)
    # Show the 30 most frequent trigram collocations for this text.
    for item, count in trigram_colloc_counts[key].most_common(30):
        trigram = " ".join(item)
        print(f"{trigram} {count}")
    print()

dictionary_to_file(trigram_colloc_counts, output_folder, 'trigram_collocation_counts')

Trigram Collocations:
République_corrected
Trigram Collocation Counts:
République_corrected
comme peut voir 33
dit cy deffus 24
monftré cy deffus 21
dit tite liue 18
tire apres foy 17
comme auons dit 14
comme cas pareil 13
cens mil efcus 13
cens mil liures 13
auons monftré cy 13
mil cinq cens 13
fept cens ans 11
comme dit plutarque 10
quatre cens mil 10
plus grand nombre 10
trois hautes planettes 10
quel eftat populaire 9
quelque forte foit 8
feditions guerres ciuiles 8
loüys roy france 8
foy hommage lige 8
tant foit peu 7
plus haut point 7
ferons mefme iugement 7
ainfi peut voir 7
traité fait entre 7
fans aller plus 7
deux cens mil 7
deux cens ans 7
trois cens mil 7

Saved République_corrected_trigram_collocation_counts.csv in ./tokenized/


In [19]:
# Rewrite each text's token list so that every adjacent pair that is a known bigram
# collocation is fused into a single "word1_word2" token.
# NOTE(review): the collocations were computed on the stop-word-filtered stream, but the
# matching below runs on the unfiltered `unigrams`, so a collocation whose two words are
# separated by a stop word in the text will not be fused — confirm this is intended.
underscore_dict = {}
for key, tokenized_words in unigrams.items():
    # Default to "no collocations" when a text has no entry (a bare .get() would
    # return None and make the membership test raise TypeError). The set gives O(1)
    # lookups for a test that runs once per token.
    collocations = set(colloc_dict.get(key, []))

    colloc_words = []
    last_index = len(tokenized_words) - 1

    # Walk the tokens left to right, greedily fusing collocation pairs.
    i = 0
    while i < last_index:
        pair = (tokenized_words[i], tokenized_words[i + 1])
        if pair in collocations:
            # Collocation found: emit the fused token and skip both words.
            colloc_words.append('_'.join(pair))
            i += 2
        else:
            # Otherwise, keep the word as is and advance by one.
            colloc_words.append(tokenized_words[i])
            i += 1

    # The final word is still pending unless it was consumed by a fused pair.
    if i == last_index:
        colloc_words.append(tokenized_words[i])
    underscore_dict[key] = colloc_words

print("Underscore Dictionary:")
for key in underscore_dict:
    print(key)

write_dict_to_files_with_suffix(underscore_dict, output_folder, 'underscore_bigrams')

Underscore Dictionary:
République_corrected
République_corrected_underscore_bigrams.txt in ./tokenized/


In [20]:
# Rewrite each text's token list so that every run of three adjacent tokens that is a
# known trigram collocation is fused into a single "word1_word2_word3" token.
# NOTE(review): the collocations were computed on the stop-word-filtered stream, but the
# matching below runs on the unfiltered `unigrams`, so a collocation interrupted by a
# stop word in the text will not be fused — confirm this is intended.
trigram_underscore_dict = {}

for key, tokenized_words in unigrams.items():
    # A set gives O(1) lookups; the test runs once per token position, so scanning
    # a list of collocations each time is needlessly slow.
    collocations = set(trigram_colloc_dict.get(key, []))
    colloc_words = []
    i = 0
    while i < len(tokenized_words) - 2:
        trigram = (tokenized_words[i], tokenized_words[i + 1], tokenized_words[i + 2])
        if trigram in collocations:
            # Collocation found: emit the fused token and skip all three words.
            colloc_words.append('_'.join(trigram))
            i += 3
        else:
            colloc_words.append(tokenized_words[i])
            i += 1
    # Copy over the trailing words (at most two) that cannot start a trigram.
    colloc_words.extend(tokenized_words[i:])
    trigram_underscore_dict[key] = colloc_words

print("Trigram underscore Dictionary:")
for key in trigram_underscore_dict:
    print(key)

write_dict_to_files_with_suffix(trigram_underscore_dict, output_folder, 'underscore_trigrams')

Trigram underscore Dictionary:
République_corrected
République_corrected_underscore_trigrams.txt in ./tokenized/
