In [1]:
from pathlib import Path
import glob
import os
import sys
import csv
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize,wordpunct_tokenize
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder, TrigramAssocMeasures

import locale
# Set the locale to your desired setting (e.g., 'fr_FR.UTF-8' for French)
locale.setlocale(locale.LC_ALL, 'fr_FR.UTF-8')

'fr_FR.UTF-8'

In [2]:
sys.path.append("/home/lucas-jerusalimiec/Documents/OCR Text/Notebooks")
from tokenizer_func  import (wordcleaner, write_words_to_file, dictionary_to_file, convert_tuple_bigrams,
convert_tuple_trigrams)

from extra_token_func import print_first_n_items, remove_keys_from_nested_dict

from additional_token_func import convert_strings_to_counts

from dict_write import write_dict_to_files_with_suffix

In [3]:
text_loc = Path("./final")
text_files = sorted(glob.glob(f"{text_loc}/*.txt"), key=locale.strxfrm)
output_folder = './tokenized/'
tokenized_folder = Path(output_folder)
tokenized_folder.mkdir(exist_ok=True)

output_unigram = f'{output_folder}unigram_counts'
unigram_folder = Path(output_unigram)
unigram_folder.mkdir(exist_ok=True)

output_bigram = f'{output_folder}bigram_counts'
bigram_folder = Path(output_bigram)
bigram_folder.mkdir(exist_ok=True)

output_trigram = f'{output_folder}trigram_counts'
trigram_folder = Path(output_trigram)
trigram_folder.mkdir(exist_ok=True)

output_collocation = f'{output_folder}collocation_counts'
collocation_folder = Path(output_collocation)
collocation_folder.mkdir(exist_ok=True)

output_trigram_collocation = f'{output_folder}trigram_collocation_counts'
trigram_collocation_folder = Path(output_trigram_collocation)
trigram_collocation_folder.mkdir(exist_ok=True)

output_underscore = f'{output_folder}underscore_bigrams'
underscore_folder = Path(output_underscore)
underscore_folder.mkdir(exist_ok=True)

output_trigram_underscore = f'{output_folder}underscore_trigrams'
trigram_underscore_folder = Path(output_trigram_underscore)
trigram_underscore_folder.mkdir(exist_ok=True)

print("Text files in the spellchecked directory:", text_files)

Text files in the spellchecked directory: ['final/Démonomanie I_corrected.txt', 'final/Démonomanie II_corrected.txt', 'final/Démonomanie III_corrected.txt', 'final/Démonomanie IV_corrected.txt', 'final/Démonomanie preface Repair_corrected.txt']


In [4]:
# Open stopwords CSV file and list the contents
with open('./stop_words.csv', 'r') as f:
    stopwords = f.read().strip().split(",")
stopwords[-10:]

['v', 'vn', 'vne', 'vnes', 'w', 'x', 'ya', 'encores', 'quele', 'queles']

In [5]:
tokenized_texts = {}
for txt in text_files:
    with open(txt, 'r') as f:
        content = f.read()
        file_name = txt.split('\\')[-1]
        #key = file_name.split('.')[0]
        key = os.path.splitext(os.path.basename(file_name))[0]
        tokenized_texts[key] = content
print("Raw texts:", list(tokenized_texts.keys()))       

Raw texts: ['Démonomanie I_corrected', 'Démonomanie II_corrected', 'Démonomanie III_corrected', 'Démonomanie IV_corrected', 'Démonomanie preface Repair_corrected']


In [6]:
unigrams = {}

for key, value in tokenized_texts.items():
    unigram_list = wordpunct_tokenize(value)
    cleanwords = [wordcleaner(w) for w in unigram_list]
    unigrams[key] = cleanwords

for key, value in unigrams.items():
    filename = f"./tokenized/{key}.txt"
    write_words_to_file(value, filename, words_per_line=20)
    print(f"Saved content for '{key}' to {filename}")

Saved content for 'Démonomanie I_corrected' to ./tokenized/Démonomanie I_corrected.txt
Saved content for 'Démonomanie II_corrected' to ./tokenized/Démonomanie II_corrected.txt
Saved content for 'Démonomanie III_corrected' to ./tokenized/Démonomanie III_corrected.txt
Saved content for 'Démonomanie IV_corrected' to ./tokenized/Démonomanie IV_corrected.txt
Saved content for 'Démonomanie preface Repair_corrected' to ./tokenized/Démonomanie preface Repair_corrected.txt


In [7]:
print("Unigram texts:")
for key in unigrams:
    print(key)

Unigram texts:
Démonomanie I_corrected
Démonomanie II_corrected
Démonomanie III_corrected
Démonomanie IV_corrected
Démonomanie preface Repair_corrected


In [8]:
# Count up the tokens using a Counter() object
unigram_counts = {}
for key, value in unigrams.items():
    unigram_counts_dict = Counter(value)
    unigram_counts[key] = unigram_counts_dict

print("Unigram Counts:")
for key in unigram_counts:
    print(key)

print_first_n_items(unigram_counts, 25)

Unigram Counts:
Démonomanie I_corrected
Démonomanie II_corrected
Démonomanie III_corrected
Démonomanie IV_corrected
Démonomanie preface Repair_corrected
First 25 items in Démonomanie I_corrected:
: 8961
de: 693
que: 442
les: 423
il: 380
qui: 358
la: 339
en: 330
des: 328
qu: 321
l: 311
le: 271
à: 264
dieu: 263
eft: 245
d: 211
pour: 174
par: 171
a: 160
ce: 156
comme: 150
n: 145
du: 145
c: 138
ne: 133

First 25 items in Démonomanie II_corrected:
: 13497
de: 877
que: 543
en: 494
qu: 490
à: 451
les: 420
qui: 419
la: 415
d: 413
il: 410
l: 361
le: 340
des: 286
a: 252
eft: 206
vn: 201
c: 197
n: 186
par: 186
pour: 182
on: 169
ce: 167
e: 165
dieu: 161

First 25 items in Démonomanie III_corrected:
: 6797
de: 655
que: 391
qu: 384
en: 370
il: 354
les: 335
la: 327
qui: 306
d: 267
le: 266
à: 263
l: 225
des: 199
ne: 183
vn: 163
eft: 158
on: 147
pour: 144
par: 134
ce: 131
n: 130
a: 126
dieu: 125
c: 114

First 25 items in Démonomanie IV_corrected:
: 11060
de: 881
la: 412
que: 398
d: 349
qui: 338
qu: 335

In [9]:
# Remove specified keys from the dictionary
stripped_unigrams = remove_keys_from_nested_dict(unigram_counts, stopwords)

print_first_n_items(stripped_unigrams, 30)

First 30 items in Démonomanie I_corrected:
: 8961
dieu: 263
bien: 83
quand: 55
premier: 54
choses: 50
chose: 49
sorciers: 49
dire: 47
faire: 47
fort: 46
hommes: 44
esprits: 42
celuy: 42
monde: 37
dict: 36
anges: 35
entre: 35
mot: 34
nature: 34
esprit: 33
faut: 32
quelque: 30
non: 29
hebrieux: 29
void: 27
homme: 26
grand: 26
pourquoy: 25
moyens: 24

First 30 items in Démonomanie II_corrected:
: 13497
dieu: 161
sorciers: 101
diable: 94
bien: 82
p: 67
faire: 62
hommes: 52
corps: 51
second: 50
quand: 44
sathan: 42
plusieurs: 40
dire: 37
chose: 34
homme: 34
dict: 32
f: 31
grand: 31
non: 30
faut: 30
veu: 30
mort: 30
ans: 30
fort: 29
esprit: 28
cela: 28
quil: 27
paroles: 27
choses: 26

First 30 items in Démonomanie III_corrected:
: 6797
dieu: 125
sorciers: 93
sathan: 74
bien: 59
diable: 57
faire: 50
sorcier: 39
mal: 34
veu: 30
contre: 29
dire: 29
plusieurs: 28
an: 27
fin: 27
quand: 26
sorciere: 26
esprit: 26
esprits: 25
faut: 25
quelques: 25
hommes: 24
vie: 23
depuis: 22
cela: 22
celuy: 22
mo

In [10]:
dictionary_to_file(stripped_unigrams, unigram_folder, 'unigram_counts')

Saved Démonomanie I_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Démonomanie II_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Démonomanie III_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Démonomanie IV_corrected_unigram_counts.csv in tokenized/unigram_counts
Saved Démonomanie preface Repair_corrected_unigram_counts.csv in tokenized/unigram_counts


In [11]:
bigrams = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopwords]
    #unigram_list = [word for word in value]
    bigrams_list = list(nltk.bigrams(unigram_list))
    bigrams[key] = bigrams_list

print("Bigrams:")
for key in bigrams:
    print(key)

Bigrams:
Démonomanie I_corrected
Démonomanie II_corrected
Démonomanie III_corrected
Démonomanie IV_corrected
Démonomanie preface Repair_corrected


In [12]:
bigram_counts = {}

for key, value in bigrams.items():
    string_bigrams = convert_tuple_bigrams(value)
    bigramCount = convert_strings_to_counts(string_bigrams)
    bigram_counts[key] = bigramCount

print("Bigram Counts:")
for key in bigram_counts:
    print(key)

print_first_n_items(bigram_counts, 30)

Bigram Counts:
Démonomanie I_corrected
Démonomanie II_corrected
Démonomanie III_corrected
Démonomanie IV_corrected
Démonomanie preface Repair_corrected
First 30 items in Démonomanie I_corrected:
 : 2540
dieu : 144
 dieu: 59
premier : 45
 quand: 42
bien : 34
dire : 31
esprits : 28
 premier: 28
 bien: 27
sorciers : 25
hommes : 25
anges : 24
chose : 23
 non: 22
 homme: 22
 faut: 21
 esprit: 21
 pourquoy: 21
 combien: 20
 efcripture: 19
monde : 19
 entre: 19
 void: 18
 dire: 18
 p: 18
fort : 18
dict : 18
choses : 18
 neantmoins: 17

First 30 items in Démonomanie II_corrected:
 : 5405
dieu : 98
diable : 54
 p: 52
p : 50
second : 49
sorciers : 48
 sorciers: 38
hommes : 37
 bien: 36
 dieu: 36
 diable: 32
 quand: 31
 second: 30
 faire: 26
 faut: 24
mort : 24
 an: 23
 quoy: 23
corps : 22
 lequel: 22
 f: 22
 non: 21
an : 20
 homme: 20
sathan : 20
dire : 19
f : 19
 neantmoins: 18
chap : 18

First 30 items in Démonomanie III_corrected:
 : 1779
dieu : 65
 sorciers: 33
sathan : 33
sorciers : 31
diab

In [13]:
dictionary_to_file(bigram_counts, bigram_folder, 'bigram_counts')

Saved Démonomanie I_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Démonomanie II_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Démonomanie III_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Démonomanie IV_corrected_bigram_counts.csv in tokenized/bigram_counts
Saved Démonomanie preface Repair_corrected_bigram_counts.csv in tokenized/bigram_counts


In [14]:
trigrams = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopwords]
    #unigram_list = [word for word in value]
    trigrams_list = list(nltk.trigrams(unigram_list))
    trigrams[key] = trigrams_list

print("Trigrams:")
for key in bigrams:
    print(key)

Trigrams:
Démonomanie I_corrected
Démonomanie II_corrected
Démonomanie III_corrected
Démonomanie IV_corrected
Démonomanie preface Repair_corrected


In [15]:
trigram_counts = {}

for key, value in trigrams.items():
    string_trigrams = convert_tuple_trigrams(value)
    trigramCount = convert_strings_to_counts(string_trigrams)
    trigram_counts[key] = trigramCount

print("Trigram Counts:")
for key in trigram_counts:
    print(key)
    
print_first_n_items(trigram_counts, 30)

Trigram Counts:
Démonomanie I_corrected
Démonomanie II_corrected
Démonomanie III_corrected
Démonomanie IV_corrected
Démonomanie preface Repair_corrected
First 30 items in Démonomanie I_corrected:
  : 769
dieu  : 46
 premier : 23
 dieu : 20
  pourquoy: 20
premier  : 16
 lib : 14
 dire : 12
anges  : 12
 cap : 12
 com : 12
quelque chose : 11
bien  : 11
 p : 11
  dieu: 11
 efcripture : 10
 bien : 10
cap  : 10
  eftà: 10
  sorciers: 10
lib  : 9
 dict : 9
  non: 9
  neantmoins: 8
 quand : 8
  dire: 8
 mauuais : 8
malins esprits : 8
 impieté : 8
 void : 7

First 30 items in Démonomanie II_corrected:
  : 2846
  p: 43
 p : 39
dieu  : 30
p  : 30
 second : 29
diable  : 24
 dieu : 22
 an : 19
second  : 18
 sorciers : 17
  sorciers: 17
hommes  : 15
 f : 15
an  : 14
  second: 14
 diable : 14
  pourquoy: 13
  f: 13
 homme : 12
 hommes : 12
  non: 10
 dire : 10
 chap : 10
  neantmoins: 9
 quil : 9
  faut: 9
 quel : 9
 quoy : 9
sorciers  : 9

First 30 items in Démonomanie III_corrected:
  : 596
dieu  :

In [16]:
dictionary_to_file(trigram_counts, trigram_folder, 'trigram_counts')

Saved Démonomanie I_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Démonomanie II_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Démonomanie III_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Démonomanie IV_corrected_trigram_counts.csv in tokenized/trigram_counts
Saved Démonomanie preface Repair_corrected_trigram_counts.csv in tokenized/trigram_counts


In [17]:
colloc_dict = {}
colloc_counts = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopwords]
    bigram_finder = BigramCollocationFinder.from_words(unigram_list)
    bigram_finder.apply_freq_filter(3)  # Make sure all collocations have occurred at least 5 times
    collocations = bigram_finder.nbest(BigramAssocMeasures.pmi, 500)
    colloc_dict[key] = collocations
    
    # Initialize Counter for colloc_counts
    bigram_count_dict = Counter()

    # Count the occurrences of each bigram in the text
    bigram_finder = BigramCollocationFinder.from_words(unigram_list)
    bigram_freqs = bigram_finder.ngram_fd.items()
    
    # Filter bigram counts based on collocations
    for bigram, count in bigram_freqs:
        if bigram in collocations:
            bigram_count_dict[bigram] = count

    colloc_counts[key] = bigram_count_dict

print("Collocations:")
for key, value in colloc_dict.items():
    print(key)
    # for w1, w2 in value:
    #     print(' ', w1, w2)

print("Collocation Counts:")
for key in colloc_counts:
    print(key)
    # Print first n items, assuming print_first_n_items function is defined elsewhere
    for item, count in colloc_counts[key].most_common(30):
        bigram = " ".join(item)
        print(f"{bigram} {count}")
    print()

dictionary_to_file(colloc_counts, collocation_folder, 'collocation_counts')

Collocations:
Démonomanie I_corrected
Démonomanie II_corrected
Démonomanie III_corrected
Démonomanie IV_corrected
Démonomanie preface Repair_corrected
Collocation Counts:
Démonomanie I_corrected
premier  45
 quand 42
dire  31
esprits  28
anges  24
 non 22
 homme 22
 faut 21
 esprit 21
 pourquoy 21
 combien 20
 efcripture 19
 void 18
 p 18
 neantmoins 17
lib  17
quelque chose 16
 foit 16
p  15
bons  15
 ment 15
com  15
malins esprits 15
 lib 14
cap  14
mal  14
 quoy 14
 celà 14
quel  14
 elles 14

Démonomanie II_corrected
 p 52
second  49
 faut 24
mort  24
 an 23
 quoy 23
 lequel 22
an  20
 neantmoins 18
chap  18
 pourquoy 17
 ains 17
 come 16
 difoit 15
com  14
 lifons 13
 quelquesfois 13
cas pareil 11
 ame 11
 autant 11
 caril 11
 aage 11
malins esprits 10
 combien 10
 quia 10
 z 10
dicu  9
tou  9
 ap 9
 quant 9

Démonomanie III_corrected
dieu  65
sathan  33
diable  25
 an 21
 fin 21
 non 18
 faut 18
 ains 16
 depuis 15
mort  15
esprits  15
dire  15
vie  14
 come 13
 void 13
malings e

In [18]:
trigram_colloc_dict = {}
trigram_colloc_counts = {}

for key, value in unigrams.items():
    unigram_list = [word for word in value if word.lower() not in stopwords]
    trigram_finder = TrigramCollocationFinder.from_words(unigram_list)
    trigram_finder.apply_freq_filter(3)  # Ensure all collocations have occurred at least 5 times
    collocations = trigram_finder.nbest(TrigramAssocMeasures.pmi, 500)
    trigram_colloc_dict[key] = collocations
    
    # Initialize Counter for trigram_colloc_counts
    trigram_count_dict = Counter()

    # Count the occurrences of each trigram in the text
    trigram_freqs = trigram_finder.ngram_fd.items()
    
    # Filter trigram counts based on collocations
    for trigram, count in trigram_freqs:
        if trigram in collocations:
            trigram_count_dict[trigram] = count

    trigram_colloc_counts[key] = trigram_count_dict

print("Trigram Collocations:")
for key, value in trigram_colloc_dict.items():
    print(key)
    #for w1, w2, w3 in value:
    #    print(' ', w1, w2, w3)

print("Trigram Collocation Counts:")
for key in trigram_colloc_counts:
    print(key)
    # Print first n items, assuming print_first_n_items function is defined elsewhere
    for item, count in trigram_colloc_counts[key].most_common(30):
        trigram = " ".join(item)
        print(f"{trigram} {count}")
    print()

dictionary_to_file(trigram_colloc_counts, trigram_collocation_folder, 'trigram_collocation_counts')

Trigram Collocations:
Démonomanie I_corrected
Démonomanie II_corrected
Démonomanie III_corrected
Démonomanie IV_corrected
Démonomanie preface Repair_corrected
Trigram Collocation Counts:
Démonomanie I_corrected
   769
dieu   46
 premier  23
 dieu  20
  pourquoy 20
premier   16
 lib  14
 dire  12
anges   12
 cap  12
 com  12
quelque chose  11
bien   11
 p  11
  dieu 11
 efcripture  10
 bien  10
cap   10
  eftà 10
  sorciers 10
lib   9
 dict  9
  non 9
  neantmoins 8
 quand  8
  dire 8
 mauuais  8
malins esprits  8
 impieté  8
 void  7

Démonomanie II_corrected
   2846
  p 43
 p  39
dieu   30
p   30
 second  29
diable   24
 dieu  22
 an  19
second   18
 sorciers  17
  sorciers 17
hommes   15
 f  15
an   14
  second 14
 diable  14
  pourquoy 13
  f 13
 homme  12
 hommes  12
  non 10
 dire  10
 chap  10
  neantmoins 9
 quil  9
  faut 9
 quel  9
 quoy  9
sorciers   9

Démonomanie III_corrected
   596
dieu   25
sathan   12
 fin  11
 ame  10
 sorciers  10
 depuis  10
dire   9
  autant 9
 an m

In [19]:
underscore_dict = {}
for key, value in unigrams.items():

    tokenized_words = unigrams.get(key)
    collocations = colloc_dict.get(key)
    
    colloc_words = []
    
    # Iterate through the words making new versions combining collocations
    i = 0
    while i < len(tokenized_words) - 1:
        # If we find a collocation, add and advance by two words
        if (tokenized_words[i], tokenized_words[i + 1]) in collocations:
            colloc_words.append('_'.join((tokenized_words[i], tokenized_words[i + 1])))
            i += 2
        # Otherwise, advance by one word
        else:
            colloc_words.append(tokenized_words[i])
            i += 1

    # Add the last word (if any)
    if i == len(tokenized_words) - 1:
        colloc_words.append(tokenized_words[i])
    underscore_dict[key] = colloc_words

print("Underscore Dictionary:")
for key in underscore_dict:
    print(key)

write_dict_to_files_with_suffix(underscore_dict, underscore_folder, 'underscore_bigrams')

Underscore Dictionary:
Démonomanie I_corrected
Démonomanie II_corrected
Démonomanie III_corrected
Démonomanie IV_corrected
Démonomanie preface Repair_corrected
Démonomanie I_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Démonomanie II_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Démonomanie III_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Démonomanie IV_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams
Démonomanie preface Repair_corrected_underscore_bigrams.txt in tokenized/underscore_bigrams


In [20]:
trigram_underscore_dict = {}

for key, tokenized_words in unigrams.items():
    collocations = trigram_colloc_dict.get(key, [])
    colloc_words = []
    i = 0
    while i < len(tokenized_words) - 2:
        # If we find a trigram collocation, add and advance by three words
        trigram = (tokenized_words[i], tokenized_words[i + 1], tokenized_words[i + 2])
        if trigram in collocations:
            colloc_words.append('_'.join(trigram))
            i += 3
        else:
            colloc_words.append(tokenized_words[i])
            i += 1
    # Add the last words (if any)
    while i < len(tokenized_words):
        colloc_words.append(tokenized_words[i])
        i += 1
    trigram_underscore_dict[key] = colloc_words

print("Trigram underscore Dictionary:")
for key in trigram_underscore_dict:
    print(key)

write_dict_to_files_with_suffix(trigram_underscore_dict, trigram_underscore_folder, 'underscore_trigrams')

Trigram underscore Dictionary:
Démonomanie I_corrected
Démonomanie II_corrected
Démonomanie III_corrected
Démonomanie IV_corrected
Démonomanie preface Repair_corrected
Démonomanie I_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Démonomanie II_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Démonomanie III_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Démonomanie IV_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
Démonomanie preface Repair_corrected_underscore_trigrams.txt in tokenized/underscore_trigrams
