In [1]:
import pandas as pd
from pathlib import Path
import csv
import os

import glob
# Import Counter()
from collections import Counter

# For making wordclouds
from wordcloud import WordCloud
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from PIL import Image
import urllib.request

In [2]:
# Create an empty Counter object called `word_frequency`
word_frequency = Counter()

In [3]:
def prompt_user_to_select(prompt, options):
    print(prompt)
    for i, option in enumerate(options, start=1):
        print(f"{i}. {option}")
    choice = int(input("Enter the number of your choice: ")) - 1
    return options[choice]

# Get the current working directory
current_directory = os.getcwd()

# Prompt user to select a subfolder in the current working directory
subfolders = [f.path for f in os.scandir(current_directory) if f.is_dir()]
selected_subfolder = prompt_user_to_select("Select a subfolder:", subfolders)

# Search for .csv files in the selected subfolder
csv_files_in_subfolder = glob.glob(os.path.join(selected_subfolder, '*.csv'))
unigram_csv = prompt_user_to_select("Select a .csv file as unigram_csv:", csv_files_in_subfolder)

# Search for .csv files in the current working directory
csv_files_in_current_directory = glob.glob(os.path.join(current_directory, '*.csv'))
stopword_csv = prompt_user_to_select("Select a .csv file as stopword_csv:", csv_files_in_current_directory)

# Prompt user to select a subfolder for sorted_counts
sorted_counts_subfolder = prompt_user_to_select("Select a subfolder for sorted_counts:", subfolders)

# Extract the prefix from the unigram_csv filename
unigram_csv_filename = os.path.basename(unigram_csv)
prefix = unigram_csv_filename.split('_')[0]

# Set the sorted_counts variable
sorted_counts = os.path.join(sorted_counts_subfolder, f"{prefix}_sorted_counts.csv")

print(f"unigram_csv = '{unigram_csv}'")
print(f"stopword_csv = '{stopword_csv}'")
print(f"sorted_counts = '{sorted_counts}'")


Select a subfolder:
1. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Démonomanie/Concatenated/tokenized
2. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Démonomanie/Concatenated/final
3. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Démonomanie/Concatenated/.ipynb_checkpoints


Enter the number of your choice:  1


Select a .csv file as unigram_csv:
1. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Démonomanie/Concatenated/tokenized/Démonomanie_corrected_trigram_counts.csv
2. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Démonomanie/Concatenated/tokenized/Démonomanie_corrected_bigram_counts.csv
3. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Démonomanie/Concatenated/tokenized/Démonomanie_corrected_collocation_counts.csv
4. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Démonomanie/Concatenated/tokenized/Démonomanie_corrected_unigram_counts.csv


Enter the number of your choice:  4


Select a .csv file as stopword_csv:
1. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Démonomanie/Concatenated/spellcheck_data.csv
2. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Démonomanie/Concatenated/stop_words.csv


Enter the number of your choice:  2


Select a subfolder for sorted_counts:
1. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Démonomanie/Concatenated/tokenized
2. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Démonomanie/Concatenated/final
3. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Démonomanie/Concatenated/.ipynb_checkpoints


Enter the number of your choice:  1


unigram_csv = '/home/lucas-jerusalimiec/Documents/OCR Text/Text/Démonomanie/Concatenated/tokenized/Démonomanie_corrected_unigram_counts.csv'
stopword_csv = '/home/lucas-jerusalimiec/Documents/OCR Text/Text/Démonomanie/Concatenated/stop_words.csv'
sorted_counts = '/home/lucas-jerusalimiec/Documents/OCR Text/Text/Démonomanie/Concatenated/tokenized/Démonomanie_sorted_counts.csv'


In [4]:
stop_words = []

with open(stopword_csv, mode='r') as f:
    stop_words = list(csv.reader(f))[0]

In [5]:
with open(unigram_csv, mode='r') as file:
    reader = csv.reader(file)
    next(reader)
    for row in reader:
        token, count = row[0], int(row[1])
        # Convert token to lowercase
        token = token.lower()
        # Check if the token is alphanumeric and not a stop word
        if token.isalpha() and token not in stop_words:
            word_frequency[token] += count    

In [6]:
for gram, count in word_frequency.most_common(25):
    print(gram.ljust(20), count)

dieu                 676
comme                480
sors                 437
plus                 426
bien                 330
sor                  244
dia                  235
autres               231
in                   229
faire                229
di                   214
livre                195
peut                 194
point                192
tous                 182
dela                 180
faut                 173
dit                  172
sathan               168
tout                 166
quand                162
mort                 161
dire                 154
sans                 146
re                   146


In [7]:
with open(sorted_counts, mode = 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['unigram', 'count'])
    for gram, count in word_frequency.most_common():
        writer.writerow([gram, count])

In [8]:
### Download cloud image for our word cloud shape ###
# It is not required to have a shape to create a word cloud
download_url = 'https://ithaka-labs.s3.amazonaws.com/static-files/images/tdm/tdmdocs/sample_cloud.png'
urllib.request.urlretrieve(download_url, f'./tokenized/sample_cloud.png')
print('Cloud shape downloaded.')

Cloud shape downloaded.


In [9]:
# Create a wordcloud from our data

#Word Cloud documentation https://amueller.github.io/word_cloud/generated/wordcloud.WordCloud.html

# Adding a mask shape of a cloud to your word cloud
# By default, the shape will be a rectangle
# You can specify any shape you like based on an image file
cloud_mask = np.array(Image.open('./tokenized/sample_cloud.png')) # Specifies the location of the mask shape
cloud_mask = np.where(cloud_mask > 3, 255, cloud_mask) # this line will take all values greater than 3 and make them 255 (white)

### Specify word cloud details
wordcloud = WordCloud(
    width = 800, # Change the pixel width of the image if blurry
    height = 600, # Change the pixel height of the image if blurry
    background_color = "white", # Change the background color
    colormap = 'viridis', # The colors of the words, see https://matplotlib.org/stable/tutorials/colors/colormaps.html
    max_words = 150, # Change the max number of words shown
    min_font_size = 4, # Do not show small text
    
    # Add a shape and outline (known as a mask) to your wordcloud
    contour_color = 'blue', # The outline color of your mask shape
    mask = cloud_mask, # 
    contour_width = 1
).generate_from_frequencies(word_frequency)

mpl.rcParams['figure.figsize'] = (20,20) # Change the image size displayed
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
#plt.show()

plt.savefig('./tokenized/wordcloud.png', format='png', dpi=300, bbox_inches='tight')

# Close the plot window after saving the image
plt.close()