In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from pathlib import Path  
import glob
import os
import csv

# Global variable to store the selected stopwords
selected_stopwords = None

def get_directory_path():
    """Ask the user which directory holds the text files.

    The user can keep the current working directory (the default) or pick
    one of its immediate subdirectories from a numbered, alphabetically
    sorted menu.

    Returns:
        str: Absolute path of the chosen directory. Any invalid or
        non-numeric answer falls back to the current working directory
        instead of raising.
    """
    cwd = os.getcwd()

    print(f"Current working directory: {cwd}")
    print("1. Use the current working directory")
    print("2. Select a subdirectory")

    # Strip so that a stray space still counts as "use the default".
    choice = input("Enter your choice (default is 1): ").strip() or '1'

    if choice == '1':
        return cwd
    if choice != '2':
        print("Invalid choice. Using current working directory.")
        return cwd

    # Sorted so the menu numbering is stable between runs (os.listdir order
    # is arbitrary) and matches the sorted CSV/XLSX menus used elsewhere.
    subdirs = sorted(
        d for d in os.listdir(cwd) if os.path.isdir(os.path.join(cwd, d))
    )
    if not subdirs:
        print("No subdirectories found.")
        return cwd

    print("Available subdirectories:")
    for number, subdir in enumerate(subdirs, start=1):
        print(f"{number}. {subdir}")

    try:
        subdir_choice = int(input("Select a subdirectory number: ").strip()) - 1
    except ValueError:
        # Non-numeric input is handled like an out-of-range number.
        subdir_choice = -1

    if not 0 <= subdir_choice < len(subdirs):
        print("Invalid choice. Using current working directory.")
        return cwd

    return os.path.join(cwd, subdirs[subdir_choice])

def exclude_files(text_files):
    """Let the user drop files from the list by menu number.

    Args:
        text_files: Sequence of file paths to choose from.

    Returns:
        list: The paths that remain after exclusion, in their original
        order. The input list is returned unchanged when the user declines
        to exclude anything or enters nothing.

    Entries in the exclusion list that are not valid file numbers (typos,
    empty items from a trailing comma, out-of-range numbers) are reported
    and skipped rather than raising.
    """
    print("Do you want to exclude any files? (yes/no, default is yes)")
    choice = input().strip().lower() or 'yes'
    if choice != 'yes':
        return text_files

    print("Available text files:")
    for number, file in enumerate(text_files, start=1):
        print(f"{number}. {Path(file).name}")

    user_input = input("Enter the numbers of the files you want to exclude, separated by commas (press Enter to include all files):").strip()
    if not user_input:
        return text_files

    exclude_indices = set()
    for token in user_input.split(','):
        token = token.strip()
        if not token:
            # Empty item, e.g. from a trailing comma.
            continue
        try:
            number = int(token)
        except ValueError:
            print(f"Ignoring invalid entry: {token!r}")
            continue
        if 1 <= number <= len(text_files):
            exclude_indices.add(number - 1)
        else:
            print(f"Ignoring out-of-range file number: {number}")

    return [file for i, file in enumerate(text_files) if i not in exclude_indices]

def get_stopwords():
    """Load a custom stopword list from a CSV file chosen by the user.

    The CSV files in the current working directory are listed in sorted
    order and the user is prompted until a valid number is entered. Every
    non-empty cell of the chosen file becomes one stopword, with
    surrounding whitespace removed.

    The result is cached in the module-level ``selected_stopwords`` so
    later calls in the same session return it without prompting again.

    Returns:
        list[str]: The stopwords, or an empty list when the working
        directory contains no CSV file (nothing is cached in that case).
    """
    global selected_stopwords

    # Reuse the list chosen earlier in this session.
    if selected_stopwords is not None:
        print("Using previously selected stopwords file.")
        return selected_stopwords

    cwd = os.getcwd()
    csv_files = sorted(f for f in os.listdir(cwd) if f.endswith('.csv'))

    if not csv_files:
        print("No CSV files found in the current working directory.")
        return []

    print("Available CSV files:")
    for number, file_name in enumerate(csv_files, start=1):
        print(f"{number}. {file_name}")

    while True:
        choice = input("Select a CSV file number for the stopword list: ").strip()
        # isdecimal() only accepts characters that int() can parse.
        if choice.isdecimal():
            index = int(choice) - 1
            if 0 <= index < len(csv_files):
                stopword_path = os.path.join(cwd, csv_files[index])
                # utf-8-sig drops a leading BOM (common in spreadsheet
                # exports) that would otherwise stick to the first word;
                # newline='' is what the csv module expects.
                with open(stopword_path, 'r', encoding='utf-8-sig', newline='') as file:
                    stopwords = [
                        word.strip()
                        for row in csv.reader(file)
                        for word in row
                        if word.strip()  # skip empty cells
                    ]
                # Save the selected stopwords in the global variable
                selected_stopwords = stopwords
                return stopwords
        print("Invalid choice. Please enter a valid number from the list.")

def get_top_n_value():
    """Prompt for how many top terms to keep per document.

    Returns:
        int: A positive integer; 30 when the user just presses Enter.

    The prompt repeats until the input is a whole number greater than zero.
    Zero and negative values are rejected because the result is passed to
    ``head(n)``, where they would return no rows or drop rows from the end.
    """
    while True:
        top_n_input = input("Enter the number of top terms to display (default is 30): ").strip()
        if not top_n_input:
            return 30
        try:
            top_n = int(top_n_input)
        except ValueError:
            print("Invalid input. Please enter a valid number.")
            continue
        if top_n > 0:
            return top_n
        print("Invalid input. Please enter a number greater than zero.")

In [2]:
# Get the directory path from the user
directory_path = get_directory_path()

# Collect the .txt files in that directory. The directory part is escaped so
# that glob metacharacters in the path itself ('[', ']', '*', '?') are matched
# literally instead of being treated as wildcards.
text_files = sorted(glob.glob(f"{glob.escape(directory_path)}/*.txt"))

if text_files:
    # Prompt the user to exclude any files
    text_files = exclude_files(text_files)
else:
    # Nothing to exclude; say so explicitly instead of showing an empty menu.
    print(f"No .txt files found in {directory_path}.")

# Document titles are the file names without directory or extension
text_titles = [Path(text).stem for text in text_files]

# Print the final list of text files
print("Final list of text files:")
for text_title in text_titles:
    print(text_title)

Current working directory: /home/lucas-jerusalimiec/Documents/OCR Text/Text/Collected/lemmatized
1. Use the current working directory
2. Select a subdirectory


Enter your choice (default is 1):  


Do you want to exclude any files? (yes/no, default is yes)


 no


Final list of text files:
Discours des raisons_corrected_stemmed
Démonomanie Repair_corrected_stemmed
Harangue - Fontainebleau_corrected_stemmed
Harangue - Orléans 2_corrected_stemmed
Harangue - Orléans_corrected_stemmed
Harangue - Poissy_corrected_stemmed
Harangue - Rouen_corrected_stemmed
Harangue - Saint Germain_corrected_stemmed
Harangue - lit de justice_corrected_stemmed
Harangue - ouverture de parlement_corrected_stemmed
Harangue - parlement 2_corrected_stemmed
Harangue - parlement 3_corrected_stemmed
Harangue - parlement_corrected_stemmed
Harangue - religion_corrected_stemmed
Harangue - septembre_corrected_stemmed
La réponse_corrected_stemmed
Le paradoxe_corrected_stemmed
Lettre_corrected_stemmed
Lit de justice_corrected_stemmed
Memoire - Namur_corrected_stemmed
Memoire - le but_corrected_stemmed
Memoire au roi_corrected_stemmed
Memoires d'État Refuge_corrected_stemmed
Memoires d'état_corrected_stemmed
Recueil_corrected_stemmed
Remonstrances - Royaume_corrected_stemmed
Remonstra

In [3]:
# Global variable to store the selected stopwords
# (kept if an earlier cell already defined it, so a cached choice survives)
if 'selected_stopwords' not in globals():
    selected_stopwords = None

# Fail early with a clear message; otherwise the vectorizer would fail later
# on an empty corpus, after the user has already answered every prompt.
if not text_files:
    raise ValueError("No text files selected; run the file-selection cell first.")

# Get the custom stopword list from the user
stop_words = get_stopwords()

# Get the number of top terms to display
top_n = get_top_n_value()

# Read every selected document into memory, in the same order as text_titles
text_contents = [Path(file_path).read_text(encoding='utf-8') for file_path in text_files]

# Build the document-by-term TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)
tfidf_vector = tfidf_vectorizer.fit_transform(text_contents)
tfidf_df = pd.DataFrame(tfidf_vector.toarray(), index=text_titles, columns=tfidf_vectorizer.get_feature_names_out())

# Reshape to long format: one row per (document, term) pair
tfidf_df = tfidf_df.stack().reset_index()
tfidf_df = tfidf_df.rename(columns={0: 'tfidf', 'level_0': 'document', 'level_1': 'term'})

# Get the top N terms for each document.
# Rows with a score of 0 are terms that do not occur in that document; they
# are dropped first so that short documents are not padded with them.
top_tfidf = (
    tfidf_df[tfidf_df['tfidf'] > 0]
    .sort_values(by=['document', 'tfidf'], ascending=[True, False])
    .groupby(['document'])
    .head(top_n)
)

# Prompt the user for a filename, reprompt if return is hit accidentally
while True:
    filename = input("Enter a filename (without extension) to save the top TF-IDF terms: ").strip()
    if filename:
        break
    print("Filename cannot be empty. Please enter a valid filename.")

# Save the top TF-IDF terms to an Excel workbook
output_file = f"{filename}_tfidf.xlsx"
top_tfidf.to_excel(output_file, index=False, engine='openpyxl')
print(f"Top TF-IDF terms saved to {output_file} in the current directory.")


Available CSV files:
1. stop_words.csv


Select a CSV file number for the stopword list:  1
Enter the number of top terms to display (default is 30):  50
Enter a filename (without extension) to save the top TF-IDF terms:  top50


Top TF-IDF terms saved to top50_tfidf.xlsx in the current directory.


In [4]:
import altair as alt
import numpy as np

# Get the custom stopword list from the user.
# get_stopwords() returns the cached list without prompting when one was
# already selected earlier in this session.
# NOTE(review): stop_words is not referenced again in this cell — confirm
# whether this call is still needed here.
stop_words = get_stopwords()

# Function to select an .xlsx file
def select_xlsx_file():
    """Ask the user to pick one of the .xlsx files in the working directory.

    The files are listed in sorted order. Pressing Enter selects the first
    file; any other input must be a valid menu number.

    Returns:
        str | None: The chosen file name (relative to the current working
        directory), or None when there are no .xlsx files or the input is
        not a valid menu number.
    """
    cwd = os.getcwd()
    xlsx_files = sorted(f for f in os.listdir(cwd) if f.endswith('.xlsx'))

    if not xlsx_files:
        print("No XLSX files found in the current working directory.")
        return None

    print("Available XLSX files:")
    for number, file_name in enumerate(xlsx_files, start=1):
        print(f"{number}. {file_name}")

    choice_input = input("Select an XLSX file number to read as top_tfidf (default is 1): ").strip()
    if not choice_input:
        # Only an empty answer means "use the default".
        choice = 0
    elif choice_input.isdecimal():
        choice = int(choice_input) - 1
    else:
        # A typo must not silently select the first file.
        print("Invalid choice.")
        return None

    if not 0 <= choice < len(xlsx_files):
        print("Invalid choice.")
        return None

    return xlsx_files[choice]

# Interactive heatmap of the top TF-IDF terms per document.
# Reads a workbook with 'document', 'term' and 'tfidf' columns, optionally
# restricts it to some documents and to the N best terms per document, then
# draws one heatmap row per document with the terms ordered by score and
# terms of interest marked with a red circle.

# Prompt user to select an .xlsx file
xlsx_file = select_xlsx_file()
if xlsx_file:
    top_tfidf = pd.read_excel(xlsx_file)

    # Extract the stem of the selected XLSX file
    xlsx_stem = Path(xlsx_file).stem

    # Prompt user to truncate groups with default value as 'no'
    truncate_groups = input("Do you want to truncate the plot by only including certain groups? (yes/no, default is no): ").strip().lower() or 'no'
    if truncate_groups == 'yes':
        available_groups = top_tfidf['document'].unique()
        print("Available groups (documents) with index numbers:")
        for i, group in enumerate(available_groups):
            print(f"{i + 1}. {group}")

        # Re-prompt until every entry is a valid 1-based index. Without the
        # range check, 0 would wrap around to the last group and a number
        # past the end would raise IndexError.
        while True:
            selected_groups = input("Enter the index numbers of the groups to include (e.g., 1,2,3 or 1-3): ").strip()
            selected_indexes = []
            try:
                # Split the input by commas to handle both individual values and ranges
                for part in selected_groups.split(','):
                    part = part.strip()
                    if not part:
                        continue
                    if '-' in part:
                        start_index, end_index = map(int, part.split('-'))
                        selected_indexes.extend(range(start_index, end_index + 1))
                    else:
                        selected_indexes.append(int(part))
            except ValueError:
                print("Invalid selection. Use numbers and ranges such as 1,2,3 or 1-3.")
                continue
            if selected_indexes and all(1 <= index <= len(available_groups) for index in selected_indexes):
                break
            print(f"Invalid selection. Enter numbers between 1 and {len(available_groups)}.")

        # Convert selected indexes to groups
        selected_groups = [available_groups[index - 1] for index in selected_indexes]

        # Filter the DataFrame
        top_tfidf = top_tfidf[top_tfidf['document'].isin(selected_groups)]

    # Remove processing markers ('_corrected', 'Repair', '_lemmatized',
    # '_stemmed') from the 'document' labels. The markers are matched as
    # literal text, and assign() builds a new frame so nothing is written
    # into a filtered slice (avoids SettingWithCopyWarning).
    document_labels = top_tfidf['document'].astype(str)
    for marker in ('_corrected', 'Repair', '_lemmatized', '_stemmed'):
        document_labels = document_labels.str.replace(marker, '', regex=False)
    top_tfidf = top_tfidf.assign(document=document_labels)

    # Prompt user to limit the number of words displayed in each group
    limit_words = input("Do you want to limit the number of words displayed in each group? (yes/no, default is yes): ").strip().lower() or 'yes'
    if limit_words == 'yes':
        # Re-prompt instead of crashing on blank or non-numeric input.
        while True:
            max_words_input = input("Enter the maximum number of words to display per group: ").strip()
            try:
                max_words = int(max_words_input)
            except ValueError:
                max_words = 0
            if max_words > 0:
                break
            print("Invalid input. Please enter a whole number greater than zero.")
        top_tfidf['rank'] = top_tfidf.groupby('document')['tfidf'].rank(method="first", ascending=False)
        top_tfidf = top_tfidf[top_tfidf['rank'] <= max_words]

    # Prompt user to choose whether to use a preset list of terms, specify custom terms, or enter a blank list
    print("1. Use preset list of terms")
    print("2. Specify custom terms")
    print("3. Enter a blank list")
    term_choice_input = input("Enter your choice (1, 2, or 3), default is 1: ").strip()
    term_choice = int(term_choice_input) if term_choice_input.isdecimal() else 1


    preset_terms = ['bien', 'cayer', 'céan', 'chambr', 'chos', 'conseil', 'conseiller', 'court',
                    'déni', 'dieu', 'divin', 'droit', 'édict', 'églis', 'estat', 'héres',
                    'judg', 'jug', 'justic', 'loi', 'majest', 'magistrat', 'offic', 'ordon',
                    'parlement', 'paix', 'princ', 'puissanc',
                    'réform', 'religion', 'republ', 'ressort', 'roy', 'royaum',
                    'sathan', 'sorc', 'sorci', 'souverain', 'sujet']
    if term_choice == 1:
        term_list = preset_terms
    elif term_choice == 2:
        user_terms = input("Enter a list of words separated by commas for highlighting (e.g., war, peace): ")
        term_list = [term.strip() for term in user_terms.split(",")]
    else:
        term_list = []

    # Prompt user for a filename to save the PNG file; reprompt on empty
    # input so the chart is never written to a file named just ".png"
    while True:
        file_name = input("Enter a filename (without extension) to save the visualization: ").strip()
        if file_name:
            break
        print("Filename cannot be empty. Please enter a valid filename.")
    output_file = f"{file_name}.png"

    # Prompt user to specify the width of the chart
    chart_width_input = input("Enter the chart width (default is 600): ").strip()
    chart_width = int(chart_width_input) if chart_width_input.isdecimal() else 600

    # Adding a little randomness to break ties in term ranking. The
    # generator is seeded so that re-running the cell draws the same chart.
    tie_break_rng = np.random.default_rng(0)
    top_tfidf_plusRand = top_tfidf.copy()
    top_tfidf_plusRand['tfidf'] = top_tfidf_plusRand['tfidf'] + tie_break_rng.random(top_tfidf.shape[0]) * 0.0001

    # Base chart for all visualizations, with rank calculation.
    # The window transform recomputes 'rank' per document from the jittered
    # scores, replacing any 'rank' column already present in the data.
    base = alt.Chart(top_tfidf_plusRand).encode(
        x=alt.X('rank:O', axis=alt.Axis(labelAngle=0)),  # Set labelAngle to horizontal (0 degrees)
        y='document:N'
    ).transform_window(
        rank="rank()",
        sort=[alt.SortField("tfidf", order="descending")],
        groupby=["document"]
    )

    # Heatmap specification
    heatmap = base.mark_rect().encode(
        color='tfidf:Q'
    )

    # Red circle over terms in the entered list; fully transparent otherwise
    circle = base.mark_circle(size=100).encode(
        color=alt.condition(
            alt.FieldOneOfPredicate(field='term', oneOf=term_list),
            alt.value('red'),
            alt.value('#FFFFFF00')
        )
    )

    # Text labels, white for darker heatmap colors
    text = base.mark_text(baseline='middle', fontSize=12).encode(
        text='term:N',
        color=alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
    )

    # Combine the heatmap, circle, and text
    final_chart = alt.layer(heatmap, circle, text).properties(width=chart_width)

    # Display the chart in Jupyter Lab
    final_chart.display()

    # Save the chart as a PNG file using the base Altair package.
    # Saving is best-effort: a failure is reported but does not abort the
    # cell, since the chart has already been displayed above.
    try:
        final_chart.save(output_file)
        print(f"Visualization saved as {output_file} in the current directory.")
    except Exception as e:
        print(f"Error saving visualization: {e}")
else:
    print("No valid XLSX file selected.")

Using previously selected stopwords file.
Available XLSX files:
1. top50_tfidf.xlsx


Select an XLSX file number to read as top_tfidf (default is 1):  1
Do you want to truncate the plot by only including certain groups? (yes/no, default is no):  
Do you want to limit the number of words displayed in each group? (yes/no, default is yes):  
Enter the maximum number of words to display per group:  15


1. Use preset list of terms
2. Specify custom terms
3. Enter a blank list


Enter your choice (1, 2, or 3), default is 1:  
Enter a filename (without extension) to save the visualization:  top15
Enter the chart width (default is 600):  1100


Visualization saved as top15.png in the current directory.
