Reforming the data into something easier to leaf through: the context of each entry in fact_fiction_words_found is separated out into its own file, so that each context can be opened on its own. This will hopefully make the data easier to read and rate by hand.

In [1]:
import pandas as pd

def convert_csv_to_excel(input_csv, output_excel):
    """
    Load a CSV file and write its rows out as an Excel workbook.

    String values in the 'Context' column that begin with '=' get a single
    quote prepended before the workbook is written (intended to force them
    to be treated as text); every other value is passed through unchanged.

    Parameters:
    - input_csv (str): Path to the CSV file to read.
    - output_excel (str): Path of the Excel file to write.
    """
    def _guard_leading_equals(cell):
        """Return `cell` with a leading single quote if it is a string starting with '='."""
        needs_guard = isinstance(cell, str) and cell.startswith('=')
        return f"'{cell}" if needs_guard else cell

    table = pd.read_csv(input_csv)

    # Only the 'Context' column is guarded; the other columns are written as read.
    table['Context'] = table['Context'].map(_guard_leading_equals)

    # index=False keeps the DataFrame's row index out of the sheet.
    table.to_excel(output_excel, index=False, engine='xlsxwriter')

    print(f"CSV data has been successfully converted to {output_excel}.")

In [16]:
# Convert each source's extracted-context CSV into an Excel workbook, one after another.
for csv_path, xlsx_path in (
    # royal society
    ('data_for_viewing/extracted_raw/extracted_contexts_from_rs_text.csv',
     'data_for_viewing/extracted_raw/ff_royal_society_words_found_full.xlsx'),
    # general magazine
    ('data_for_viewing/extracted_raw/extracted_contexts_from_general_magazine.csv',
     'data_for_viewing/extracted_raw/ff_general_magazine_words_found_full.xlsx'),
    # spectator
    ('data_for_viewing/extracted_raw/extracted_contexts_from_spectator.csv',
     'data_for_viewing/extracted_raw/ff_spectator_words_found_full.xlsx'),
):
    convert_csv_to_excel(csv_path, xlsx_path)



CSV data has been successfully converted to data_for_viewing/extracted_raw/ff_royal_society_words_found_full.xlsx.
CSV data has been successfully converted to data_for_viewing/extracted_raw/ff_general_magazine_words_found_full.xlsx.
CSV data has been successfully converted to data_for_viewing/extracted_raw/ff_spectator_words_found_full.xlsx.


In [2]:
# royal society (RSTA file): convert the extracted-context CSV into an Excel workbook
convert_csv_to_excel(
    input_csv='data_for_viewing/extracted_raw/extracted_contexts_from_rs_text_RSTA.csv',
    output_excel='data_for_viewing/extracted_raw/ff_royal_society_words_found_full_RSTA.xlsx',
)


CSV data has been successfully converted to data_for_viewing/extracted_raw/ff_royal_society_words_found_full_RSTA.xlsx.


In [3]:
import pandas as pd
import os

def split_contexts(input_file, output_excel_file, contexts_directory, subfolder_name):
    """
    Split contexts from an Excel file into separate text files and save a reduced version of the data.

    Every row is given a sequential, 1-based 'ID'. The metadata columns
    (ID, Date, Filename, Keyword, Author, Title) are written to
    `output_excel_file`, and each row's 'Context' value is written to
    '<ID>.txt' inside `contexts_directory/subfolder_name` (created if needed).
    If the input has no rows, a single 'no_keywords_found.txt' placeholder is
    written instead.

    A missing context (an empty cell, which pandas reads back as NaN) is
    written as an empty file, and any other non-string value is converted
    with str(), so every ID in the spreadsheet keeps a matching text file
    rather than the run aborting part-way through.

    Parameters:
    - input_file (str): Path to the input Excel file with contexts.
    - output_excel_file (str): Path to the output Excel file without the context column.
    - contexts_directory (str): Directory where the context text files will be saved.
    - subfolder_name (str): Name of the subfolder within the contexts_directory where the context files will be saved.

    Raises:
    - KeyError: If one of the expected columns is missing from the input.
    """
    df = pd.read_excel(input_file)

    # The ID ties each row of the reduced spreadsheet to its '<ID>.txt' file.
    df['ID'] = range(1, len(df) + 1)

    df_no_context = df[['ID', 'Date', 'Filename', 'Keyword', 'Author', 'Title']]
    df_no_context.to_excel(output_excel_file, index=False)

    full_contexts_directory = os.path.join(contexts_directory, subfolder_name)
    os.makedirs(full_contexts_directory, exist_ok=True)

    if df.empty:
        # No rows at all: leave a marker so the folder is visibly "processed but empty".
        no_keyword_file = os.path.join(full_contexts_directory, "no_keywords_found.txt")
        with open(no_keyword_file, 'w', encoding='utf-8') as file:
            file.write("No keywords were found :(")
    else:
        # Only the two needed columns are iterated; no per-row Series is built.
        for context_id, context_text in zip(df['ID'], df['Context']):
            if not isinstance(context_text, str):
                # Empty cells come back as NaN (a float); file.write() only accepts str.
                context_text = '' if pd.isna(context_text) else str(context_text)

            text_filename = os.path.join(full_contexts_directory, f'{context_id}.txt')
            with open(text_filename, 'w', encoding='utf-8') as file:
                file.write(context_text)

    print("Processing complete.")

In [18]:
# Split each source's full workbook into a context-free spreadsheet plus one text file per context.
for full_xlsx, reduced_xlsx, contexts_root, subfolder in (
    # royal society
    ('data_for_viewing/extracted_raw/ff_royal_society_words_found_full.xlsx',
     'data_for_viewing/ff_royal_society_words_found_context_separate.xlsx',
     'data_for_viewing/contexts',
     'royal_society_RSTL'),
    # general magazine -- completely empty?
    ('data_for_viewing/extracted_raw/ff_general_magazine_words_found_full.xlsx',
     'data_for_viewing/ff_general_magazine_words_found_context_separate.xlsx',
     'data_for_viewing/contexts',
     'general_magazine'),
    # spectator
    ('data_for_viewing/extracted_raw/ff_spectator_words_found_full.xlsx',
     'data_for_viewing/ff_spectator_words_found_context_separate.xlsx',
     'data_for_viewing/contexts',
     'spectator'),
):
    split_contexts(full_xlsx, reduced_xlsx, contexts_root, subfolder)

Processing complete.
Processing complete.
Processing complete.


In [5]:
# royal society (RSTA file): write the context-free spreadsheet and the per-context text files
split_contexts(
    input_file='data_for_viewing/extracted_raw/ff_royal_society_words_found_full_RSTA.xlsx',
    output_excel_file='data_for_viewing/ff_royal_society_words_found_context_separate_RSTA.xlsx',
    contexts_directory='data_for_viewing/contexts',
    subfolder_name='royal_society_RSTA',
)

Processing complete.
