# Data exploration on the SoNaR corpus

In [83]:
# imports
import pandas as pd
import xml.etree.ElementTree as ET
from azure.storage.blob import BlobServiceClient
from io import StringIO
import os
from dotenv import load_dotenv
from collections import Counter
import csv

# Pull the settings stored in the local .env file into the process environment.
load_dotenv()

# Where the corpus lives: the container name and the storage connection string
# (the latter is read from the SONAR_STORAGE_KEY environment variable and is
# None when that variable is not set).
container_name = "books"
connection_string = os.environ.get("SONAR_STORAGE_KEY")

### Counting the number of words and 10-grams in a subset of the corpus

In [72]:
class BlobProcessor:
    """Count words and 10-grams for every XML blob in a storage container.

    Each blob whose name ends in ``.xml`` is downloaded, decoded as
    ISO-8859-15, parsed, and its ``<w>`` elements (in the D-Coi namespace)
    are counted. One result row per XML blob is appended to ``self.data``;
    calling :meth:`process_xml_files` more than once therefore appends the
    rows again.
    """

    # XML namespace used to look up the <w> elements.
    _NAMESPACES = {'dcoi': 'http://lands.let.ru.nl/projects/d-coi/ns/1.0'}
    # Column layout of the result table; fixed so an empty result keeps its header.
    _COLUMNS = ['filename', 'word_count', 'ten_grams']
    # Length of the n-grams being counted.
    _NGRAM_SIZE = 10

    def __init__(self, connection_string, container_name):
        """Connect to the container and set up an empty result list.

        Args:
            connection_string: Azure Storage connection string.
            container_name: Name of the blob container to read from.
        """
        self.connection_string = connection_string
        self.container_name = container_name
        self.blob_service_client = BlobServiceClient.from_connection_string(connection_string)
        self.container_client = self.blob_service_client.get_container_client(container_name)
        self.data = []
        self.output_file = '../out/wordcount_and_10gram_perfile.csv'

    def list_blobs(self):
        """List all blobs in the container."""
        return list(self.container_client.list_blobs())

    def _count_words(self, blob_name):
        """Download one blob and return its number of ``<w>`` elements.

        Returns 0 (after printing the error) when the XML cannot be parsed.
        Download errors are not caught and propagate to the caller.
        """
        blob_client = self.container_client.get_blob_client(blob_name)
        blob_data = blob_client.download_blob().readall()
        print(f"Blob data: {blob_data[:100]}")  # Preview first 100 bytes

        try:
            root = ET.fromstring(blob_data.decode('iso-8859-15'))
        except ET.ParseError as e:
            print(f"Error parsing {blob_name}: {e}")
            return 0
        return len(root.findall('.//dcoi:w', self._NAMESPACES))

    def process_xml_files(self):
        """Process XML files in the container and compute word counts and 10-grams."""
        blobs = self.list_blobs()
        # Only XML blobs are processed, so progress is reported against them
        # rather than against every blob in the container.
        xml_files = [blob.name for blob in blobs if blob.name.endswith('.xml')]
        total_files = len(xml_files)
        print(f"Processing {total_files} XML files (of {len(blobs)} blobs)...")
        print(f"First 5 files: {xml_files[:5]}")

        for i, blob_name in enumerate(xml_files, start=1):
            print(f"Processing file {blob_name} ({i}/{total_files})")

            word_count = self._count_words(blob_name)
            # A sequence of n words has n - 10 + 1 windows of 10 consecutive
            # words; shorter sequences have none.
            ten_grams = max(0, word_count - self._NGRAM_SIZE + 1)

            self.data.append({
                'filename': blob_name,
                'word_count': word_count,
                'ten_grams': ten_grams
            })

            print(f"File {blob_name} processed: {i}/{total_files}")

    def get_dataframe(self):
        """Return the data as a Pandas DataFrame.

        The columns are always ``filename``, ``word_count`` and ``ten_grams``,
        even when no file has been processed yet.
        """
        return pd.DataFrame(self.data, columns=self._COLUMNS)

    def save_dataframe(self):
        """Save the DataFrame to ``self.output_file`` as CSV (without the index)."""
        df = self.get_dataframe()
        # Create the target directory first so a fresh checkout does not fail.
        output_dir = os.path.dirname(self.output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        df.to_csv(self.output_file, index=False)
        print(f"DataFrame saved to {self.output_file}")

    def get_total_ten_grams(self):
        """Calculate the total number of 10-grams (0 when nothing was processed)."""
        if not self.data:
            return 0
        df = self.get_dataframe()
        return df['ten_grams'].sum()

    def summary(self):
        """Print the per-file table followed by the total number of 10-grams."""
        df = self.get_dataframe()
        print(df)
        print(f"Total 10-grams: {self.get_total_ten_grams()}")

In [None]:
# Build the per-file word / 10-gram counter for the configured container.
processor = BlobProcessor(connection_string=connection_string, container_name=container_name)

# Walk the XML blobs, then print the per-file table and the overall 10-gram total.
processor.process_xml_files()
processor.summary()

# Persist the per-file table as CSV.
processor.save_dataframe()

XML files: ['WR-P-P-B-0000000001.dcoi.xml', 'WR-P-P-B-0000000002.dcoi.xml', 'WR-P-P-B-0000000003.dcoi.xml', 'WR-P-P-B-0000000004.dcoi.xml', 'WR-P-P-B-0000000005.dcoi.xml', 'WR-P-P-B-0000000006.dcoi.xml', 'WR-P-P-B-0000000007.dcoi.xml', 'WR-P-P-B-0000000008.dcoi.xml', 'WR-P-P-B-0000000011.dcoi.xml', 'WR-P-P-B-0000000012.dcoi.xml', 'WR-P-P-B-0000000013.dcoi.xml', 'WR-P-P-B-0000000014.dcoi.xml', 'WR-P-P-B-0000000015.dcoi.xml', 'WR-P-P-B-0000000016.dcoi.xml', 'WR-P-P-B-0000000017.dcoi.xml', 'WR-P-P-B-0000000018.dcoi.xml', 'WR-P-P-B-0000000019.dcoi.xml', 'WR-P-P-B-0000000020.dcoi.xml', 'WR-P-P-B-0000000021.dcoi.xml', 'WR-P-P-B-0000000022.dcoi.xml', 'WR-P-P-B-0000000023.dcoi.xml', 'WR-P-P-B-0000000024.dcoi.xml', 'WR-P-P-B-0000000025.dcoi.xml', 'WR-P-P-B-0000000026.dcoi.xml', 'WR-P-P-B-0000000027.dcoi.xml', 'WR-P-P-B-0000000028.dcoi.xml', 'WR-P-P-B-0000000029.dcoi.xml', 'WR-P-P-B-0000000030.dcoi.xml', 'WR-P-P-B-0000000031.dcoi.xml', 'WR-P-P-B-0000000032.dcoi.xml', 'WR-P-P-B-0000000033.dcoi.xm

### Inspecting full text and topics

In [80]:
class BlobTextExtractor:
    """Dump the running text of every XML blob in a container into one file."""

    def __init__(self, connection_string, container_name):
        """Open a client for ``container_name`` using ``connection_string``."""
        service = BlobServiceClient.from_connection_string(connection_string)
        self.connection_string = connection_string
        self.container_name = container_name
        self.blob_service_client = service
        self.container_client = service.get_container_client(container_name)

    def extract_text_from_blob(self, blob_name):
        """Return the space-joined text of all non-empty <w> elements of one blob.

        Any failure (download, decoding, parsing) is printed and results in an
        empty string instead of an exception.
        """
        namespaces = {'dcoi': 'http://lands.let.ru.nl/projects/d-coi/ns/1.0'}
        try:
            # Fetch the raw bytes of the blob.
            raw = self.container_client.get_blob_client(blob_name).download_blob().readall()

            # Decode and parse; the parsed root element is all we need.
            root = ET.fromstring(raw.decode('iso-8859-15'))

            # Collect the text of each <w> element, skipping empty ones.
            tokens = []
            for element in root.iterfind('.//dcoi:w', namespaces):
                if element.text:
                    tokens.append(element.text)
            return ' '.join(tokens)
        except Exception as e:
            print(f"Error processing blob {blob_name}: {e}")
            return ""

    def save_texts_to_single_file(self, output_file):
        """Write title + text of every XML blob to ``output_file``, blank-line separated."""
        with open(output_file, 'w', encoding='utf-8') as outfile:
            # Names of the blobs that look like XML files.
            xml_files = [
                entry.name
                for entry in self.container_client.list_blobs()
                if entry.name.endswith('.xml')
            ]
            total = len(xml_files)

            print(f"Found {total} XML files in container '{self.container_name}'.")

            position = 0
            for blob_name in xml_files:
                position += 1
                print(f"Processing file {position}/{total}: {blob_name}")

                text = self.extract_text_from_blob(blob_name)

                # Entry layout: title line (name minus its last extension),
                # the extracted text, then an empty line as separator.
                title, _extension = os.path.splitext(blob_name)
                outfile.write(f"{title}\n{text}\n\n")

                print(f"Added text from {blob_name} to the combined file.")

        print(f"All texts saved to a single file at {output_file}.")

In [82]:
# Destination of the concatenated plain-text dump.
output_file = "../out/combined.txt"

# Pull the text out of every XML blob and write it all to that one file.
extractor = BlobTextExtractor(connection_string=connection_string, container_name=container_name)
extractor.save_texts_to_single_file(output_file=output_file)

Found 508 XML files in container 'books'.
Processing file 1/508: WR-P-P-B-0000000001.dcoi.xml
Added text from WR-P-P-B-0000000001.dcoi.xml to the combined file.
Processing file 2/508: WR-P-P-B-0000000002.dcoi.xml
Added text from WR-P-P-B-0000000002.dcoi.xml to the combined file.
Processing file 3/508: WR-P-P-B-0000000003.dcoi.xml
Added text from WR-P-P-B-0000000003.dcoi.xml to the combined file.
Processing file 4/508: WR-P-P-B-0000000004.dcoi.xml
Added text from WR-P-P-B-0000000004.dcoi.xml to the combined file.
Processing file 5/508: WR-P-P-B-0000000005.dcoi.xml
Added text from WR-P-P-B-0000000005.dcoi.xml to the combined file.
Processing file 6/508: WR-P-P-B-0000000006.dcoi.xml
Added text from WR-P-P-B-0000000006.dcoi.xml to the combined file.
Processing file 7/508: WR-P-P-B-0000000007.dcoi.xml
Added text from WR-P-P-B-0000000007.dcoi.xml to the combined file.
Processing file 8/508: WR-P-P-B-0000000008.dcoi.xml
Added text from WR-P-P-B-0000000008.dcoi.xml to the combined file.
Proces

### Counting per word

In [54]:
class BlobWordFrequencyAggregator:
    """Aggregate token frequencies over all XML blobs of a storage container.

    Tokens are the text of ``<w>`` elements in the D-Coi namespace and are
    counted case-sensitively, exactly as they appear in the files.
    """

    # XML namespace used to look up the <w> elements.
    _NAMESPACES = {'dcoi': 'http://lands.let.ru.nl/projects/d-coi/ns/1.0'}

    def __init__(self, connection_string, container_name):
        """Connect to ``container_name`` using the given connection string."""
        self.connection_string = connection_string
        self.container_name = container_name
        self.blob_service_client = BlobServiceClient.from_connection_string(connection_string)
        self.container_client = self.blob_service_client.get_container_client(container_name)

    def extract_word_frequencies_from_blob(self, blob_name):
        """Extract word frequencies from a single blob.

        Returns:
            A ``Counter`` mapping each token to its count. On any error the
            error is printed and an empty ``Counter`` is returned, so one bad
            file does not abort a whole aggregation run.
        """
        try:
            # Download blob content
            blob_client = self.container_client.get_blob_client(blob_name)
            blob_data = blob_client.download_blob().readall()

            # Parse the XML content
            root = ET.fromstring(blob_data.decode('iso-8859-15'))

            # Count the text of every non-empty <w> element
            return Counter(
                w.text for w in root.iterfind('.//dcoi:w', self._NAMESPACES) if w.text
            )
        except Exception as e:
            print(f"Error processing blob {blob_name}: {e}")
            return Counter()

    def aggregate_word_frequencies(self):
        """Aggregate word frequencies from all XML blobs in the container.

        Returns:
            A ``Counter`` with the summed frequencies of all ``.xml`` blobs.
        """
        total_frequencies = Counter()

        # List all XML blobs in the container
        xml_files = [
            blob.name for blob in self.container_client.list_blobs()
            if blob.name.endswith('.xml')
        ]

        print(f"Found {len(xml_files)} XML files in container '{self.container_name}'.")

        for i, blob_name in enumerate(xml_files, start=1):
            print(f"Processing file {i}/{len(xml_files)}: {blob_name}")
            # Counter.update adds counts instead of replacing them.
            total_frequencies.update(self.extract_word_frequencies_from_blob(blob_name))

        return total_frequencies

    def save_aggregated_frequencies_to_csv(self, word_frequencies, output_file):
        """Save aggregated word frequencies to a CSV file.

        Rows are written most frequent first under the header
        ``Word,Frequency``.
        """
        # Sort words by frequency in descending order
        sorted_frequencies = sorted(word_frequencies.items(), key=lambda item: item[1], reverse=True)

        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['Word', 'Frequency'])
            writer.writerows(sorted_frequencies)

        print(f"Word frequencies saved to {output_file}.")

    def get_word_frequency_from_csv(self, word, csv_file):
        """Get the frequency of a single word from the CSV file.

        Args:
            word: Token to look up (exact, case-sensitive match).
            csv_file: Path of a CSV with ``Word`` and ``Frequency`` columns.

        Returns:
            The frequency as an ``int``; 0 when the word is absent or the
            file cannot be read (the error is printed in that case).
        """
        try:
            # Read the Word column strictly as text. Without this, pandas
            # turns tokens such as "null", "NA" or "nan" into missing values
            # and may parse numeric-looking tokens as numbers, so looking
            # those words up would wrongly report 0.
            word_frequencies = pd.read_csv(
                csv_file,
                sep=',',
                on_bad_lines='skip',
                dtype={'Word': str},
                keep_default_na=False,
            )

            # Find the frequency for the given word
            word_df = word_frequencies[word_frequencies['Word'] == word]

            if not word_df.empty:
                frequency = int(word_df['Frequency'].iloc[0])
                print(f"The word '{word}' appears {frequency} time(s) in the CSV file.")
                return frequency
            else:
                print(f"The word '{word}' does not appear in the CSV file.")
                return 0
        except Exception as e:
            print(f"Error reading CSV file {csv_file}: {e}")
            return 0

In [55]:
# Target file for the corpus-wide word frequency table.
output_csv = "../out/word_frequencies_books.csv"

aggregator = BlobWordFrequencyAggregator(connection_string=connection_string, container_name=container_name)

# Sum the per-file counts over all XML blobs, then write them to the CSV.
total_word_frequencies = aggregator.aggregate_word_frequencies()
aggregator.save_aggregated_frequencies_to_csv(
    word_frequencies=total_word_frequencies,
    output_file=output_csv,
)


Found 508 XML files in container 'books'.
Processing file 1/508: WR-P-P-B-0000000001.dcoi.xml
Processing file 2/508: WR-P-P-B-0000000002.dcoi.xml
Processing file 3/508: WR-P-P-B-0000000003.dcoi.xml
Processing file 4/508: WR-P-P-B-0000000004.dcoi.xml
Processing file 5/508: WR-P-P-B-0000000005.dcoi.xml
Processing file 6/508: WR-P-P-B-0000000006.dcoi.xml
Processing file 7/508: WR-P-P-B-0000000007.dcoi.xml
Processing file 8/508: WR-P-P-B-0000000008.dcoi.xml
Processing file 9/508: WR-P-P-B-0000000011.dcoi.xml
Processing file 10/508: WR-P-P-B-0000000012.dcoi.xml
Processing file 11/508: WR-P-P-B-0000000013.dcoi.xml
Processing file 12/508: WR-P-P-B-0000000014.dcoi.xml
Processing file 13/508: WR-P-P-B-0000000015.dcoi.xml
Processing file 14/508: WR-P-P-B-0000000016.dcoi.xml
Processing file 15/508: WR-P-P-B-0000000017.dcoi.xml
Processing file 16/508: WR-P-P-B-0000000018.dcoi.xml
Processing file 17/508: WR-P-P-B-0000000019.dcoi.xml
Processing file 18/508: WR-P-P-B-0000000020.dcoi.xml
Processing fi

In [59]:
# Look up one word in the saved frequency table; the call prints the result
# and, as the last expression of the cell, also shows the returned count.
output_csv = "../out/word_frequencies_books.csv"
word_to_check = "mannen"
aggregator.get_word_frequency_from_csv(word=word_to_check, csv_file=output_csv)

The word 'mannen' appears 10811 time(s) in the CSV file.


10811

### Locating the files that contain a specific word

In [62]:
class BlobWordSearcher:
    """Find the XML blobs of a container in which a given word occurs."""

    # XML namespace used to look up the <w> elements.
    _NAMESPACES = {'dcoi': 'http://lands.let.ru.nl/projects/d-coi/ns/1.0'}

    def __init__(self, connection_string, container_name):
        """Connect to ``container_name`` using the given connection string."""
        self.connection_string = connection_string
        self.container_name = container_name
        self.blob_service_client = BlobServiceClient.from_connection_string(connection_string)
        self.container_client = self.blob_service_client.get_container_client(container_name)

    def find_blobs_with_word(self, target_word, *, exact=False):
        """Find blobs in the container that contain the target word.

        Args:
            target_word: Text to look for in the ``<w>`` elements
                (case-sensitive).
            exact: When False (the default, and the historical behaviour) a
                blob matches if ``target_word`` is a substring of any token,
                so "man" also matches "mannen". When True the token must
                equal ``target_word``.

        Returns:
            The names of the matching ``.xml`` blobs, in listing order.
            Blobs that fail to parse are reported and skipped.
        """
        matching_blobs = []

        # List all XML blobs in the container
        xml_files = [
            blob.name for blob in self.container_client.list_blobs()
            if blob.name.endswith('.xml')
        ]

        print(f"Searching for the word '{target_word}' in {len(xml_files)} XML files.")

        for i, blob_name in enumerate(xml_files, start=1):
            print(f"Processing file {i}/{len(xml_files)}: {blob_name}")

            try:
                # Download and parse the blob
                blob_client = self.container_client.get_blob_client(blob_name)
                blob_data = blob_client.download_blob().readall()
                root_element = ET.fromstring(blob_data.decode('iso-8859-15'))
            except ET.ParseError:
                print(f"Error parsing {blob_name}. Skipping.")
                continue

            # Lazily walk the tokens; any() stops at the first match, so the
            # rest of the file is not scanned once the word is found.
            tokens = (
                w.text for w in root_element.iterfind('.//dcoi:w', self._NAMESPACES)
                if w.text
            )
            if exact:
                found = any(token == target_word for token in tokens)
            else:
                found = any(target_word in token for token in tokens)

            if found:
                matching_blobs.append(blob_name)
                print(f"Word '{target_word}' found in {blob_name}")

        return matching_blobs

In [None]:
# Word to locate in the corpus.
search_word = "mama's"

# Scan the XML blobs and collect the names of those containing the word.
searcher = BlobWordSearcher(connection_string=connection_string, container_name=container_name)
blobs_with_word = searcher.find_blobs_with_word(target_word=search_word)

# Report the outcome: either the matching blob names, one per line, or a
# note that nothing matched.
if not blobs_with_word:
    print(f"No blobs contain the word '{search_word}'.")
else:
    print(f"The word '{search_word}' was found in the following blobs:")
    print(*blobs_with_word, sep="\n")

Searching for the word 'mama's' in 508 XML files.
Processing file 1/508: WR-P-P-B-0000000001.dcoi.xml
Processing file 2/508: WR-P-P-B-0000000002.dcoi.xml
Processing file 3/508: WR-P-P-B-0000000003.dcoi.xml
Processing file 4/508: WR-P-P-B-0000000004.dcoi.xml
Processing file 5/508: WR-P-P-B-0000000005.dcoi.xml
Processing file 6/508: WR-P-P-B-0000000006.dcoi.xml
Processing file 7/508: WR-P-P-B-0000000007.dcoi.xml
Processing file 8/508: WR-P-P-B-0000000008.dcoi.xml
Processing file 9/508: WR-P-P-B-0000000011.dcoi.xml
Processing file 10/508: WR-P-P-B-0000000012.dcoi.xml
Processing file 11/508: WR-P-P-B-0000000013.dcoi.xml
Processing file 12/508: WR-P-P-B-0000000014.dcoi.xml
Processing file 13/508: WR-P-P-B-0000000015.dcoi.xml
Processing file 14/508: WR-P-P-B-0000000016.dcoi.xml
Processing file 15/508: WR-P-P-B-0000000017.dcoi.xml
Processing file 16/508: WR-P-P-B-0000000018.dcoi.xml
Processing file 17/508: WR-P-P-B-0000000019.dcoi.xml
Processing file 18/508: WR-P-P-B-0000000020.dcoi.xml
Proce

### Generating gendered word pairs and professions CSV files

In [76]:
class BlobWordAnalyzer:
    """Derive gendered-word and profession tables from a word-frequency table.

    The analysis methods work on a DataFrame with ``Word`` and ``Frequency``
    columns. Matching against the supplied word lists is case-sensitive; the
    matched words are then lower-cased and their frequencies summed.
    """

    # Lower-cased base form -> every lower-cased surface form counted under it.
    _WORD_MAPPING = {
        "baron": ["baron"],
        "barones": ["barones"],
        "broer": ["broer", "broers"],
        "bruid": ["bruid", "bruiden"],
        "bruidegom": ["bruidegom", "bruidegoms"],
        "dame": ["dame", "dames"],
        "dochter": ["dochter", "dochters"],
        "gentleman": ["gentleman"],
        "gozer": ["gozer", "gozers"],
        "grootmoeder": ["grootmoeder", "grootmoeders"],
        "grootvader": ["grootvader", "grootvaders"],
        "heer": ["heer", "heren"],
        "hertog": ["hertog", "hertogen"],
        "hertogin": ["hertogin", "hertoginnen"],
        "hij": ["hij"],
        "hijzelf": ["hijzelf"],
        "jongen": ["jongen", "jongens"],
        "jongetje": ["jongetje", "jongetjes"],
        "kerel": ["kerel", "kerels"],
        "kleindochter": ["kleindochter", "kleindochters"],
        "kleinzoon": ["kleinzoon", "kleinzonen"],
        "koning": ["koning", "koningen"],
        "koningin": ["koningin", "koninginnen"],
        "lady": ["lady"],
        "lord": ["lord"],
        "mama": ["mama", "mama's", "mamma", "mamma's"],
        "man": ["man", "mannen"],
        "meid": ["meid", "meiden"],
        "meisje": ["meisje", "meisjes"],
        "meneer": ["meneer"],
        "mijnheer": ["mijnheer"],
        "mevrouw": ["mevrouw"],
        "miss": ["miss"],
        "mister": ["mister"],
        "moeder": ["moeder", "moeders"],
        "monnik": ["monnik", "monniken"],
        # "mr" and "mrs" are kept apart: they are the male and female title,
        # so merging them would mix both genders under one base word.
        "mr": ["mr", "mr."],
        "mrs": ["mrs", "mrs."],
        "ms": ["ms", "ms."],
        "neef": ["neef", "neven"],
        "nicht": ["nicht", "nichten"],
        "non": ["non", "nonnen"],
        "oma": ["oma", "oma's"],
        "oom": ["oom", "ooms"],
        "opa": ["opa", "opa's"],
        "papa": ["papa", "papa's", "pappa", "pappa's"],
        "peetmoeder": ["peetmoeder"],
        "peetvader": ["peetvader"],
        "prins": ["prins", "prinsen"],
        "prinses": ["prinses", "prinsessen"],
        "schoondochter": ["schoondochter", "schoondochters"],
        "schoonmoeder": ["schoonmoeder", "schoonmoeders"],
        "schoonvader": ["schoonvader", "schoonvaders"],
        "schoonzoon": ["schoonzoon", "schoonzonen"],
        "stiefdochter": ["stiefdochter", "stiefdochters"],
        "stiefmoeder": ["stiefmoeder", "stiefmoeders"],
        "stiefvader": ["stiefvader", "stiefvaders"],
        "stiefzoon": ["stiefzoon", "stiefzonen"],
        "tante": ["tante", "tantes"],
        "vader": ["vader", "vaders"],
        "vrouw": ["vrouw", "vrouwen"],
        "mannelijk": ["mannelijk", "mannelijke"],
        "vrouwelijk": ["vrouwelijk", "vrouwelijke"],
        "wijf": ["wijf", "wijven"],
        "zij": ["zij"],
        "zijzelf": ["zijzelf"],
        "zoon": ["zoon", "zonen"],
        "zus": ["zus", "zussen"],
    }

    def __init__(self, connection_string, container_name):
        """Store the connection settings and open a container client.

        The analysis methods of this class only use the DataFrames passed to
        them; the clients created here are not used by them.
        """
        self.connection_string = connection_string
        self.container_name = container_name
        self.blob_service_client = BlobServiceClient.from_connection_string(connection_string)
        self.container_client = self.blob_service_client.get_container_client(container_name)

    def load_csv(self, file_path):
        """Load a CSV file into a Pandas DataFrame (malformed lines are skipped)."""
        return pd.read_csv(file_path, sep=',', on_bad_lines='skip')

    @staticmethod
    def _case_folded_counts(word_frequencies, words):
        """Keep the rows whose Word is in ``words``, lower-case them and sum per word."""
        selected = word_frequencies[word_frequencies['Word'].isin(words)].reset_index(drop=True)
        selected['Word'] = selected["Word"].str.lower()
        return selected.groupby(["Word"]).agg({"Frequency": "sum"}).reset_index()

    def analyze_gendered_words(self, word_frequencies, gendered_wordpairs_list, output_csv):
        """Analyze gendered words and save the results to a CSV.

        Args:
            word_frequencies: DataFrame with ``Word`` and ``Frequency`` columns.
            gendered_wordpairs_list: Exact (case-sensitive) word forms to keep.
            output_csv: Path the per-word table is written to.

        Returns:
            A tuple ``(per_word, per_base)``: the per-word table with columns
            ``Word``, ``Frequency`` and ``BaseWord`` (this is what is saved to
            ``output_csv``), and the frequencies summed per ``BaseWord``,
            sorted from most to least frequent.
        """
        gendered_wordpairs = self._case_folded_counts(word_frequencies, gendered_wordpairs_list)

        # Invert the mapping: surface form -> base form.
        reverse_mapping = {
            form: base
            for base, forms in self._WORD_MAPPING.items()
            for form in forms
        }
        base_words = gendered_wordpairs["Word"].map(reverse_mapping)
        # A requested word without a mapping entry is its own base word;
        # otherwise it would get a missing BaseWord and silently vanish from
        # the per-base totals below.
        gendered_wordpairs["BaseWord"] = base_words.fillna(gendered_wordpairs["Word"])

        gendered_wordpairs.to_csv(output_csv, index=False)

        gendered_wordpairs_base = gendered_wordpairs.groupby(["BaseWord"]).agg({"Frequency": "sum"}).reset_index()
        gendered_wordpairs_base = gendered_wordpairs_base.sort_values(by='Frequency', ascending=False)

        return gendered_wordpairs, gendered_wordpairs_base

    def analyze_professions(self, word_frequencies, professions_list, output_csv):
        """Analyze professions and save the results to a CSV.

        Args:
            word_frequencies: DataFrame with ``Word`` and ``Frequency`` columns.
            professions_list: Exact (case-sensitive) word forms to keep.
            output_csv: Path the resulting table is written to.

        Returns:
            The lower-cased professions with their summed frequencies, sorted
            from most to least frequent.
        """
        professions = self._case_folded_counts(word_frequencies, professions_list)
        professions = professions.sort_values(by='Frequency', ascending=False)

        professions.to_csv(output_csv, index=False)
        return professions


In [77]:
# Initialize the word analyzer
analyzer = BlobWordAnalyzer(connection_string, container_name)

# Analyze gendered words, starting from the corpus-wide word frequency table
# written earlier in this notebook.
local_csv_path = "../out/word_frequencies_books.csv"
word_frequencies = analyzer.load_csv(local_csv_path)
# Word forms to keep, written as male/female pairs, one pair per line, with
# separate lines for capitalised and plural variants. The list is only used
# for membership filtering, so forms that appear on several lines (e.g.
# 'meisje', 'lady', 'mevrouw') are not counted twice.
# NOTE(review): some pairs have no capitalised variant here (e.g. 'hertogen',
# 'neven', 'ooms', 'schoonvader') — confirm whether that is intentional.
gendered_wordpairs_list = [
    'man', 'vrouw',
    'Man', 'Vrouw',
    'mannen', 'vrouwen',
    'Mannen', 'Vrouwen',
    'jongen', 'meisje',
    'Jongen', 'Meisje',
    'jongens', 'meisjes',
    'Jongens', 'Meisjes',
    'jongetje', 'meisje',
    'Jongetje', 'Meisje',
    'jongetjes', 'meisjes',
    'Jongetjes', 'Meisjes',
    'kerel', 'meid',
    'Kerel', 'Meid',
    'kerels', 'meiden',
    'Kerels', 'Meiden',
    'gozer', 'meid',
    'Gozer', 'Meid',
    'gozers', 'meiden',
    'Gozers', 'Meiden',
    'gozer', 'wijf',
    'Gozer', 'Wijf',
    'gozers', 'wijven',
    'Gozers', 'Wijven',
    'gentleman', 'lady',
    'Gentleman', 'Lady',
    'lord', 'lady',
    'Lord', 'Lady',
    'heer', 'dame',
    'Heer', 'Dame',
    'heren', 'dames',
    'Heren', 'Dames',
    'meneer', 'mevrouw',
    'Meneer', 'Mevrouw',
    'mijnheer', 'mevrouw',
    'Mijnheer', 'Mevrouw',
    'mister', 'miss',
    'Mister', 'Miss',
    'Mr.', 'Ms.',
    'Mr', 'Ms',
    'Mr.', 'Mrs.',
    'Mr', 'Mrs',
    'mannelijk', 'vrouwelijk',
    'Mannelijk', 'Vrouwelijk',
    'mannelijke', 'vrouwelijke',
    'Mannelijke', 'Vrouwelijke',
    'koning', 'koningin',
    'Koning', 'Koningin',
    'koningen', 'koninginnen',
    'Koningen', 'Koninginnen',
    'prins', 'prinses',
    'Prins', 'Prinses',
    'prinsen', 'prinsessen',
    'Prinsen', 'Prinsessen',
    'hertog', 'hertogin',
    'Hertog', 'Hertogin',
    'hertogen', 'hertoginnen',
    'monnik', 'non',
    'Monnik', 'Non',
    'monniken', 'nonnen',
    'Monniken', 'Nonnen',
    'baron', 'barones',
    'Baron', 'Barones',
    'hij', 'zij',
    'Hij', 'Zij',
    'hijzelf', 'zijzelf',
    'Hijzelf', 'Zijzelf',
    'vader', 'moeder',
    'Vader', 'Moeder',
    'vaders', 'moeders',
    'Vaders', 'Moeders',
    'papa', 'mama',
    'Papa', 'Mama',
    'papa\'s', 'mama\'s',
    'Papa\'s', 'Mama\'s',
    'pappa', 'mamma',
    'Pappa', 'Mamma',
    'pappa\'s', 'mamma\'s',
    'Pappa\'s', 'Mamma\'s',
    'broer', 'zus',
    'Broer', 'Zus',
    'broers', 'zussen',
    'Broers', 'Zussen',
    'neef', 'nicht',
    'Neef', 'Nicht',
    'neven', 'nichten',
    'oom', 'tante',
    'Oom', 'Tante',
    'ooms', 'tantes',
    'grootvader', 'grootmoeder',
    'Grootvader', 'Grootmoeder',
    'grootvaders', 'grootmoeders',
    'Grootvaders', 'Grootmoeders',
    'opa', 'oma',
    'Opa', 'Oma',
    'opa\'s', 'oma\'s',
    'Opa\'s', 'Oma\'s',
    'zoon', 'dochter',
    'Zoon', 'Dochter',
    'zonen', 'dochters',
    'Zonen', 'Dochters',
    'kleinzoon', 'kleindochter',
    'Kleinzoon', 'Kleindochter',
    'kleinzonen', 'kleindochters',
    'schoonzoon', 'schoondochter',
    'Schoonzoon', 'Schoondochter',
    'schoonzonen', 'schoondochters',
    'stiefvader', 'stiefmoeder',
    'Stiefvader', 'Stiefmoeder',
    'stiefvaders', 'stiefmoeders',
    'Stiefvaders', 'Stiefmoeders',
    'stiefzoon', 'stiefdochter',
    'Stiefzoon', 'Stiefdochter',
    'stiefzonen', 'stiefdochters',
    'schoonvader', 'schoonmoeder',
    'schoonvaders', 'schoonmoeders',
    'bruidegom', 'bruid',
    'Bruidegom', 'Bruid',
    'bruidegoms', 'bruiden',
    'peetvader', 'peetmoeder',
    'Peetvader', 'Peetmoeder',
]
# The per-word table is written to this CSV; the call also returns the
# frequencies summed per base word.
gendered_output_csv = "../out/gendered_wordpairs.csv"
gendered_wordpairs, gendered_wordpairs_base = analyzer.analyze_gendered_words(word_frequencies, gendered_wordpairs_list, gendered_output_csv)

# Analyze professions
# Entries that are commented out are excluded from the analysis; this covers
# the whole first group below and individual capitalised variants further down.
professions_list = [
    #"Pedagoog", "Pedagogen", "pedagoog", "pedagogen",
    #"Docent", "Docenten", "docent", "docenten",
    #"leraar", "leraren", "Leraar", "Leraren", 
    #"leerkracht", "leerkrachten", "Leerkracht", "Leerkrachten",
    #"onderwijzer", "onderwijzers", "Onderwijzer", "Onderwijzers",
    #"auteur", "auteurs", "Auteur", "Auteurs",
    #"Kunstenaar", "Kunstenaars", "kunstenaar", "kunstenaars",
    #"journalist", "journalisten", "Journalist", "Journalisten",
    #"fotograaf", "fotografen", "Fotograaf", "Fotografen",
    #"adviseur", "adviseurs", "Adviseur", "Adviseurs",
    #"vertegenwoordiger", "vertegenwoordigers", "Vertegenwoordiger", "Vertegenwoordigers",
    #"verkoper", "Verkoper",
    #"winkelier", "Winkelier",
    #"teamleider", "Teamleider",
    #"hoogleraar", "hoogleraren", "Hoogleraar", "Hoogleraren",
    #"apotheker", "apothekers", "Apotheker", "Apothekers",
    #"assistent", "assistenten", "Assistent", "Assistenten",
    #"conciërge", "conciërges", "Conciërge", "Conciërges",
    
    "technicus", #"Technicus",
    "accountant", "Accountant",
    "supervisor", #"Supervisor",
    "ingenieur", "Ingenieur",
    "arbeider", "Arbeider",
    #"klerk", "Klerk",
    "adviseur", "Adviseur",
    "inspecteur", "Inspecteur",
    "monteur", #"Monteur",
    'manager', 'Manager',
    "administrateur", "Administrateur", 
    "portier", "Portier",
    "psycholoog", "Psycholoog", 
    "arts", "Arts",
    "timmerman", "Timmerman",
    "verpleegkundige", "Verpleegkundige",
    "onderzoeker", "Onderzoeker",
    "barkeeper", #"Barkeeper",
    "specialist", "Specialist",
    "Deskundige", "deskundige",
    "expert", "Expert",
    "elektricien", #"Elektricien",
    "ambtenaar", #"Ambtenaar",
    "officier", "Officier",
    "beambte", #"Beambte",
    "patholoog", #"Patholoog",
    "jurist", "Jurist",
    "rechtsgeleerde", #"Rechtsgeleerde",
    "roostermaker", #"Roostermaker",
    "huisarts", "Huisarts",
    "loodgieter", #"Loodgieter",
    "dierenarts", "Dierenarts",
    "fysiotherapeut", "Fysiotherapeut",
    "scheikundige", "Scheikundige", 
    "machinist", "Machinist",
    "architect", "Architect",
    "bakker", "Bakker",
    "programmeur", #"Programmeur",
    "wetenschapper", #"Wetenschapper",
    "coördinator", "Coördinator",
    "kassier", #"Kassier",
    "schilder", "Schilder",
    "makelaar", "Makelaar",
    "chef", "Chef",
    "dokter", "Dokter",
    ]
# The professions table (lower-cased words with summed frequencies) is written
# to this CSV and also returned.
professions_output_csv = "../out/professions.csv"
professions = analyzer.analyze_professions(word_frequencies, professions_list, professions_output_csv)

### Summarizing the entire corpus

In [78]:
class CorpusSummaryAnalyzer:
    """Compute headline figures for the corpus from previously written CSV files."""

    @staticmethod
    def analyze_corpus_summary(word_frequencies_file, gendered_wordpairs_file, professions_file, wordcount_and_10gram_file):
        """Read the four result CSVs and return the summary figures as a dict.

        The first three files need a ``Frequency`` column and the last one a
        ``ten_grams`` column. Row counts of the files provide the "unique
        words", "gendered words", "professions" and "number of files" figures.
        """
        def read_table(path):
            # All inputs are comma-separated; malformed lines are skipped.
            return pd.read_csv(path, sep=',', on_bad_lines='skip')

        frequencies = read_table(word_frequencies_file)
        gendered = read_table(gendered_wordpairs_file)
        occupations = read_table(professions_file)
        per_file = read_table(wordcount_and_10gram_file)

        # Column totals.
        word_total = frequencies['Frequency'].sum()
        ten_gram_total = per_file['ten_grams'].sum()
        gendered_total, occupation_total = (
            table['Frequency'].sum() for table in (gendered, occupations)
        )

        return {
            "Total Words in Corpus": word_total,
            "Unique Words in Corpus": len(frequencies),
            "Gendered Words": len(gendered),
            "Frequency of Gendered Words": gendered_total,
            "Professions": len(occupations),
            "Frequency of Professions": occupation_total,
            "Number of Book Files": len(per_file),
            "Total 10-Grams": ten_gram_total,
        }

In [79]:
# Gather the headline figures from the four CSV files produced above.
summary = CorpusSummaryAnalyzer.analyze_corpus_summary(
    word_frequencies_file="../out/word_frequencies_books.csv",
    gendered_wordpairs_file="../out/gendered_wordpairs.csv",
    professions_file="../out/professions.csv",
    wordcount_and_10gram_file="../out/wordcount_and_10gram_perfile.csv",
)

# One "label: value" line per figure.
for label, figure in summary.items():
    print(f"{label}: {figure}")

Total Words in Corpus: 26226228
Unique Words in Corpus: 342999
Gendered Words: 122
Frequency of Gendered Words: 523226
Professions: 44
Frequency of Professions: 9431
Number of Book Files: 508
Total 10-Grams: 26221656
