# Data exploration on the SoNaR corpus
### Extracting the lemmatized words (lemmas) from the corpus instead of the full word forms

In [4]:
import pandas as pd
import xml.etree.ElementTree as ET
from azure.storage.blob import BlobServiceClient
import os

# Azure Blob Storage connection string, read from the environment so that the
# secret never lives in the notebook. The value is None when the variable is unset.
connection_string = os.environ.get("SONAR_STORAGE_KEY")

In [5]:
# Name of the blob container to analyse; it is also embedded in every output path.
container_name = "books"

# Per-file word counts and 10-gram counts (CSV).
output_file_wordcount_and_10gram_perfile = (
    f"../outlemma/{container_name}/wordcount_and_10gram_perfile_{container_name}.csv"
)
# All lemmatized texts of the container concatenated into one text file.
output_file_fulltext = (
    f"../outlemma/{container_name}/combined_{container_name}.txt"
)
# Word frequency table (CSV).
output_csv_wordfrequencies = (
    f"../outlemma/{container_name}/word_frequencies_{container_name}.csv"
)
# Gendered word pairs (CSV).
gendered_output_csv = (
    f"../outlemma/{container_name}/gendered_wordpairs_{container_name}.csv"
)
# Profession terms (CSV).
professions_output_csv = (
    f"../outlemma/{container_name}/professions_{container_name}.csv"
)

### Counting the number of words and 10-grams in a subset of the corpus

In [8]:
class BlobProcessor:
    """Count lemmas and 10-grams for every XML blob in an Azure Blob Storage container.

    Each processed blob contributes one record to ``self.data`` with the keys
    ``filename``, ``word_count``, ``ten_grams`` and ``lemmas``. The records can
    be viewed as a DataFrame, summarised, or written to ``output_file`` as CSV.
    """

    # Namespace map used to locate the <w> (word) elements in the XML documents.
    DCOI_NAMESPACES = {'dcoi': 'http://lands.let.ru.nl/projects/d-coi/ns/1.0'}
    # Window size of the n-grams counted under the 'ten_grams' key.
    NGRAM_SIZE = 10

    def __init__(self, connection_string, container_name, output_file):
        """Connect to the container and prepare an empty result list.

        Args:
            connection_string: Azure Storage connection string.
            container_name: Name of the blob container to read from.
            output_file: Path of the CSV file written by ``save_dataframe``.
        """
        self.connection_string = connection_string
        self.container_name = container_name
        self.blob_service_client = BlobServiceClient.from_connection_string(connection_string)
        self.container_client = self.blob_service_client.get_container_client(container_name)
        self.data = []
        self.output_file = output_file

    def list_blobs(self):
        """List all blobs in the container."""
        return list(self.container_client.list_blobs())

    def process_xml_files(self):
        """Process XML files in the container and extract lemmas from <w> elements.

        For every blob whose name ends in ``.xml`` the content is downloaded,
        decoded as ISO-8859-15 and parsed. One record per successfully parsed
        file is appended to ``self.data``. Files that are not well-formed XML
        are reported on stdout and skipped; download errors propagate.
        """
        xml_files = [blob.name for blob in self.list_blobs() if blob.name.endswith('.xml')]
        total_files = len(xml_files)
        print(f"Processing {total_files} XML files...")

        for i, blob_name in enumerate(xml_files, start=1):
            print(f"Processing file {blob_name} ({i}/{total_files})")

            # Download blob content
            blob_client = self.container_client.get_blob_client(blob_name)
            blob_data = blob_client.download_blob().readall()
            xml_content = blob_data.decode('iso-8859-15')

            # Only the parse step can raise ParseError, so keep the try body minimal.
            try:
                root = ET.fromstring(xml_content)
            except ET.ParseError as e:
                print(f"Error parsing {blob_name}: {e}")
                continue

            # A <w> element without a lemma attribute contributes an empty
            # string, so word_count equals the number of <w> elements.
            lemmas = [
                w.get('lemma', '')
                for w in root.findall('.//dcoi:w', self.DCOI_NAMESPACES)
            ]
            word_count = len(lemmas)

            # A sequence of n items has n - k + 1 windows of size k (never negative).
            ten_grams = max(0, word_count - self.NGRAM_SIZE + 1)

            self.data.append({
                'filename': blob_name,
                'word_count': word_count,
                'ten_grams': ten_grams,
                'lemmas': ', '.join(lemmas)  # Store lemmas as a comma-separated string
            })

            print(f"File {blob_name} processed: {i}/{total_files}")

    def get_dataframe(self):
        """Return the data as a Pandas DataFrame."""
        return pd.DataFrame(self.data)

    def save_dataframe(self):
        """Save the DataFrame to ``self.output_file`` as CSV (without the index).

        The parent directory is created when it does not exist yet, so a fresh
        checkout does not fail on the first run.
        """
        df = self.get_dataframe()
        # Only path-like targets have a directory to create; anything else is
        # handed to pandas unchanged.
        if isinstance(self.output_file, (str, os.PathLike)):
            output_dir = os.path.dirname(self.output_file)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
        df.to_csv(self.output_file, index=False)
        print(f"DataFrame saved to {self.output_file}")

    def get_total_ten_grams(self):
        """Calculate the total number of 10-grams over all processed files.

        Returns 0 when nothing has been processed yet (an empty DataFrame has
        no 'ten_grams' column to sum).
        """
        if not self.data:
            return 0
        df = self.get_dataframe()
        return df['ten_grams'].sum()

    def summary(self):
        """Print the per-file DataFrame followed by the total number of 10-grams."""
        df = self.get_dataframe()
        print(df)
        print(f"Total 10-grams: {self.get_total_ten_grams()}")

In [9]:
# Build the processor for the configured container; results go to the per-file CSV path.
processor = BlobProcessor(
    connection_string=connection_string,
    container_name=container_name,
    output_file=output_file_wordcount_and_10gram_perfile,
)

# Download and analyse every XML blob in the container.
processor.process_xml_files()

# Show the per-file table and the total number of 10-grams.
processor.summary()

# Persist the per-file table as CSV.
processor.save_dataframe()

Processing 508 XML files...
Processing file WR-P-P-B-0000000001.dcoi.xml (1/508)
File WR-P-P-B-0000000001.dcoi.xml processed: 1/508
Processing file WR-P-P-B-0000000002.dcoi.xml (2/508)
File WR-P-P-B-0000000002.dcoi.xml processed: 2/508
Processing file WR-P-P-B-0000000003.dcoi.xml (3/508)
File WR-P-P-B-0000000003.dcoi.xml processed: 3/508
Processing file WR-P-P-B-0000000004.dcoi.xml (4/508)
File WR-P-P-B-0000000004.dcoi.xml processed: 4/508
Processing file WR-P-P-B-0000000005.dcoi.xml (5/508)
File WR-P-P-B-0000000005.dcoi.xml processed: 5/508
Processing file WR-P-P-B-0000000006.dcoi.xml (6/508)
File WR-P-P-B-0000000006.dcoi.xml processed: 6/508
Processing file WR-P-P-B-0000000007.dcoi.xml (7/508)
File WR-P-P-B-0000000007.dcoi.xml processed: 7/508
Processing file WR-P-P-B-0000000008.dcoi.xml (8/508)
File WR-P-P-B-0000000008.dcoi.xml processed: 8/508
Processing file WR-P-P-B-0000000011.dcoi.xml (9/508)
File WR-P-P-B-0000000011.dcoi.xml processed: 9/508
Processing file WR-P-P-B-0000000012.

### Inspecting full text and topics

In [None]:
class BlobTextExtractor:
    """Extract lemmatized text from the XML blobs of an Azure Blob Storage container."""

    # Namespace map used to locate the <w> (word) elements in the XML documents.
    DCOI_NAMESPACES = {'dcoi': 'http://lands.let.ru.nl/projects/d-coi/ns/1.0'}

    def __init__(self, connection_string, container_name):
        """Connect to the container.

        Args:
            connection_string: Azure Storage connection string.
            container_name: Name of the blob container to read from.
        """
        self.connection_string = connection_string
        self.container_name = container_name
        self.blob_service_client = BlobServiceClient.from_connection_string(connection_string)
        self.container_client = self.blob_service_client.get_container_client(container_name)

    def extract_text_from_blob(self, blob_name):
        """Return the lemmas of all <w> elements in one blob, joined by single spaces.

        <w> elements without a (non-empty) ``lemma`` attribute are skipped.
        Any failure (download, decoding, parsing) is reported on stdout and
        an empty string is returned, so one bad blob does not abort a batch.
        """
        try:
            # Download blob content
            blob_client = self.container_client.get_blob_client(blob_name)
            blob_data = blob_client.download_blob().readall()

            # Parse the XML content
            root = ET.fromstring(blob_data.decode('iso-8859-15'))

            # Look each lemma up once, then drop the empty ones.
            lemmas = (
                w.get('lemma', '')
                for w in root.findall('.//dcoi:w', self.DCOI_NAMESPACES)
            )
            return ' '.join(lemma for lemma in lemmas if lemma)
        except Exception as e:
            # Deliberately broad: this is a best-effort extraction step.
            print(f"Error processing blob {blob_name}: {e}")
            return ""

    def save_texts_to_single_file(self, output_file):
        """Extract text from all XML blobs in the container and save to a single file.

        For every blob whose name ends in ``.xml`` the output receives the blob
        name with its last extension removed, the extracted text on the next
        line, and a blank line. The file is written as UTF-8 and its parent
        directory is created when missing.

        Args:
            output_file: Path of the combined text file to (over)write.
        """
        # List the blobs before touching the output file, so that a failed
        # listing does not leave behind an empty or truncated file.
        blobs = list(self.container_client.list_blobs())
        xml_files = [blob.name for blob in blobs if blob.name.endswith('.xml')]
        total_files = len(xml_files)

        print(f"Found {total_files} XML files in container '{self.container_name}'.")

        # Make sure the target directory exists; open() would otherwise raise
        # FileNotFoundError on a fresh checkout.
        if isinstance(output_file, (str, os.PathLike)):
            output_dir = os.path.dirname(output_file)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

        with open(output_file, 'w', encoding='utf-8') as outfile:
            for i, blob_name in enumerate(xml_files, start=1):
                print(f"Processing file {i}/{total_files}: {blob_name}")

                # Extract text from the blob
                text = self.extract_text_from_blob(blob_name)

                # Title line (blob name minus its last extension), the text, then a blank line.
                outfile.write(f"{os.path.splitext(blob_name)[0]}\n")
                outfile.write(text)
                outfile.write("\n\n")

                print(f"Added text from {blob_name} to the combined file.")

        print(f"All texts saved to a single file at {output_file}.")