# Data exploration on the SoNaR corpus

In [29]:
# imports
import pandas as pd
import xml.etree.ElementTree as ET
from azure.storage.blob import BlobServiceClient
from io import StringIO
import os
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Azure Blob Storage configuration
connection_string = os.getenv("SONAR_STORAGE_KEY")

### Counting the number of words and 10-grams in a subset of the corpus

In [19]:
class BlobProcessor:
    def __init__(self, connection_string, container_name):
        self.connection_string = connection_string
        self.container_name = container_name
        self.blob_service_client = BlobServiceClient.from_connection_string(connection_string)
        self.container_client = self.blob_service_client.get_container_client(container_name)
        self.data = []
    
    def list_blobs(self):
        """List all blobs in the container."""
        return list(self.container_client.list_blobs())
    
    def process_xml_files(self):
        """Process XML files in the container and compute word counts and 10-grams."""
        blobs = self.list_blobs()
        xml_files = [blob.name for blob in blobs if blob.name.endswith('.xml')]
        print(f"XML files: {xml_files}")
        print(f"Processing {len(blobs)} files...")
        print(f"First 5 files: {[blob.name for blob in blobs][:5]}")

        total_files = len(blobs)
        for i, blob in enumerate(blobs, start=1):
            blob_name = blob.name
            print(f"Processing file {blob_name} ({i}/{total_files})")

            if blob_name.endswith('.xml'):
                try:
                    # Download blob content
                    blob_client = self.container_client.get_blob_client(blob_name)
                    blob_data = blob_client.download_blob().readall()
                    print(f"Blob data: {blob_data[:100]}")  # Preview first 100 bytes

                    # Parse XML content
                    xml_content = blob_data.decode('iso-8859-15')
                    tree = ET.ElementTree(ET.fromstring(xml_content))
                    root = tree.getroot()

                    # Count <w> elements
                    word_count = len(root.findall('.//dcoi:w', {'dcoi': 'http://lands.let.ru.nl/projects/d-coi/ns/1.0'}))
                except ET.ParseError as e:
                    print(f"Error parsing {blob_name}: {e}")
                    word_count = 0

                # Calculate the number of 10-grams
                ten_grams = max(0, word_count - 10 + 1)
                
                # Append results
                self.data.append({
                    'filename': blob_name,
                    'word_count': word_count,
                    'ten_grams': ten_grams
                })

                print(f"File {blob_name} processed: {i}/{total_files}")

    def get_dataframe(self):
        """Return the data as a Pandas DataFrame."""
        return pd.DataFrame(self.data)
    
    def get_total_ten_grams(self):
        """Calculate the total number of 10-grams."""
        if not self.data:
            return 0
        df = self.get_dataframe()
        return df['ten_grams'].sum()
    
    def summary(self):
        """Print a summary of the processing results."""
        df = self.get_dataframe()
        print(df)
        print(f"Total 10-grams: {self.get_total_ten_grams()}")

In [20]:
container_name = "books"
    
# Initialize BlobProcessor
processor = BlobProcessor(connection_string, container_name)

# Process XML files
processor.process_xml_files()

# Print summary
processor.summary()

XML files: ['WR-P-P-B-0000000001.dcoi.xml', 'WR-P-P-B-0000000002.dcoi.xml', 'WR-P-P-B-0000000003.dcoi.xml', 'WR-P-P-B-0000000004.dcoi.xml', 'WR-P-P-B-0000000005.dcoi.xml', 'WR-P-P-B-0000000006.dcoi.xml', 'WR-P-P-B-0000000007.dcoi.xml', 'WR-P-P-B-0000000008.dcoi.xml', 'WR-P-P-B-0000000011.dcoi.xml', 'WR-P-P-B-0000000012.dcoi.xml', 'WR-P-P-B-0000000013.dcoi.xml', 'WR-P-P-B-0000000014.dcoi.xml', 'WR-P-P-B-0000000015.dcoi.xml', 'WR-P-P-B-0000000016.dcoi.xml', 'WR-P-P-B-0000000017.dcoi.xml', 'WR-P-P-B-0000000018.dcoi.xml', 'WR-P-P-B-0000000019.dcoi.xml', 'WR-P-P-B-0000000020.dcoi.xml', 'WR-P-P-B-0000000021.dcoi.xml', 'WR-P-P-B-0000000022.dcoi.xml', 'WR-P-P-B-0000000023.dcoi.xml', 'WR-P-P-B-0000000024.dcoi.xml', 'WR-P-P-B-0000000025.dcoi.xml', 'WR-P-P-B-0000000026.dcoi.xml', 'WR-P-P-B-0000000027.dcoi.xml', 'WR-P-P-B-0000000028.dcoi.xml', 'WR-P-P-B-0000000029.dcoi.xml', 'WR-P-P-B-0000000030.dcoi.xml', 'WR-P-P-B-0000000031.dcoi.xml', 'WR-P-P-B-0000000032.dcoi.xml', 'WR-P-P-B-0000000033.dcoi.xm

In [21]:
container_name = "press-releases"
    
# Initialize BlobProcessor
processor = BlobProcessor(connection_string, container_name)

# Process XML files
processor.process_xml_files()

# Print summary
processor.summary()

XML files: ['WR-P-E-F-0000000001.dcoi.xml', 'WR-P-E-F-0000000002.dcoi.xml', 'WR-P-E-F-0000000003.dcoi.xml', 'WR-P-E-F-0000000004.dcoi.xml', 'WR-P-E-F-0000000005.dcoi.xml', 'WR-P-E-F-0000000006.dcoi.xml', 'WR-P-E-F-0000000007.dcoi.xml', 'WR-P-E-F-0000000008.dcoi.xml', 'WR-P-E-F-0000000009.dcoi.xml', 'WR-P-E-F-0000000010.dcoi.xml', 'WR-P-E-F-0000000011.dcoi.xml', 'WR-P-E-F-0000000012.dcoi.xml', 'WR-P-E-F-0000000013.dcoi.xml', 'WR-P-E-F-0000000014.dcoi.xml', 'WR-P-E-F-0000000015.dcoi.xml', 'WR-P-E-F-0000000016.dcoi.xml', 'WR-P-E-F-0000000017.dcoi.xml', 'WR-P-E-F-0000000018.dcoi.xml', 'WR-P-E-F-0000000019.dcoi.xml', 'WR-P-E-F-0000000020.dcoi.xml', 'WR-P-E-F-0000000021.dcoi.xml', 'WR-P-E-F-0000000022.dcoi.xml', 'WR-P-E-F-0000000023.dcoi.xml', 'WR-P-E-F-0000000024.dcoi.xml', 'WR-P-E-F-0000000025.dcoi.xml', 'WR-P-E-F-0000000026.dcoi.xml', 'WR-P-E-F-0000000027.dcoi.xml', 'WR-P-E-F-0000000028.dcoi.xml', 'WR-P-E-F-0000000029.dcoi.xml', 'WR-P-E-F-0000000030.dcoi.xml', 'WR-P-E-F-0000000151.dcoi.xm

### Inspecting full text and topics

In [27]:
class BlobTextExtractor:
    def __init__(self, connection_string, container_name):
        self.connection_string = connection_string
        self.container_name = container_name
        self.blob_service_client = BlobServiceClient.from_connection_string(connection_string)
        self.container_client = self.blob_service_client.get_container_client(container_name)

    def extract_text_from_blob(self, blob_name):
        """Extract text from <w> elements in a single blob."""
        try:
            # Download blob content
            blob_client = self.container_client.get_blob_client(blob_name)
            blob_data = blob_client.download_blob().readall()
            
            # Parse the XML content
            tree = ET.ElementTree(ET.fromstring(blob_data.decode('iso-8859-15')))
            root = tree.getroot()
            
            # Extract text from <w> elements
            word_elements = root.findall('.//dcoi:w', {'dcoi': 'http://lands.let.ru.nl/projects/d-coi/ns/1.0'})
            return ' '.join(w.text for w in word_elements if w.text)
        except Exception as e:
            print(f"Error processing blob {blob_name}: {e}")
            return ""

    def save_texts_to_single_file(self, output_file):
        """Extract text from all XML blobs in the container and save to a single file."""
        with open(output_file, 'w', encoding='utf-8') as outfile:
            # List all blobs in the container
            blobs = list(self.container_client.list_blobs())
            xml_files = [blob.name for blob in blobs if blob.name.endswith('.xml')]

            print(f"Found {len(xml_files)} XML files in container '{self.container_name}'.")

            for i, blob_name in enumerate(xml_files, start=1):
                print(f"Processing file {i}/{len(xml_files)}: {blob_name}")

                # Extract text from the blob
                text = self.extract_text_from_blob(blob_name)

                # Write the title (blob name without extension), text, and blank line to the output file
                outfile.write(f"{os.path.splitext(blob_name)[0]}\n")  # Write the title
                outfile.write(text)  # Write the extracted text
                outfile.write("\n\n")  # Add a blank line between entries
                
                print(f"Added text from {blob_name} to the combined file.")
        
        print(f"All texts saved to a single file at {output_file}.")

In [28]:
container_name = "press-releases"
output_file = "../out/combined.txt"

# Initialize the BlobTextExtractor
extractor = BlobTextExtractor(connection_string, container_name)

# Extract texts and save to a single output file
extractor.save_texts_to_single_file(output_file)

Found 1053 XML files in container 'press-releases'.
Processing file 1/1053: WR-P-E-F-0000000001.dcoi.xml
Added text from WR-P-E-F-0000000001.dcoi.xml to the combined file.
Processing file 2/1053: WR-P-E-F-0000000002.dcoi.xml
Added text from WR-P-E-F-0000000002.dcoi.xml to the combined file.
Processing file 3/1053: WR-P-E-F-0000000003.dcoi.xml
Added text from WR-P-E-F-0000000003.dcoi.xml to the combined file.
Processing file 4/1053: WR-P-E-F-0000000004.dcoi.xml
Added text from WR-P-E-F-0000000004.dcoi.xml to the combined file.
Processing file 5/1053: WR-P-E-F-0000000005.dcoi.xml
Added text from WR-P-E-F-0000000005.dcoi.xml to the combined file.
Processing file 6/1053: WR-P-E-F-0000000006.dcoi.xml
Added text from WR-P-E-F-0000000006.dcoi.xml to the combined file.
Processing file 7/1053: WR-P-E-F-0000000007.dcoi.xml
Added text from WR-P-E-F-0000000007.dcoi.xml to the combined file.
Processing file 8/1053: WR-P-E-F-0000000008.dcoi.xml
Added text from WR-P-E-F-0000000008.dcoi.xml to the com

### Counting per specific word