In [8]:
import csv

def read_ann_file(ann_file):
    """
    Read an .ann file and return the (label, text) pair of every 'T' line.

    Each line is expected to hold three tab-separated fields:
    an ID, a "label offsets..." field, and the annotated text. Only lines
    whose ID starts with 'T' are kept; everything else (e.g. relation lines)
    is ignored.

    Args:
        ann_file: Path to the .ann file; it is opened as UTF-8.

    Returns:
        A list of (label, text) tuples in file order. The label is the first
        space-separated token of the second field; the text is the third field.

    Raises:
        OSError: If the file cannot be opened (propagated from open()).
    """
    annotations = []
    with open(ann_file, 'r', encoding='utf-8') as f:
        for line in f:
            # maxsplit=2 keeps the annotated text in one piece even if it
            # happens to contain a tab character itself.
            parts = line.strip().split("\t", 2)
            # A 'T' line without a text field is malformed: skip it instead of
            # reporting the "label offsets" field as if it were the text.
            if len(parts) < 3 or not parts[0].startswith('T'):
                continue
            label = parts[1].split(" ")[0]  # First token of the second field
            text = parts[2]  # Annotated text
            annotations.append((label, text))
    return annotations

def create_csv_from_ann(text_file, ann_file, output_csv):
    """
    Export the annotations of one .ann file to a two-column CSV.

    Args:
        text_file: Path to the text file paired with the annotations. It is
            opened and read as UTF-8, but its content is not used for the
            output; a missing or undecodable file still raises.
        ann_file: Path to the .ann file, parsed by read_ann_file().
        output_csv: Path of the CSV to write (overwritten if it exists).

    The CSV starts with a 'Text', 'Label' header row followed by one row per
    annotation, in the order read_ann_file() returns them.
    """
    # Load the paired text first; nothing below consumes the content, so this
    # step only surfaces missing-file / decoding errors early.
    with open(text_file, 'r', encoding='utf-8') as source:
        source.read()

    # read_ann_file() yields (label, text); the CSV wants text first.
    rows = [[snippet, tag] for tag, snippet in read_ann_file(ann_file)]

    with open(output_csv, 'w', newline='', encoding='utf-8') as out:
        sheet = csv.writer(out)
        # Header row followed by every annotation row.
        sheet.writerows([['Text', 'Label'], *rows])

# Example usage: export the annotations of a single document to a CSV file.
# All three paths are relative, so they are resolved against the current
# working directory of the kernel.
text_file = 'A01.txt'  # Text file paired with the .ann file below
ann_file = 'A01.ann'   # Annotation file whose 'T' lines are exported
output_csv = 'output.csv'   # Destination CSV; opened in 'w' mode, so an existing file is overwritten

create_csv_from_ann(text_file, ann_file, output_csv)


In [2]:
import os
import csv

def read_ann_file(ann_file):
    """
    Read an .ann file and return the (label, text) pair of every 'T' line.

    Each line is expected to hold three tab-separated fields:
    an ID, a "label offsets..." field, and the annotated text. Only lines
    whose ID starts with 'T' are kept; everything else (e.g. relation lines)
    is ignored.

    Args:
        ann_file: Path to the .ann file; it is opened as UTF-8.

    Returns:
        A list of (label, text) tuples in file order. The label is the first
        space-separated token of the second field; the text is the third field.

    Raises:
        OSError: If the file cannot be opened (propagated from open()).
    """
    annotations = []
    with open(ann_file, 'r', encoding='utf-8') as f:
        for line in f:
            # maxsplit=2 keeps the annotated text in one piece even if it
            # happens to contain a tab character itself.
            parts = line.strip().split("\t", 2)
            # A 'T' line without a text field is malformed: skip it instead of
            # reporting the "label offsets" field as if it were the text.
            if len(parts) < 3 or not parts[0].startswith('T'):
                continue
            label = parts[1].split(" ")[0]  # First token of the second field
            text = parts[2]  # Annotated text
            annotations.append((label, text))
    return annotations

def create_csv_from_folder(corpus_folder, output_csv):
    """
    Collect the annotations of every .ann file in a folder into one CSV.

    An .ann file is only exported when a .txt file with the same base name
    exists next to it; the content of that .txt file is not read.

    Args:
        corpus_folder: Folder that is scanned (non-recursively) for .ann files.
        output_csv: Path of the CSV to write (overwritten if it exists).

    The CSV starts with a 'Text', 'Label' header row, followed by one row per
    annotation returned by read_ann_file(). Files are processed in sorted
    name order so that repeated runs produce the same row order.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
    """
    ann_suffix = '.ann'

    # Open the output CSV file for writing
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Text', 'Label'])  # CSV headers

        # os.listdir() returns names in arbitrary order; sort for reproducibility.
        for filename in sorted(os.listdir(corpus_folder)):
            if not filename.endswith(ann_suffix):
                continue

            # Strip only the trailing suffix. str.replace() would also remove
            # '.ann' from the middle of a name such as 'doc.annotated.ann'.
            base_filename = filename[:-len(ann_suffix)]
            ann_file = os.path.join(corpus_folder, filename)
            text_file = os.path.join(corpus_folder, base_filename + '.txt')

            # Skip annotation files that have no paired text file.
            if not os.path.isfile(text_file):
                continue

            # read_ann_file() yields (label, text); the CSV wants text first.
            for label, annotated_text in read_ann_file(ann_file):
                writer.writerow([annotated_text, label])

    print(f"CSV file '{output_csv}' created successfully.")

# Example usage: merge the annotations of a whole folder into one CSV file.
# Both paths are relative, so they are resolved against the current working
# directory of the kernel.
# NOTE(review): `output_csv` is also assigned by the single-file example cell;
# whichever cell ran last determines its value.
corpus_folder = 'compiled_corpus'  # Folder scanned for .ann files and their paired .txt files
output_csv = 'compiled_output.csv'  # Destination CSV; opened in 'w' mode, so an existing file is overwritten

create_csv_from_folder(corpus_folder, output_csv)


CSV file 'compiled_output.csv' created successfully.
