In [138]:
import re
import csv
from collections import defaultdict
import os

In [139]:
def parse_claims(file_path):
    """Parse a claims file into a list of claim dictionaries.

    Every line is matched (from its start) against the shape::

        Claim <digits>: |{<specifications>}, <metric>, <value>|

    where ``<specifications>`` is a sequence of ``|<name>, <value>|`` pairs.

    Args:
        file_path: Path of the text file to read. The file is decoded as UTF-8.

    Returns:
        A list with one dict per matching line, each holding:
          * ``'specifications'``: dict mapping the stripped, lower-cased
            specification name to its stripped value. If a name repeats within
            one claim, the last pair wins.
          * ``'metric'``: the stripped metric text.
          * ``'value'``: the stripped value text (kept as a string).

    Lines that do not match (blank lines included) are not added to the result;
    a warning is printed for each of them instead.
    """
    # Patterns are loop-invariant, so build them once per call.
    claim_pattern = re.compile(r'Claim \d+: \|\{(.*?)\}, (.*?), (.*?)\|')
    specification_pattern = re.compile(r'\|([^|]+), ([^|]+)\|')

    claims = []
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which differs between machines; the profiling CSV in this file
    # is written as UTF-8 as well.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            match = claim_pattern.match(line)
            if match:
                specifications_raw, metric, value = match.groups()
                specifications = specification_pattern.findall(specifications_raw)
                claims.append({
                    'specifications': {k.strip().lower(): v.strip() for k, v in specifications},
                    'metric': metric.strip(),
                    'value': value.strip()
                })
            else:
                print(f'Warning: could not parse line "{line.strip()}" in {file_path}')
    return claims

In [140]:
def profile_claims(claims):
    """Count occurrences of specification names, their values, and metrics.

    Args:
        claims: Iterable of claim dicts, each providing a ``'specifications'``
            mapping (name -> value) and a ``'metric'`` entry.

    Returns:
        A 3-tuple ``(name_counts, value_counts, metric_counts)``:
          * ``name_counts``: defaultdict mapping specification name -> number
            of claims' specification entries carrying that name.
          * ``value_counts``: nested defaultdict mapping specification name ->
            value -> number of times that value appeared under that name.
          * ``metric_counts``: defaultdict mapping metric -> number of claims
            using it.
    """
    per_name = defaultdict(int)
    per_name_value = defaultdict(lambda: defaultdict(int))
    per_metric = defaultdict(int)

    for entry in claims:
        # Tally the specifications first, then the claim's metric.
        for spec_name, spec_value in entry['specifications'].items():
            per_name[spec_name] = per_name[spec_name] + 1
            bucket = per_name_value[spec_name]
            bucket[spec_value] = bucket[spec_value] + 1

        metric_key = entry['metric']
        per_metric[metric_key] = per_metric[metric_key] + 1

    return per_name, per_name_value, per_metric

In [141]:
def save_profiling_to_csv_and_log(name_distribution, value_distribution, metric_distribution, output_filename):
    """Write the three profiling distributions to a single UTF-8 CSV file.

    The file contains three sections separated by a blank row, each starting
    with a title row and a header row:

      1. specification names and their counts, sorted by name;
      2. (name, value, count) rows, sorted by name and then by value;
      3. metrics and their counts, sorted by metric.

    Args:
        name_distribution: Mapping of specification name -> count.
        value_distribution: Mapping of specification name -> (mapping of
            value -> count).
        metric_distribution: Mapping of metric -> count.
        output_filename: Destination path. Missing parent directories are
            created; an existing file is overwritten.
    """
    # Create the destination directory up front so a fresh checkout does not
    # fail with FileNotFoundError. A bare file name has no parent to create.
    parent_dir = os.path.dirname(output_filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        # Write distributions of 'name'
        writer.writerow(['Distributions of name in specifications'])
        # NOTE: the 'Occurences' spelling is kept as-is so the emitted header
        # stays identical to previously generated reports.
        writer.writerow(['Name', 'Occurences'])
        for name, count in sorted(name_distribution.items()):
            writer.writerow([name, count])

        # Write distributions of 'values'
        writer.writerow([])  # Blank row for separation
        writer.writerow(['Distributions of values for each name of each specification'])
        writer.writerow(['Name', 'Value', 'Occurrences'])
        for name, values in sorted(value_distribution.items()):
            # Sort the values too (like the other sections) so the report does
            # not depend on the order in which the claims were read.
            for value, occurrences in sorted(values.items()):
                writer.writerow([name, value, occurrences])

        # Write distributions of metrics
        writer.writerow([])  # Blank row for separation
        writer.writerow(['Distributions of metrics'])
        writer.writerow(['Metric', 'Occurences'])
        for metric, count in sorted(metric_distribution.items()):
            writer.writerow([metric, count])

In [142]:
input_dir = 'ground_truth'
output_file = 'consegna/profilings/ground_truth.csv'

# Parse claims from every file in the input directory.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# run (and therefore the generated report) reproducible.
claims = []
for filename in sorted(os.listdir(input_dir)):
    input_file = os.path.join(input_dir, filename)
    # Skip anything that is not a regular file (e.g. sub-directories), which
    # parse_claims() could not open.
    if not os.path.isfile(input_file):
        continue
    # Extend the flat list directly instead of collecting per-file lists and
    # flattening them afterwards.
    claims.extend(parse_claims(input_file))

# Profile claims
name_distribution, value_distribution, metric_distribution = profile_claims(claims)

save_profiling_to_csv_and_log(name_distribution, value_distribution, metric_distribution, output_file)
print(f"Profiling results saved to {output_file}")

Profiling results saved to consegna/profilings/ground_truth.csv
