# Installations and Imports

In [35]:
import pandas as pd
import sys
import os
import evaluate

# Sentence Classification

In [None]:
# Put the parent of the current working directory on the import path so that
# the project-local `src` package can be imported from this notebook.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.utils import compute_metrics_for_label

# ---------------------------------------------------------------------------------------------------
# Class distribution of the full dataset
df = pd.read_excel('/home/fantoni/patent-sentence-classification/data/6000_axiomatic_dataset.xlsx')
class_counts = df['sent_tag'].value_counts()
total = class_counts.sum()

labels = ["FUN", "STR", "MIX", "OTH"]
counts = []
percentages = []
for label in labels:
    # A class that never occurs in the sheet is reported with a count of 0.
    n_sentences = class_counts.get(label, 0)
    counts.append(n_sentences)
    percentages.append(n_sentences / total * 100)

# One row per class plus a closing "Total" row.
row_names = labels + ["Total"]
df = pd.DataFrame({
    "Sentence Class": row_names,
    "N. of Sentences": counts + [total],
    "% of Sentences": percentages + [100.00],
})

# ---------------------------------------------------------------------------------------------------
# Agreement results: 'sent_tag_mc' is used as the reference, 'sent_tag_ml' as the prediction
df_agree = pd.read_excel('/home/fantoni/patent-sentence-classification/data/1200_agreement_All.xlsx')
true_labels = df_agree['sent_tag_mc']
pred_labels = df_agree['sent_tag_ml']

# The helper's third return value is used as the F1-score. Every class is
# queried under its own name; the "Total" row is queried with label='All'.
f1_scores = {}
for row_name in row_names:
    metric_label = 'All' if row_name == 'Total' else row_name
    _, _, f1_value = compute_metrics_for_label(
        true_labels, pred_labels, label=metric_label, average='weighted'
    )
    f1_scores[row_name] = f1_value

df['F1-score'] = [f1_scores[row_name] for row_name in row_names]

# Render the numeric columns as strings: thousands separator for the counts,
# two decimals for the percentages, three decimals for the F1-scores.
df['N. of Sentences'] = df['N. of Sentences'].map("{:,}".format)
df['% of Sentences'] = df['% of Sentences'].map("{:.2f}".format)
df['F1-score'] = df['F1-score'].map("{:.3f}".format)

# Persist the table and show it as the cell output
df.to_excel('/home/fantoni/patent-sentence-classification/results/tables/dataset_classification.xlsx', index=False)
df

Unnamed: 0,Sentence Class,N. of Sentences,% of Sentences,F1-score
0,FUN,2222,37.03,0.795
1,STR,2759,45.98,0.803
2,MIX,608,10.13,0.481
3,OTH,411,6.85,0.596
4,Total,6000,100.0,0.739


# NER

In [37]:
# The NER helpers live in a separate project; make its `src` directory importable.
sys.path.append('/home/fantoni/patent-ner/src')
from utils import compute_metrics_exact, compute_metrics_partial

# ===================================================================================================
# Get Entity Counts
df = pd.read_excel('/home/fantoni/patent-sentence-classification/data/6000_axiomatic_dataset.xlsx')
target_entities = ['A', 'D', 'R', 'P', 'AX']

# Count 'B-' labels per entity: every tag starting with 'B-' is counted once.
# Each '<entity>_labels' cell is a comma-separated tag string; empty cells are skipped.
# NOTE: the per-entity Series is deliberately NOT called `labels`, so the
# `labels` list defined by the sentence-classification cell is not clobbered.
entity_counts = {}
for ent in target_entities:
    entity_tags = df[f'{ent}_labels'].dropna().str.split(',').explode()
    entity_counts[ent] = sum(tag.strip().startswith('B-') for tag in entity_tags)
total_count = sum(entity_counts.values())

data = []
for ent, count in entity_counts.items():
    # Guard against a sheet without any 'B-' tag (would otherwise divide by zero).
    share = round((count / total_count) * 100, 2) if total_count else 0.0
    data.append({
        'Entity': ent,
        'N. of Entity': count,
        '% of Entity': share
    })

# Append total row
data.append({
    'Entity': 'Total',
    'N. of Entity': total_count,
    '% of Entity': 100.00
})

df_entity = pd.DataFrame(data)

# ===================================================================================================
# Get results of Agreement
df_agree = pd.read_excel('/home/fantoni/patent-sentence-classification/data/1200_agreement_All.xlsx')

seqeval_metric = evaluate.load("seqeval")

data = []
for target_entity in target_entities:
    # 1. Exact Match: one list of tags per row, '_mc' as reference and '_ml' as prediction
    actual_labels = df_agree[f'{target_entity}_labels_mc'].str.split(',').tolist()
    pred_labels = df_agree[f'{target_entity}_labels_ml'].str.split(',').tolist()
    _, _, f1_exact, _ = compute_metrics_exact(seqeval_metric, pred_labels, actual_labels, target_entity)

    # 2. Partial Match: strip the 'B-'/'I-' prefixes and compare one flat tag list.
    # regex=False pins literal replacement regardless of the installed pandas
    # version (the default of `regex` changed in pandas 2.0).
    df_agree[f'{target_entity}_labels_mc_clean'] = (
        df_agree[f'{target_entity}_labels_mc']
        .str.replace('B-', '', regex=False)
        .str.replace('I-', '', regex=False)
    )
    df_agree[f'{target_entity}_labels_ml_clean'] = (
        df_agree[f'{target_entity}_labels_ml']
        .str.replace('B-', '', regex=False)
        .str.replace('I-', '', regex=False)
    )
    actual_labels = df_agree[f'{target_entity}_labels_mc_clean'].str.split(',').explode().tolist()
    pred_labels = df_agree[f'{target_entity}_labels_ml_clean'].str.split(',').explode().tolist()
    _, _, f1_partial, _ = compute_metrics_partial(pred_labels, actual_labels, target_entity)

    data.append({
        'Entity': target_entity,
        'Exact_F1': f1_exact,
        'Partial_F1': f1_partial
    })

df_agree_result = pd.DataFrame(data)

# Append total (unweighted mean of the per-entity F1 scores)
total_row = {
    'Entity': 'Total',
    'Exact_F1': df_agree_result['Exact_F1'].mean(),
    'Partial_F1': df_agree_result['Partial_F1'].mean()
}

# pd.concat is the supported way to add a row: DataFrame.append was removed in
# pandas 2.0 and the private `_append` is not part of the public API.
df_agree_result = pd.concat([df_agree_result, pd.DataFrame([total_row])], ignore_index=True)

# ===================================================================================================
# Merge counts and agreement scores on the entity name (the 'Total' rows line up too)
df_results = pd.merge(df_entity, df_agree_result, how='left', on='Entity')

# Format numeric columns
df_results['N. of Entity'] = df_results['N. of Entity'].map(lambda x: f"{x:,}")
df_results['% of Entity'] = df_results['% of Entity'].map(lambda x: f"{x:.2f}")
df_results['Exact_F1'] = df_results['Exact_F1'].map(lambda x: f"{x:.3f}")
df_results['Partial_F1'] = df_results['Partial_F1'].map(lambda x: f"{x:.3f}")

# Save
df_results.to_excel('/home/fantoni/patent-sentence-classification/results/tables/dataset_ner.xlsx', index=False)
df_results

Unnamed: 0,Entity,N. of Entity,% of Entity,Exact_F1,Partial_F1
0,A,4372,29.86,0.799,0.804
1,D,2861,19.54,0.671,0.698
2,R,4009,27.38,0.687,0.721
3,P,3168,21.63,0.674,0.75
4,AX,234,1.6,0.548,0.576
5,Total,14644,100.0,0.676,0.71


Unnamed: 0,Entity,N. of Entity,% of Entity,Exact_F1,Partial_F1
0,A,4372,29.86,0.799,0.804
1,D,2861,19.54,0.671,0.698
2,R,4009,27.38,0.687,0.721
3,P,3168,21.63,0.674,0.75
4,AX,234,1.6,0.548,0.576
5,Total,14644,100.0,0.676,0.71


In [None]:
# Agreement scores per entity type, rounded to two decimals.
# Relies on `target_entities`, `compute_metrics_exact` and
# `compute_metrics_partial` being defined by an earlier cell.
df_agree = pd.read_excel('/home/fantoni/patent-sentence-classification/data/1200_agreement_All.xlsx')

seqeval_metric = evaluate.load("seqeval") 

results = []

for target_entity in target_entities:
    # Column names of the two annotations being compared:
    # '_mc' is passed as the reference, '_ml' as the prediction.
    mc_col = f'{target_entity}_labels_mc'
    ml_col = f'{target_entity}_labels_ml'

    # Exact match: one list of tags per row.
    gold_sequences = df_agree[mc_col].str.split(',').tolist()
    predicted_sequences = df_agree[ml_col].str.split(',').tolist()
    _, _, f1_exact, _ = compute_metrics_exact(seqeval_metric, predicted_sequences, gold_sequences, target_entity)

    # Partial match: drop the 'B-' / 'I-' prefixes, store the result in
    # '<column>_clean', then flatten all rows into a single list of tags.
    for source_col in (mc_col, ml_col):
        without_begin = df_agree[source_col].str.replace('B-', '')
        df_agree[f'{source_col}_clean'] = without_begin.str.replace('I-', '')

    gold_tags = df_agree[f'{mc_col}_clean'].str.split(',').explode().tolist()
    predicted_tags = df_agree[f'{ml_col}_clean'].str.split(',').explode().tolist()
    _, _, f1_partial, _ = compute_metrics_partial(predicted_tags, gold_tags, target_entity)

    results.append(dict(
        Entity=target_entity,
        Exact_F1=round(f1_exact, 2),
        Partial_F1=round(f1_partial, 2),
    ))

df_results = pd.DataFrame(results)