In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import os

# Create the processed-data output directory; exist_ok=True makes this a no-op
# (instead of an error) when the directory is already there.
os.makedirs('../data/processed', exist_ok=True)

def _element_text(elem):
    """Return the stripped text content of ``elem``, or "" if it is missing or blank.

    The text is collected with ``itertext()`` so that content nested inside
    inline child elements is kept instead of being cut off at the first child.
    """
    if elem is None:
        return ""
    return "".join(elem.itertext()).strip()


def parse_iate_tbx(file_path):
    """Parse a TBX file into a DataFrame of Portuguese -> English term pairs.

    For every ``conceptEntry`` the function collects the terms found in the
    ``langSec`` sections whose ``xml:lang`` is exactly ``'pt'`` or ``'en'``
    and emits one row per (PT term, EN term) combination. Entries that lack
    terms in either language produce no rows.

    Parameters
    ----------
    file_path : str or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.

    Returns
    -------
    pandas.DataFrame
        Columns: ``entry_id``, ``pt_term``, ``pt_reliability``, ``en_term``,
        ``en_reliability``, ``subject_field``. The columns are present even
        when no pair was found, so downstream column access does not fail.
        A term without a (non-blank) reliability code gets ``"unknown"``;
        an entry without a subject field gets ``""``.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the document is not well-formed XML.
    """
    ns = {'tbx': 'urn:iso:std:iso:30042:ed-2'}
    xml_lang_attr = '{http://www.w3.org/XML/1998/namespace}lang'
    columns = [
        'entry_id',
        'pt_term',
        'pt_reliability',
        'en_term',
        'en_reliability',
        'subject_field',
    ]

    root = ET.parse(file_path).getroot()

    terminology_data = []

    for concept_entry in root.findall('.//tbx:conceptEntry', ns):
        entry_id = concept_entry.get('id')

        # First subjectField descrip found anywhere below the entry
        subject_field = _element_text(
            concept_entry.find('.//tbx:descrip[@type="subjectField"]', ns)
        )

        # Terms (with reliability codes) grouped by the two languages of interest
        terms_by_lang = {'pt': [], 'en': []}

        for lang_sec in concept_entry.findall('.//tbx:langSec', ns):
            bucket = terms_by_lang.get(lang_sec.get(xml_lang_attr))
            if bucket is None:
                # Not a PT/EN section; its terms are never used
                continue

            for term_sec in lang_sec.findall('.//tbx:termSec', ns):
                # <term> is looked up as a direct child of the termSec
                term_text = _element_text(term_sec.find('tbx:term', ns))
                if not term_text:
                    # Missing or whitespace-only term: nothing to pair
                    continue

                reliability_code = _element_text(
                    term_sec.find('tbx:descrip[@type="reliabilityCode"]', ns)
                ) or "unknown"

                bucket.append({
                    'term': term_text,
                    'reliability': reliability_code
                })

        # Cartesian product PT x EN; empty on either side yields no rows
        for pt_data in terms_by_lang['pt']:
            for en_data in terms_by_lang['en']:
                terminology_data.append({
                    'entry_id': entry_id,
                    'pt_term': pt_data['term'],
                    'pt_reliability': pt_data['reliability'],
                    'en_term': en_data['term'],
                    'en_reliability': en_data['reliability'],
                    'subject_field': subject_field
                })

    # Explicit columns keep the schema stable when there are zero rows
    return pd.DataFrame(terminology_data, columns=columns)

# Parse the raw IATE export and eyeball the result before cleaning
df_raw = parse_iate_tbx('../data/raw/IATE_Trade.tbx')

print("Raw data sample:")
print(df_raw.head(10))

# Same preview for both term columns: heading, then the first ten distinct values
for heading, column in (
    ("\nUnique PT terms (first 10):", 'pt_term'),
    ("\nUnique EN terms (first 10):", 'en_term'),
):
    print(heading)
    print(df_raw[column].unique()[:10])

Raw data sample:
  entry_id                                            pt_term pt_reliability  \
0   845665  Convenção Internacional para a Simplificação d...              6   
1   753589                                 leite de transição              1   
2   753589                                 leite de transição              1   
3  3530433                       inovação nova para a empresa              9   
4  1161665                       sistema de concessão inicial              9   
5   784228                                            COCERAL              6   
6   784228                                            COCERAL              6   
7   784228  Comité do Comércio de Cereais e de Alimentos p...              6   
8   784228  Comité do Comércio de Cereais e de Alimentos p...              6   
9  1202984                                  manteiga de cacau              9   

                                             en_term en_reliability  \
0  International Convention rel

In [4]:
def clean_terminology_data(df, min_term_length=3):
    """Drop very short terms and exact duplicate rows from a term-pair frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string columns ``pt_term`` and ``en_term``.
    min_term_length : int, default 3
        Minimum number of characters a term must have in *both* columns for
        the row to be kept. Rows where either term is missing are dropped,
        because a missing length never satisfies the comparison.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with a fresh 0..n-1 index.
        Row counts after each step are printed as a progress report.
    """
    print("=== Data Cleaning Process ===")
    print(f"Initial dataset: {len(df)} rows")

    # Keep a row only when both sides of the pair are long enough
    long_enough = (
        (df['pt_term'].str.len() >= min_term_length)
        & (df['en_term'].str.len() >= min_term_length)
    )
    df_cleaned = df[long_enough]

    print(f"After removing very short terms: {len(df_cleaned)} rows")

    # Rows identical across every column are collapsed into one
    df_cleaned = df_cleaned.drop_duplicates()

    print(f"After removing duplicates: {len(df_cleaned)} rows")

    return df_cleaned.reset_index(drop=True)

# Run the cleaning pass over the parsed term pairs
df_clean = clean_terminology_data(df_raw)

# Persist the cleaned pairs as CSV (row index omitted)
df_clean.to_csv('../data/processed/iate_terminology_clean.csv', index=False)
print(f"\nSaved {len(df_clean)} term pairs")

print("\n=== Sample cleaned data ===")
print(df_clean.head(10))

# Frequency of each reliability code per language, ordered by code
print("\n=== Reliability Code Distribution ===")
for heading, column in (
    ("Portuguese terms:", 'pt_reliability'),
    ("\nEnglish terms:", 'en_reliability'),
):
    print(heading)
    print(df_clean[column].value_counts().sort_index())

# Headline counts for the cleaned dataset
print(f"\n=== Summary Statistics ===")
print(f"Total term pairs: {len(df_clean)}")
print(f"Unique concepts: {df_clean['entry_id'].nunique()}")
print(f"Unique Portuguese terms: {df_clean['pt_term'].nunique()}")
print(f"Unique English terms: {df_clean['en_term'].nunique()}")

=== Data Cleaning Process ===
Initial dataset: 14599 rows
After removing very short terms: 14494 rows
After removing duplicates: 14475 rows

Saved 14475 term pairs

=== Sample cleaned data ===
  entry_id                                            pt_term pt_reliability  \
0   845665  Convenção Internacional para a Simplificação d...              6   
1   753589                                 leite de transição              1   
2   753589                                 leite de transição              1   
3  3530433                       inovação nova para a empresa              9   
4  1161665                       sistema de concessão inicial              9   
5   784228                                            COCERAL              6   
6   784228                                            COCERAL              6   
7   784228  Comité do Comércio de Cereais e de Alimentos p...              6   
8   784228  Comité do Comércio de Cereais e de Alimentos p...              6   
9  1202