In [None]:
!wget https://web-language-models.s3.amazonaws.com/paracrawl/release9/es-eu/es-eu.tmx.gz

--2025-11-29 09:54:51--  https://web-language-models.s3.amazonaws.com/paracrawl/release9/es-eu/es-eu.tmx.gz
Resolving web-language-models.s3.amazonaws.com (web-language-models.s3.amazonaws.com)... 16.182.97.25, 52.217.118.57, 16.15.180.87, ...
Connecting to web-language-models.s3.amazonaws.com (web-language-models.s3.amazonaws.com)|16.182.97.25|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1275028162 (1.2G) [binary/octet-stream]
Saving to: ‘es-eu.tmx.gz’


2025-11-29 09:55:14 (53.8 MB/s) - ‘es-eu.tmx.gz’ saved [1275028162/1275028162]



In [None]:
!gunzip /content/es-eu.tmx.gz

In [None]:
#@title Extraction des paires phrase Basque ↔ Espagnol dont la phrase basque contient "idazkaria"

import csv
import xml.etree.ElementTree as ET
from itertools import islice

def extract_sentences_with_keyword(tmx_path, output_path, keyword='idazkaria'):
    """
    Extract Basque/Spanish sentence pairs whose Basque side contains a keyword.

    The TMX file is streamed with ``iterparse``; each ``<tu>`` is emptied and
    detached from its parent as soon as it has been handled, so processed
    units do not accumulate in memory. Matching pairs are written to a
    two-column CSV and progress is printed along the way.

    Args:
        tmx_path: Path to the uncompressed TMX file.
        output_path: Path of the CSV to create (overwritten if it exists). It
            receives a header row, then one ``[basque, spanish]`` row per match.
        keyword: Substring looked for in the Basque segment, case-insensitively.

    Returns:
        int: Number of sentence pairs written to the CSV.
    """
    # Languages are identified by their primary subtag only, so that a tag such
    # as "ca-ES" (Catalan, Spain) is not mistaken for Spanish because it
    # happens to contain the letters "es".
    basque_codes = {'eu', 'eus', 'baq'}
    spanish_codes = {'es', 'spa'}
    xml_lang_attr = '{http://www.w3.org/XML/1998/namespace}lang'
    needle = keyword.lower()

    # Counters for progress tracking
    total_units = 0
    matched_units = 0

    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Basque (eu)', 'Spanish (es)'])  # Header

        # 'start' events are only used to know each element's parent, which is
        # needed to detach finished <tu> elements from the tree.
        open_elements = []
        for event, elem in ET.iterparse(tmx_path, events=('start', 'end')):
            if event == 'start':
                open_elements.append(elem)
                continue

            open_elements.pop()  # elem itself; its parent is now on top
            if elem.tag != 'tu':  # only translation units are of interest
                continue

            total_units += 1
            eu_text = None
            es_text = None

            # Each tuv (translation unit variant) holds one language version
            for tuv in elem.findall('tuv'):
                # TMX uses xml:lang (1.4) or a plain lang attribute (older files)
                lang = tuv.get(xml_lang_attr) or tuv.get('lang', '')
                primary_subtag = lang.lower().replace('_', '-').split('-')[0]

                seg = tuv.find('seg')
                if seg is None:
                    continue

                # itertext() also collects text inside and after inline markup
                # (<hi>, <bpt>, ...); seg.text alone stops at the first child tag.
                text = ''.join(seg.itertext()).strip()

                if primary_subtag in basque_codes:
                    eu_text = text
                elif primary_subtag in spanish_codes:
                    es_text = text

            # Keep the pair only when both sides exist and the Basque side
            # contains the keyword (case-insensitive)
            if eu_text and es_text and needle in eu_text.lower():
                writer.writerow([eu_text, es_text])
                matched_units += 1

                # Print progress every 100 matches
                if matched_units % 100 == 0:
                    print(f"Found {matched_units} matches so far...")

            # Free the unit: clear() drops its children, and removing it from
            # the parent stops emptied <tu> shells from piling up there.
            elem.clear()
            if open_elements:
                open_elements[-1].remove(elem)

            # Print progress every 1 million units
            if total_units % 1000000 == 0:
                print(f"Processed {total_units:,} translation units...")

    print(f"\n{'='*50}")
    print("Processing complete!")
    print(f"Total translation units processed: {total_units:,}")
    print(f"Sentences containing '{keyword}': {matched_units:,}")
    print(f"Output saved to: {output_path}")

    return matched_units

# Run the extraction.
# The keyword is named once so the banner below and the actual call cannot drift apart.
tmx_file = "/content/es-eu.tmx"
output_file = "/content/idazkaria_sentences.csv"
search_keyword = 'idazkaria'

print("Starting TMX extraction...")
print(f"Looking for sentences containing: '{search_keyword}'")
print(f"Input file: {tmx_file}")
print(f"Output file: {output_file}")
print("="*50 + "\n")

# Returns the number of pairs written to output_file
matches = extract_sentences_with_keyword(tmx_file, output_file, search_keyword)

# Display sample results
print("\n" + "="*50)
print("Sample of extracted sentences:")
print("="*50)

# Read the CSV back to check what was actually written to disk
import pandas as pd
df = pd.read_csv(output_file)
print(f"\nTotal rows in CSV: {len(df)}")
print("\nFirst 5 examples:")
display(df.head())

Starting TMX extraction...
Looking for sentences containing: 'idazkaria'
Input file: /content/es-eu.tmx
Output file: /content/idazkaria_sentences.csv

Found 100 matches so far...
Found 200 matches so far...
Found 300 matches so far...
Found 400 matches so far...
Processed 1,000,000 translation units...
Found 500 matches so far...
Found 600 matches so far...
Found 700 matches so far...
Found 800 matches so far...
Processed 2,000,000 translation units...
Found 900 matches so far...
Found 1000 matches so far...
Found 1100 matches so far...
Found 1200 matches so far...
Processed 3,000,000 translation units...
Found 1300 matches so far...

Processing complete!
Total translation units processed: 3,344,372
Sentences containing 'idazkaria': 1,376
Output saved to: /content/idazkaria_sentences.csv

Sample of extracted sentences:

Total rows in CSV: 1376

First 5 examples:


Unnamed: 0,Basque (eu),Spanish (es)
0,David Saldoni Generalitateko Garraio eta Mugik...,También han sido elegidos vicepresidentes la p...
1,Aita Menniren Bilboko Neurorrehabilitazio Zent...,DKV Seguros ha entregado 10.000 euros a fisiot...
2,Bizkaiko Liga Federatuetako Azken Faseak eta B...,"La solicitud para organizar Fases Federación, ..."
3,"Estatuko Berdintasunerako Idazkariak, Soledad ...","La secretaria de Estado para la Igualdad, Sole..."
4,"Baina, herri txiki askotan bake epailea udalet...","Pero en muchos pueblos pequeños, el juez de pa..."


In [None]:
import pandas as pd

# Read back the Basque/Spanish pairs written by the extraction cell.
df = pd.read_csv('/content/idazkaria_sentences.csv')

# Regex alternation of the two slash-marked spellings; matched without regard to case.
keyword_pattern = r'Secretario/a|Secretaria/o'

# Flag every row whose Spanish text contains either spelling (missing values are
# treated as "no match"), then add up the flags. This is a count of rows: a
# sentence containing the pattern several times still counts once.
spanish_has_inclusive_form = df['Spanish (es)'].str.contains(
    keyword_pattern,
    case=False,
    na=False,
)
count = spanish_has_inclusive_form.sum()

print(f"Number of occurrences of 'Secretario/a' or 'Secretaria/o' in the Spanish column: {count}")

Number of occurrences of 'Secretario/a' or 'Secretaria/o' in the Spanish column: 82


In [None]:
import pandas as pd

# Start again from the extraction output so this cell does not depend on what
# other cells did to `df`.
df = pd.read_csv('/content/idazkaria_sentences.csv')

# Same two slash-marked spellings as in the counting cell.
keyword_pattern = r'Secretario/a|Secretaria/o'

# Boolean row selector: True where the Spanish text contains either spelling
# (case-insensitive; missing values count as no match).
matches_inclusive_form = df['Spanish (es)'].str.contains(keyword_pattern, case=False, na=False)
filtered_df = df.loc[matches_inclusive_form]

# Write only the matching rows to their own CSV (without the index column).
output_filtered_file = '/content/secretario_filtered_sentences.csv'
filtered_df.to_csv(output_filtered_file, index=False, encoding='utf-8')

print(f"New CSV file created at: {output_filtered_file}")
print(f"Number of rows in the new CSV: {len(filtered_df)}")

# Preview of the rows that were kept
display(filtered_df.head())

New CSV file created at: /content/secretario_filtered_sentences.csv
Number of rows in the new CSV: 82


Unnamed: 0,Basque (eu),Spanish (es)
39,"Idazkariak prestatu behar du ziurtagiria, eta ...",Este certificado deberá ser elaborado por el S...
41,Batzordearen bilerarako deia batzordeko buruak...,La convocatoria del Comité corresponderá al Pr...
50,Erabakitzeke dauden gaiei buruzko informazioa ...,El/La Secretario/a informará sobre los asuntos...
72,Gobernu Batzordeko Batzorde Iraunkorrak bilera...,La celebración de una Comisión Permanente de l...
90,"Helburua: Beintza-Labaien, Donamaria, Oiz eta ...",Es objeto de la presente convocatoria la contr...


In [None]:
#@title Nettoyage du fichier CSV en enlevant "Secretario/a" et "Secretaria/o"

import pandas as pd

# Reload the extraction output untouched.
df_original = pd.read_csv('/content/idazkaria_sentences.csv')

# The two slash-marked spellings to remove from the data set.
keyword_pattern = r'Secretario/a|Secretaria/o'

# True for each row whose Spanish text contains either spelling
# (case-insensitive; missing values count as no match).
mask = df_original['Spanish (es)'].str.contains(keyword_pattern, case=False, na=False)

# Row labels of the matching sentences, as a plain Python list for printing.
indices_to_exclude = df_original.index[mask].tolist()

print(f"Indices of sentences containing '{keyword_pattern}':")
print(indices_to_exclude)
print(f"Total number of lines with keywords: {len(indices_to_exclude)}")

# Everything that did NOT match is kept.
df_without_keywords = df_original.loc[~mask]

# Save the remaining rows to a new CSV (without the index column).
# NOTE(review): the polars cells further down read '/content/cleaned sentences.csv',
# not this file — confirm how the two relate.
output_excluded_file = '/content/sentences_without_secretario.csv'
df_without_keywords.to_csv(output_excluded_file, index=False, encoding='utf-8')

print(f"\nNew CSV file created at: {output_excluded_file}")
print(f"Number of rows in the new CSV: {len(df_without_keywords)}")

# Preview of the rows that remain
print("\nFirst 5 examples from the new CSV (without keywords):")
display(df_without_keywords.head())

Indices of sentences containing 'Secretario/a|Secretaria/o':
[39, 41, 50, 72, 90, 108, 124, 130, 138, 185, 190, 202, 205, 217, 258, 259, 262, 298, 330, 339, 342, 356, 440, 453, 503, 512, 526, 541, 551, 580, 590, 616, 618, 623, 645, 670, 694, 706, 708, 717, 727, 734, 742, 753, 782, 838, 841, 847, 892, 907, 930, 931, 963, 984, 987, 1030, 1052, 1075, 1080, 1090, 1101, 1116, 1134, 1160, 1165, 1188, 1193, 1216, 1232, 1238, 1261, 1269, 1278, 1287, 1297, 1301, 1319, 1326, 1329, 1355, 1356, 1375]
Total number of lines with keywords: 82

New CSV file created at: /content/sentences_without_secretario.csv
Number of rows in the new CSV: 1294

First 5 examples from the new CSV (without keywords):


Unnamed: 0,Basque (eu),Spanish (es)
0,David Saldoni Generalitateko Garraio eta Mugik...,También han sido elegidos vicepresidentes la p...
1,Aita Menniren Bilboko Neurorrehabilitazio Zent...,DKV Seguros ha entregado 10.000 euros a fisiot...
2,Bizkaiko Liga Federatuetako Azken Faseak eta B...,"La solicitud para organizar Fases Federación, ..."
3,"Estatuko Berdintasunerako Idazkariak, Soledad ...","La secretaria de Estado para la Igualdad, Sole..."
4,"Baina, herri txiki askotan bake epailea udalet...","Pero en muchos pueblos pequeños, el juez de pa..."


In [None]:
#@title Nombre d'occurence de "secretaria" dans cleaned_sentences.csv en utilisant polars

import polars as pl

# NOTE(review): no cell visible in this notebook writes 'cleaned sentences.csv'
# (the exclusion cell saves 'sentences_without_secretario.csv') — confirm where
# this file comes from before relying on the count.
df = pl.read_csv('/content/cleaned sentences.csv')

keyword_pattern = r'secretaria'  # lowercase pattern: the column is lowercased before matching

# Lowercase the Spanish text, then look for the pattern anywhere in it (regex
# search). Being a substring match, it also hits longer words that contain the
# pattern, e.g. the plural "secretarias".
spanish_lower = pl.col('Spanish (es)').str.to_lowercase()
has_keyword = spanish_lower.str.contains(keyword_pattern, literal=False)

# Number of rows (not occurrences) whose Spanish text matches
count = df.filter(has_keyword).height

print(count)

377


In [None]:
#@title Nombre d'occurence de "secretario" dans cleaned_sentences.csv en utilisant polars

import polars as pl

# NOTE(review): no cell visible in this notebook writes 'cleaned sentences.csv'
# (the exclusion cell saves 'sentences_without_secretario.csv') — confirm where
# this file comes from before relying on the count.
df = pl.read_csv('/content/cleaned sentences.csv')

keyword_pattern = r'secretario'  # lowercase pattern: the column is lowercased before matching

# Lowercase the Spanish text, then look for the pattern anywhere in it (regex
# search). Being a substring match, it also hits longer words that contain the
# pattern, e.g. the plural "secretarios".
spanish_lower = pl.col('Spanish (es)').str.to_lowercase()
has_keyword = spanish_lower.str.contains(keyword_pattern, literal=False)

# Number of rows (not occurrences) whose Spanish text matches
count = df.filter(has_keyword).height

print(count)

702


# Ratio :
M : 702 <br>
F : 377

**Hypothèse** : le masculin est utilisé surtout pour des postes plus institutionnels (secretario general, secretario de Estado, etc.), ce qui est peut-être sur-représenté dans notre corpus.

**Hypothèse corollaire** : si le contexte est institutionnel, administratif ou juridique, le modèle aura peut-être davantage tendance à traduire au masculin.

Pour vérifier cette hypothèse, il faudrait faire des mesures sur un gros corpus en espagnol.

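### Question : quid des pluriels (secretarios, secretarias) ?

Remarque : les comptages ci-dessus reposent sur une recherche de sous-chaîne (`secretario`, `secretaria`) ; ils incluent donc déjà les pluriels, ainsi que tout mot plus long contenant ces chaînes. Pour compter singuliers et pluriels séparément, il faudrait une expression régulière avec limites de mot (`\b`).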