In [None]:
pip install sacremoses -q

In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# 1. Configuration
input_file = '/content/Programatzailea_sentences.csv'
model_name = "Helsinki-NLP/opus-mt-eu-es"

# 2. Load the CSV file
# Done before the model is fetched so that a wrong path fails immediately
# instead of after the checkpoint has been downloaded and loaded.
# read_csv's default (header=0) treats the first row as column titles; pass
# header=None instead if the file has no title row.
try:
    df = pd.read_csv(input_file)
except FileNotFoundError as err:
    # Raise rather than call exit(): inside a notebook kernel exit() ends the
    # whole session instead of just stopping this cell, and the cells below
    # cannot do anything useful without `df` anyway.
    raise FileNotFoundError(f"Error: File not found at {input_file}") from err

# 3. Set up the device (use the GPU if available for faster processing)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# 4. Load model and tokenizer
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# 5. Define the translation function
def translate_sentence(text):
    """Translate a single cell value using the module-level model and tokenizer.

    Args:
        text: The value to translate. It is converted with ``str()`` before
            tokenization, so non-string scalars are accepted.

    Returns:
        The decoded translation with special tokens removed, or ``""`` when
        ``text`` is missing (per ``pd.isna``) or blank after stripping
        whitespace.
    """
    # Guard clause: nothing to translate for missing or whitespace-only cells.
    is_blank = pd.isna(text) or not str(text).strip()
    if is_blank:
        return ""

    # Encode the sentence, then move the tensors to the model's device.
    encoded = tokenizer(
        str(text),
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    encoded = encoded.to(device)

    # Beam-search generation settings.
    generation_options = {
        "max_length": 128,
        "num_beams": 4,
        "early_stopping": True,
    }
    output_ids = model.generate(**encoded, **generation_options)

    # Only one sentence was submitted, so the first sequence is the result.
    best_sequence = output_ids[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

# 6. Process the data
print("Translating sentences from the first column...")

# Extract text from the first column (index 0)
basque_sentences = df.iloc[:, 0]
spanish_translations = []

# Translate row by row, in file order, so the results line up with df's rows.
total_rows = len(basque_sentences)
for row_number, sentence in enumerate(basque_sentences, start=1):
    spanish_translations.append(translate_sentence(sentence))

    # Print progress every 10 rows
    if row_number % 10 == 0:
        print(f"Processed {row_number}/{total_rows}")

# 7. Write the translations to the third column (position 2)
# A one-column file first gets a blank filler second column, so that the
# translations end up in position 2 either way.
while df.shape[1] < 2:
    df[f'Empty_Col_{df.shape[1]}'] = ""

if df.shape[1] < 3:
    # Exactly two columns: add the translations as a new third column.
    df.insert(2, "Spanish_Translation", spanish_translations)
else:
    # A third column already exists (e.g. from a previous run): replace it.
    # Assigning by label swaps in a brand-new column. `df.iloc[:, 2] = ...`
    # instead writes into the existing column in place, which newer pandas
    # versions warn about (and are slated to reject) when the old column's
    # dtype cannot hold strings, e.g. an all-empty column parsed as float64.
    df[df.columns[2]] = spanish_translations

# 8. Save the file
# This overwrites the input file in place. index=False prevents adding a new
# index column.
df.to_csv(input_file, index=False)

print(f"Translation complete. Updated file saved to: {input_file}")

Using device: cuda
Loading model...


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/825k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/834k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Translating sentences from the first column...
Processed 10/160
Processed 20/160
Processed 30/160
Processed 40/160
Processed 50/160
Processed 60/160
Processed 70/160
Processed 80/160
Processed 90/160
Processed 100/160
Processed 110/160
Processed 120/160
Processed 130/160
Processed 140/160
Processed 150/160
Processed 160/160
Translation complete. Updated file saved to: /content/Programatzailea_sentences.csv


In [3]:
import pandas as pd
import re

file_path = '/content/Programatzailea_sentences.csv'

# Whole-word patterns. The trailing \b keeps "programador" from matching
# inside "programadora" or "programadores".
pat_masc = r'\bprogramador\b'
pat_fem = r'\bprogramadora\b'


def count_word_occurrences(column, pattern):
    """Count case-insensitive regex matches of ``pattern`` over a whole column.

    Args:
        column: A pandas Series. Every cell is converted to text and
            lowercased before matching, so non-string cells are accepted and
            missing cells contribute no matches for these word patterns.
        pattern: A regular expression written in lowercase.

    Returns:
        The total number of matches across all cells, as a plain ``int``.
    """
    normalized = column.astype(str).str.lower()
    return int(normalized.str.count(pattern).sum())


# Only the file read is guarded: a missing file is an expected condition with
# a friendly message. Any other failure is left to surface with its full
# traceback instead of being reduced to a one-line message.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
else:
    # The analysis compares the 2nd and 3rd columns, so both must exist.
    if df.shape[1] < 3:
        print("Error: The CSV file must have at least 3 columns.")
    else:
        # --- 2nd Column Counts (index 1) ---
        count_2_masc = count_word_occurrences(df.iloc[:, 1], pat_masc)
        count_2_fem = count_word_occurrences(df.iloc[:, 1], pat_fem)

        # --- 3rd Column Counts (index 2) ---
        count_3_masc = count_word_occurrences(df.iloc[:, 2], pat_masc)
        count_3_fem = count_word_occurrences(df.iloc[:, 2], pat_fem)

        # --- Output Results ---
        print("-" * 50)
        print("ANALYSIS RESULTS")
        print("-" * 50)

        print("SECOND COLUMN:")
        print(f"1. Occurrences of 'programador':  {count_2_masc}")
        print(f"2. Occurrences of 'programadora': {count_2_fem}")
        print("-" * 50)

        print("THIRD COLUMN:")
        print(f"3. Occurrences of 'programador':  {count_3_masc}")
        print(f"4. Occurrences of 'programadora': {count_3_fem}")
        print("-" * 50)

--------------------------------------------------
ANALYSIS RESULTS
--------------------------------------------------
SECOND COLUMN:
1. Occurrences of 'programador':  63
2. Occurrences of 'programadora': 19
--------------------------------------------------
THIRD COLUMN:
3. Occurrences of 'programador':  77
4. Occurrences of 'programadora': 11
--------------------------------------------------
