# Translation of prompts

In [1]:
import sqlite3
import pandas as pd
import os


# Location of the SQLite database, relative to the notebook's working directory.
db_path = "../../../giicg.db"

# Check for the file up front: sqlite3.connect() creates a new, empty database
# when the path does not exist, which would only surface later as a missing table.
if os.path.exists(db_path):
    conn = sqlite3.connect(database=db_path)
else:
    raise FileNotFoundError(f"Database file does not exist: {db_path}")

# Pull the complete `filtered_prompts` table into a DataFrame and display it.
prompts = pd.read_sql(sql="SELECT * FROM filtered_prompts", con=conn)
prompts

Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id
0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6
1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6
3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6
4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6
...,...,...,...,...,...,...,...,...,...
750,1646,82,user,"def run_query(query, n_results):\n query_em...",this is my code. I want to: Get nodes and edge...,"def run_query(query, n_results):\n query_em...",,Man (cisgender),92
751,1845,37,user,\n nun möchte ich judgement balancing m...,\n nun möchte ich judgement balancing m...,,,Woman (cisgender),29
752,1847,37,user,\n ich sehe keine veränderung im Plot. Was ...,\n ich sehe keine veränderung im Plot. Was ...,,,Woman (cisgender),29
753,1849,2,user,\n I am working on the problem of reconstru...,\n I am working on the problem of reconstru...,,Classic CV - Drone navigation\nIf you ever tho...,Man (cisgender),8


## Annotate with language
https://huggingface.co/papluca/xlm-roberta-base-language-detection

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch


def load_language_detection_model():
    """Load the tokenizer and classifier used for language detection.

    Both objects are created from the
    ``papluca/xlm-roberta-base-language-detection`` checkpoint.

    Returns:
        tuple: ``(tokenizer, model)``, in that order.
    """
    checkpoint = "papluca/xlm-roberta-base-language-detection"
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForSequenceClassification.from_pretrained(checkpoint),
    )

def detect_language(text: str, tokenizer, model) -> str:
    """Predict the language label of ``text`` with a sequence classifier.

    Args:
        text: The text to classify.
        tokenizer: Callable that turns ``text`` into the keyword inputs
            expected by ``model`` (called with ``return_tensors="pt"`` and
            ``truncation=True``).
        model: Classifier whose output exposes ``.logits`` and whose
            ``config.id2label`` maps class indices to labels.

    Returns:
        The label of the highest-probability class, or ``"unknown"`` when
        ``text`` is not a string or is empty / whitespace-only.
    """
    has_content = isinstance(text, str) and text.strip() != ""
    if not has_content:
        return "unknown"

    encoded = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():  # inference only, no gradients needed
        scores = model(**encoded).logits

    # Index of the most probable class, then its label from the model config.
    best_class = scores.softmax(dim=-1).argmax(dim=1).item()
    return model.config.id2label[best_class]

# Load the detector once, then label every prompt's conversational text.
tokenizer, lang_model = load_language_detection_model()
prompts['language'] = prompts['conversational'].apply(
    lambda text: detect_language(text, tokenizer, lang_model)
)
prompts

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language
0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en
1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en
3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en
4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en
...,...,...,...,...,...,...,...,...,...,...
750,1646,82,user,"def run_query(query, n_results):\n query_em...",this is my code. I want to: Get nodes and edge...,"def run_query(query, n_results):\n query_em...",,Man (cisgender),92,en
751,1845,37,user,\n nun möchte ich judgement balancing m...,\n nun möchte ich judgement balancing m...,,,Woman (cisgender),29,de
752,1847,37,user,\n ich sehe keine veränderung im Plot. Was ...,\n ich sehe keine veränderung im Plot. Was ...,,,Woman (cisgender),29,de
753,1849,2,user,\n I am working on the problem of reconstru...,\n I am working on the problem of reconstru...,,Classic CV - Drone navigation\nIf you ever tho...,Man (cisgender),8,en


## Translation
https://huggingface.co/Helsinki-NLP/opus-mt-de-en
https://huggingface.co/Helsinki-NLP/opus-mt-it-en

In [3]:
from transformers import AutoModelForSeq2SeqLM

def load_model(model_name):
    """Load the tokenizer and seq2seq model for ``model_name``.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        tuple: ``(tokenizer, model)``, in that order.
    """
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForSeq2SeqLM.from_pretrained(model_name),
    )

def translate(text, tokenizer, model):
    """Translate ``text`` with a seq2seq model and return the decoded string.

    Args:
        text: Source-language text to translate.
        tokenizer: Tokenizer providing ``encode`` and ``decode``.
        model: Model providing ``generate``.

    Returns:
        The first generated sequence, decoded with special tokens removed.
        If ``text`` is not a string or is empty / whitespace-only it is
        returned unchanged and the model is not called.
    """
    # Same guard as detect_language: there is nothing to translate in a
    # missing or blank value, so do not run the model on it.
    if not isinstance(text, str) or text.strip() == "":
        return text

    # truncation=True caps the input at the tokenizer's maximum length, as
    # detect_language already does. Without it an over-long prompt is handed
    # to the model at full length. Text beyond the limit is NOT translated.
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        add_special_tokens=True,
        truncation=True,
    )
    output_ids = model.generate(input_ids)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

def conditional_translation(row, de_tokenizer, de_model, it_tokenizer, it_model):
    """Translate a row's conversational text if its language is ``de`` or ``it``.

    Args:
        row: Mapping-like row with ``'language'`` and ``'conversational'`` entries.
        de_tokenizer: Tokenizer used for rows labelled ``'de'``.
        de_model: Model used for rows labelled ``'de'``.
        it_tokenizer: Tokenizer used for rows labelled ``'it'``.
        it_model: Model used for rows labelled ``'it'``.

    Returns:
        The result of ``translate`` for ``'de'`` / ``'it'`` rows; for any other
        language the ``'conversational'`` value is returned untouched.
    """
    language = row['language']
    if language == 'de':
        translator = (de_tokenizer, de_model)
    elif language == 'it':
        translator = (it_tokenizer, it_model)
    else:
        # No translator for this language: pass the text through as-is.
        return row['conversational']
    return translate(row['conversational'], *translator)


# One translation model per source language (German -> English, Italian -> English).
de_tokenizer, de_model = load_model("Helsinki-NLP/opus-mt-de-en")
it_tokenizer, it_model = load_model("Helsinki-NLP/opus-mt-it-en")

# Row-wise: 'de' / 'it' prompts are replaced by their translation, all others
# are kept as they are. NOTE: this overwrites `conversational` in place while
# `language` keeps the detected source language, so re-running this cell would
# send already-translated text through the translators again.
prompts['conversational'] = prompts.apply(
    conditional_translation,
    axis=1,
    args=(de_tokenizer, de_model, it_tokenizer, it_model),
)

prompts



Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language
0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en
1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en
3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en
4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en
...,...,...,...,...,...,...,...,...,...,...
750,1646,82,user,"def run_query(query, n_results):\n query_em...",this is my code. I want to: Get nodes and edge...,"def run_query(query, n_results):\n query_em...",,Man (cisgender),92,en
751,1845,37,user,\n nun möchte ich judgement balancing m...,Now I want to bring judgement balancing into t...,,,Woman (cisgender),29,de
752,1847,37,user,\n ich sehe keine veränderung im Plot. Was ...,I don't see any change in the plot.,,,Woman (cisgender),29,de
753,1849,2,user,\n I am working on the problem of reconstru...,\n I am working on the problem of reconstru...,,Classic CV - Drone navigation\nIf you ever tho...,Man (cisgender),8,en


## Write back to database

In [4]:
# Store the annotated frame as the `translated_prompts` table, replacing any
# existing table of that name, then release the database connection.
prompts.to_sql(name='translated_prompts', con=conn, index=False, if_exists='replace')
conn.close()