## Import Libraries

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import pandas as pd

from tqdm import tqdm

## Read Data 

In [4]:
# Load the source sentences; the path is resolved relative to the current working directory.
data = pd.read_csv("../Data/normal_dataset.csv")
# Bare last expression so the notebook renders a preview of the frame.
data

Unnamed: 0,Text
0,That's really convenient. I should consider ge...
1,Hey do you have any favorite pet-related chari...
2,I make it a point to network with professional...
3,Hi I make it a habit to read industry-specific...
4,Hello plea bargaining can be seen as a practic...
...,...
45094,It sounds cool . The rhythms look really diffi...
45095,How about the cinema ?
45096,"In some way , she is more modern ."
45097,What's the house rent ? When is the rent due ?...


In [5]:
# Inspect the raw sentences of the 'Text' column as a NumPy array.
data['Text'].values

array(["That's really convenient. I should consider getting a smartwatch too.",
       'Hey do you have any favorite pet-related charities or organizations?',
       'I make it a point to network with professionals in our field and engage in discussions on industry forums. It helps me stay current.',
       ..., 'In some way , she is more modern .',
       "What's the house rent ? When is the rent due ? And how much security deposit do you require ?",
       "What's the weather like in your country ? I suppose it must be warmer than here ."],
      dtype=object)

In [6]:
# Disable column-width truncation so long sentences are displayed in full.
pd.set_option('display.max_colwidth', None)

In [7]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45099 entries, 0 to 45098
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    45099 non-null  object
dtypes: object(1)
memory usage: 352.5+ KB


In [8]:
# NOTE(review): identical to the data.info() call in the previous cell — looks redundant;
# confirm before removing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45099 entries, 0 to 45098
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    45099 non-null  object
dtypes: object(1)
memory usage: 352.5+ KB


In [9]:
# Preview the first five rows with the full, untruncated sentence text.
data.head()

Unnamed: 0,Text
0,That's really convenient. I should consider getting a smartwatch too.
1,Hey do you have any favorite pet-related charities or organizations?
2,I make it a point to network with professionals in our field and engage in discussions on industry forums. It helps me stay current.
3,Hi I make it a habit to read industry-specific publications and research papers. They provide in-depth analysis and keep me informed.
4,Hello plea bargaining can be seen as a practical approach to resolving cases efficiently and reducing the caseload of the courts.


# NOTE(review): this cell carries no execution count and repeats the tokenizer/model
# loading that is performed again further down (same import, same checkpoint name) —
# confirm whether it is still needed.
from transformers import MarianMTModel, MarianTokenizer

# Checkpoint name handed to from_pretrained for both the tokenizer and the model.
model_name = "Helsinki-NLP/opus-mt-tc-big-en-tr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Define the translation function
def translate_text(text):
    """Translate a single string with the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: The source sentence to translate.

    Returns:
        The decoded translation with special tokens stripped.
    """
    # truncation=True caps over-long inputs at the tokenizer's maximum length, which is
    # what batch_translate_text already does. Sending the tensors to model.device keeps
    # the call working when the model has been moved off the CPU.
    inputs = tokenizer(
        [text], return_tensors="pt", padding=True, truncation=True
    ).to(model.device)
    translated = model.generate(**inputs)
    # Only one sentence was submitted, so the first (and only) sequence is the result.
    return tokenizer.decode(translated[0], skip_special_tokens=True)

# Apply the translation to every row of the DataFrame and store the result in a new column.
# Each row triggers its own model.generate call through translate_text.
data['Translated_Text'] = data['Text'].apply(translate_text)

# Show the results
print(data)


In [10]:
from transformers import MarianMTModel, MarianTokenizer

# Load the tokenizer and model from the same checkpoint (en -> tr, going by its name).
model_name = "Helsinki-NLP/opus-mt-tc-big-en-tr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Use a GPU when PyTorch can see one and fall back to the CPU otherwise, instead of
# pinning the run to the CPU unconditionally.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Kept as the last expression so the cell still renders the model summary.
model.to(device)



MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(57060, 1024, padding_idx=57059)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(57060, 1024, padding_idx=57059)
      (embed_positions): MarianSinusoidalPositionalEmbedding(1024, 1024)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1

In [11]:
def batch_translate_text(texts, batch_size=4):
    """Translate a sequence of strings in fixed-size batches.

    Uses the notebook-level ``tokenizer`` and ``model``; inputs are sent to whatever
    device the model currently lives on.

    Args:
        texts: Sequence of source strings (a list, or anything ``list()`` can consume).
        batch_size: Number of strings per ``model.generate`` call; must be positive.

    Returns:
        A list of translated strings, in the same order as ``texts``.

    Raises:
        ValueError: If ``batch_size`` is zero or negative.
    """
    # A negative step would make range() empty and silently return no translations.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Materialise once so slicing yields plain lists, which the tokenizer expects.
    texts = list(texts)
    # Follow the model rather than a separate global, so the two can never disagree.
    target_device = model.device

    translated_texts = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Translating Process"):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(
            batch_texts, return_tensors="pt", padding=True, truncation=True
        ).to(target_device)
        translated = model.generate(**inputs)
        translated_texts.extend(
            tokenizer.batch_decode(translated, skip_special_tokens=True)
        )

    # Release cached GPU blocks once at the end; doing it after every batch only
    # slows the loop down, and it is irrelevant when running on the CPU.
    if target_device.type == "cuda":
        torch.cuda.empty_cache()
    return translated_texts

# Translate the whole 'Text' column. The progress bar recorded for this cell shows
# 11275 batches taking 6:44:37, so expect a multi-hour run on a comparable setup.
texts = data['Text'].tolist()
translated_texts = batch_translate_text(texts)

# Store the translations next to their source sentences.
data['Translated_Text'] = translated_texts

Translating Process: 100%|████████████████████████████████████████████████████████████| 11275/11275 [6:44:37<00:00,  2.15s/it]


In [12]:
# Write the frame (source text plus translations) to an Excel workbook without the index.
data.to_excel("../Data/temp.xlsx", index=False)

In [None]:
# Preview the first rows.
# NOTE(review): this is only rendered if it is the last expression of its cell — confirm
# the cell boundaries.
data.head()

# NOTE(review): the output name has no file extension and says "agoraphobia" although the
# input read above was normal_dataset.csv — confirm this is the intended target.
data.to_csv("agoraphobia_translated_to_tr", index=False)

# Read the file that was just written back into a separate frame.
df = pd.read_csv("agoraphobia_translated_to_tr")