In [3]:
import pandas as pd
import pickle

# Function to load a pickle file and return its contents
def load_and_inspect(file_path):
    """Read the pickle file at ``file_path`` and return the object it contains.

    Args:
        file_path: Path of the pickle file; it is opened in binary read mode.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.
    """
    # NOTE: unpickling can execute arbitrary code, so only open trusted files.
    with open(file_path, 'rb') as stream:
        return pickle.load(stream)

# Load and inspect the data files
train_infor_his = load_and_inspect('./train/train_infor_his')
val_id_history = load_and_inspect('./val/val_id_history')
val_interaction = load_and_inspect('./val/val_interaction')
# Fixed: this used to re-load './train/train_infor_his', so the
# "train_interaction" preview below silently showed the history data.
train_interaction = load_and_inspect('./train/train_interaction')
news_information = load_and_inspect('./news_infor')
test_information_history = load_and_inspect('./test/test_infor_his')
test_news_information = load_and_inspect('./test/test_news_infor')
test_interaction = load_and_inspect('./test/test_interaction')

# Preview the first few entries of each object. The dict-like objects are shown
# as (key, value) pairs; the interaction objects are shown as plain items.
print("news_information:", list(news_information.items())[:10])
print("train_infor_his:", list(train_infor_his.items())[:5])
# The container type of the real train_interaction file is not shown in this
# notebook (the val/test interaction files are previewed as plain sequences),
# so accept either a mapping or a plain iterable here.
train_interaction_entries = (
    train_interaction.items()
    if hasattr(train_interaction, 'items')
    else train_interaction
)
print("train_interaction:", list(train_interaction_entries)[:5])
print("val_id_history:", list(val_id_history.items())[:10])
print("val_interaction:", list(val_interaction)[:10])
print("test_information_history:", list(test_information_history.items())[:5])
print("test_news_information:", list(test_news_information.items())[:5])
print("test_interaction:", list(test_interaction)[:5])



news_information: [('3037230', "Ishockey-spiller: Jeg troede jeg skulle dø ISHOCKEY: Ishockey-spilleren Sebastian Harts håber stadig, at karrieren kan fortsætte i USA, efter han fik maven skåret op på diskotek Ambitionerne om at komme til USA og spille ishockey har 21-årige Sebastian Harts stadig.\nMen den tidligere Rødovre-spillere var tæt på ikke kun at miste sine sportslige ambitioner, men også livet, da han var på det forkerte sted på det forkerte tidspunkt.\nStedet var Club Mirage ved Boltens Gård i København, og tidspunktet var ved to-tiden natten til søndag 3. august.\nSammen med kæresten gennem de seneste fem år, Marie, og hendes veninde var de netop ankommet til diskoteket, da Sebastian Harts stødte ind i en person, som han ikke kendte.\n- Han gik ind i mig. Vi kom i diskussion, og jeg sagde o.k., da han foreslog, vi kunne ordne 'det' udenfor. Men vi kom på talefod igen, og han gav mig hånden og gik, husker Sebastian Harts, som i dag er hjemme og efter omstændighederne har det

In [5]:
import pandas as pd
import pickle
import os
from transformers import MarianMTModel, MarianTokenizer

# Function to load a pickle file and return its contents
def load_and_inspect(file_path):
    """Return the object stored in the pickle file at ``file_path``.

    Args:
        file_path: Location of the pickle file, opened in binary read mode.

    Returns:
        The unpickled object.
    """
    # NOTE: pickle.load can run arbitrary code; use it on trusted files only.
    with open(file_path, 'rb') as source:
        loaded = pickle.load(source)
        return loaded

# Function to save the translated data
def save_translated_data(data, file_path):
    """Pickle ``data`` into the file at ``file_path``.

    Args:
        data: Any picklable object.
        file_path: Destination path; opened in binary write mode, so an
            existing file at that path is overwritten.
    """
    with open(file_path, 'wb') as sink:
        pickle.dump(data, sink)

# Function to translate text
def translate_text(texts, model, tokenizer, batch_size=32):
    """Translate a sequence of strings in batches and return the translations.

    Args:
        texts: Sequence of source strings; must support ``len()`` and slicing.
        model: Object exposing ``generate(**encoded_inputs)`` (in this notebook
            a ``MarianMTModel``).
        tokenizer: Callable that encodes a list of strings and exposes
            ``decode(ids, skip_special_tokens=True)`` (here a ``MarianTokenizer``).
        batch_size: Number of texts encoded and generated per model call.

    Returns:
        A list with one translated string per input, in input order. An empty
        input yields an empty list without calling the model.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    translated = []
    for start in range(0, len(texts), batch_size):
        batch = list(texts[start:start + batch_size])
        # truncation=True caps every input at the tokenizer's maximum length.
        # Without it an over-long text (e.g. a full news article) is passed to
        # the model at full length, which can fail during generate().
        # The trade-off: the tail of an over-long text is not translated.
        encoded = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        generated = model.generate(**encoded)
        translated.extend(
            tokenizer.decode(sequence, skip_special_tokens=True)
            for sequence in generated
        )
    return translated

# Initialize the model and tokenizer
model_name = 'Helsinki-NLP/opus-mt-da-en'
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Load the data files
data_files = {
    'train_infor_his': './train/train_infor_his',
    'val_id_history': './val/val_id_history',
    'val_interaction': './val/val_interaction',
    'train_interaction': './train/train_interaction',
    'news_information': './news_infor',
    'test_information_history': './test/test_infor_his',
    'test_news_information': './test/test_news_infor',
    'test_interaction': './test/test_interaction'
}

# How each entry of data_files is handled, keyed by its logical name.
# Files whose dictionary values are lists of texts (translated list by list).
history_files = ('train_infor_his', 'test_information_history')
# Files whose dictionary values are single texts (translated as one big batch).
news_files = ('news_information', 'test_news_information')
# Files that need no translation and are re-saved unchanged.
passthrough_files = (
    'train_interaction',
    'val_id_history',
    'val_interaction',
    'test_interaction',
)

# Create the translated folder if it doesn't exist
os.makedirs('./translated', exist_ok=True)

# Process each data file
for name, path in data_files.items():
    data = load_and_inspect(path)
    if name in history_files:
        # Translate dictionary values (list of texts)
        translated_data = {k: translate_text(v, model, tokenizer) for k, v in data.items()}
    elif name in news_files:
        # Translate dictionary values (textual content).
        # All texts are handed to translate_text in a single call so that it can
        # batch them, instead of issuing one model.generate() call per text.
        news_keys = list(data.keys())
        news_texts = list(data.values())
        news_translations = translate_text(news_texts, model, tokenizer)
        # Guard against silently dropping entries when zipping keys to results.
        assert len(news_translations) == len(news_keys), \
            f"expected {len(news_keys)} translations for {name}, got {len(news_translations)}"
        translated_data = dict(zip(news_keys, news_translations))
    elif name in passthrough_files:
        # No translation needed for this format
        translated_data = data
    else:
        raise ValueError(f"Unsupported data type: {type(data)} for file: {name}")

    # Save the translated data
    save_translated_data(translated_data, f'./translated/{name}.pkl')

print("Translation and saving completed.")


KeyboardInterrupt: 