In [1]:
import csv
import torch
from transformers import MarianMTModel, MarianTokenizer
from transformers import BartForConditionalGeneration, BartTokenizer

In [2]:
# Load the two MarianMT translators and the BART summarizer used by the cells below.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Russian -> English translator.
translation_model_name = "Helsinki-NLP/opus-mt-ru-en"
translation_tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
translation_model = MarianMTModel.from_pretrained(translation_model_name)

# English -> Russian translator.
back_translation_model_name = "Helsinki-NLP/opus-mt-en-ru"
back_translation_tokenizer = MarianTokenizer.from_pretrained(back_translation_model_name)
back_translation_model = MarianMTModel.from_pretrained(back_translation_model_name)

# English summarizer.
summarization_model_name = "facebook/bart-large-cnn"
summarization_tokenizer = BartTokenizer.from_pretrained(summarization_model_name)
summarization_model = BartForConditionalGeneration.from_pretrained(summarization_model_name)

# Only the summarizer is moved to `device`; the translation models are not moved here.
# Kept as the final expression so the cell still displays the model repr.
summarization_model.to(device)




BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
    

In [3]:
def translate_to_english(text):
    """Translate ``text`` using the module-level ``translation_model``.

    The input is tokenized with padding, truncated to at most 512 tokens, and
    passed to ``generate``. Only the first generated sequence is decoded and
    returned, with special tokens stripped.
    """
    encoded = translation_tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    generated = translation_model.generate(**encoded)
    first_sequence = generated[0]
    return translation_tokenizer.decode(first_sequence, skip_special_tokens=True)

def translate_to_russian(text):
    """Translate ``text`` using the module-level ``back_translation_model``.

    The input is tokenized with padding, truncated to at most 512 tokens, and
    passed to ``generate``. Only the first generated sequence is decoded and
    returned, with special tokens stripped.
    """
    encoded = back_translation_tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    generated = back_translation_model.generate(**encoded)
    first_sequence = generated[0]
    return back_translation_tokenizer.decode(first_sequence, skip_special_tokens=True)

def summarize_text(text):
    """Summarize ``text`` using the module-level ``summarization_model``.

    The input is truncated to at most 512 tokens and its ``input_ids`` are
    moved to ``device`` before generation. Generation uses 4-beam search with
    a length between 40 and 300 tokens. The first generated sequence is
    decoded and returned without special tokens.
    """
    encoded = summarization_tokenizer(
        text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    input_ids = encoded['input_ids'].to(device)
    generation_options = dict(
        max_length=300,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    summary_ids = summarization_model.generate(input_ids, **generation_options)
    best_summary = summary_ids[0]
    return summarization_tokenizer.decode(best_summary, skip_special_tokens=True)


In [5]:
def process_file(input_file, output_file):
    """Summarize every not-yet-processed row of ``input_file`` into ``output_file``.

    ``input_file`` is a CSV with ``id`` and ``text`` columns. For each row the
    text is cut to 512 characters, translated to English, summarized, and the
    summary is translated back to Russian. Results are appended to
    ``output_file`` as ``id,summarized_text`` rows.

    The function is resumable: ids already present in ``output_file`` are
    skipped, and every written row is flushed immediately so an interrupted
    run loses at most the row that was in flight.

    Args:
        input_file: Path to the source CSV (UTF-8).
        output_file: Path to the result CSV (UTF-8); created if missing.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
        KeyError: If a row lacks the ``id`` or ``text`` column.
    """
    # Collect the ids that a previous run already wrote.
    existing_ids = set()
    try:
        # newline='' lets the csv module handle newlines inside quoted fields.
        with open(output_file, 'r', newline='', encoding='utf-8') as done_file:
            for row in csv.DictReader(done_file):
                existing_ids.add(row['id'])
    except FileNotFoundError:
        pass  # First run: nothing has been processed yet.

    # Count the records that still need work. Counting parsed CSV records
    # (not physical lines) stays correct when a text field spans several
    # lines, and excluding skipped ids makes "Remaining" reach 0 on resume.
    # The `with` block closes the handle used for counting.
    with open(input_file, 'r', newline='', encoding='utf-8') as count_file:
        total_pending = sum(
            1 for row in csv.DictReader(count_file) if row['id'] not in existing_ids
        )

    processed_count = 0

    with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
            open(output_file, 'a', newline='', encoding='utf-8') as outfile:
        reader = csv.DictReader(infile)
        writer = csv.DictWriter(outfile, fieldnames=['id', 'summarized_text'])

        # Write the header only when the output file is empty.
        if outfile.tell() == 0:
            writer.writeheader()

        for row in reader:
            if row['id'] in existing_ids:
                print(f"Skipping id: {row['id']} (already processed)")
                continue

            # Drop surrounding quote characters and cut the text to 512 characters.
            text = row['text'].strip('"')[:512]

            print(f"Processing id: {row['id']} ({processed_count + 1}/{total_pending})...")

            # Russian -> English -> summary -> Russian.
            english_text = translate_to_english(text)
            summarized_english_text = summarize_text(english_text)
            summarized_russian_text = translate_to_russian(summarized_english_text)

            writer.writerow({
                'id': row['id'],
                'summarized_text': summarized_russian_text
            })
            # Persist the row now so an interrupted run can resume from here.
            outfile.flush()

            processed_count += 1
            print(f"Completed id: {row['id']} ({processed_count}/{total_pending}). Remaining: {total_pending - processed_count}")

# Source transcriptions and the CSV that receives the summaries; run the pipeline.
input_file, output_file = 'video_transcriptions.csv', 'summarized_video_transcriptions.csv'
process_file(input_file=input_file, output_file=output_file)


Skipping id: 2f6be0a89ed03d5ad2a91165d728322e (already processed)
Skipping id: 0ac1817349ccfc9cd7252a95e22eb087 (already processed)
Skipping id: 2f4bd0532525895de06ac9411287cc33 (already processed)
Skipping id: 1e0a5151efc26a3a8e038e132f6b80f4 (already processed)
Skipping id: 0b73cea7b2dfdd7048dd84b95f8b9b0e (already processed)
Skipping id: 4a6d6e98413508867dc1adafef8c5e46 (already processed)
Skipping id: 0db157b2a0b65848184bf65e3cb2a57d (already processed)
Skipping id: 1a291fbe33c55a983b08f5fa60f71710 (already processed)
Skipping id: 4ae45fdfd387f5a00225798709fe0337 (already processed)
Skipping id: 2e4c7dae0c99f47839880e2da7fc6769 (already processed)
Skipping id: 3b69f98d51c1028633cff24c7d2937e0 (already processed)
Skipping id: 1c78f72bd8c56515486ad6a1eb464d7d (already processed)
Skipping id: 0e22ec9e51bd98309b4fe6870c2ce9c8 (already processed)
Skipping id: 2e4eca888ca4bb3fc09f967fcc500eb2 (already processed)
Skipping id: 3bb037f137368bea0b35524aad530e42 (already processed)
Skipping i

In [6]:
import pandas as pd

# Read the generated summaries and the table that carries the tags.
summarized_df = pd.read_csv('summarized_video_transcriptions.csv')
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
filtered_df = pd.read_csv('/home/user1/hak/filtered_train_data_categories_test.csv')

# Join summaries ('id') with the tag table ('video_id'); the default join is inner,
# so rows without a match on either side are dropped.
merged_df = summarized_df.merge(filtered_df, left_on='id', right_on='video_id')

# Keep only the columns written to the result file.
result_df = merged_df.loc[:, ['id', 'summarized_text', 'tags']]

# Write the merged table to a new CSV file without the index column.
result_df.to_csv('merged_data.csv', index=False)

print("Новый CSV-файл успешно создан!")


Новый CSV-файл успешно создан!
