In [1]:
import cv2
from PIL import Image
import torch
import os
import csv
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import MarianMTModel, MarianTokenizer
from transformers import BartForConditionalGeneration, BartTokenizer


In [2]:
# Load BLIP processor and model for image captioning
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Load translation model and tokenizer
translation_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-ru", cache_dir='./cache')
translation_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru", cache_dir='./cache')

# Load summarization model and tokenizer
summarization_model_name = 'facebook/bart-large-cnn'
summarization_model = BartForConditionalGeneration.from_pretrained(summarization_model_name)
summarization_tokenizer = BartTokenizer.from_pretrained(summarization_model_name)

# Determine device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)




BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((768,), eps=1e-0

In [3]:
def extract_frames(video_path, num_frames=16):
    """Extract frames from a video file"""
    vidcap = cv2.VideoCapture(video_path)
    frames = []
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_interval = max(total_frames // num_frames, 1)
    success, image = vidcap.read()
    count = 0
    while success and len(frames) < num_frames:
        if count % frame_interval == 0:
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            frames.append(Image.fromarray(image_rgb))
        success, image = vidcap.read()
        count += 1
    vidcap.release()
    return frames


In [4]:
def generate_captions(frames, processor, model, device):
    """Generate textual descriptions for each frame using BLIP"""
    captions = []
    unique_captions = set()  

    inputs = processor(images=frames, return_tensors="pt", padding=True).to(device)

    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=30, num_beams=5, early_stopping=True)

    for i, output in enumerate(outputs):
        caption = processor.decode(output, skip_special_tokens=True)
        if caption not in unique_captions:
            unique_captions.add(caption)
            captions.append(caption)
            print(f"Frame {i + 1}: Caption - {caption}")
    
    return captions


In [5]:
def summarize_text(text, summarization_tokenizer, summarization_model):
    """Summarize text"""
    inputs = summarization_tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = summarization_model.generate(
        inputs['input_ids'], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True
    )
    summarized_text = summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summarized_text


In [6]:
def translate_to_russian(caption, translation_tokenizer, translation_model):
    """Translate summarized text to Russian"""
    inputs = translation_tokenizer(caption, return_tensors="pt", padding=True)
    translated = translation_model.generate(**inputs)
    translated_caption = translation_tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_caption


In [7]:
def process_videos_and_generate_translations(video_folder, output_csv_path, processor, model, summarization_model, translation_model, device):
    """Process all videos in a folder, generate descriptions, summarize, translate, and write to CSV"""
    video_files = [f for f in os.listdir(video_folder) if f.endswith('.mp4')]

    processed_ids = set()
    if os.path.exists(output_csv_path):
        with open(output_csv_path, mode='r', encoding='utf-8') as output_csv:
            reader = csv.DictReader(output_csv)
            processed_ids = {row['video_id'] for row in reader}

    with open(output_csv_path, mode='a', newline='', encoding='utf-8') as output_csv:
        fieldnames = ['video_id', 'translated_description']
        writer = csv.DictWriter(output_csv, fieldnames=fieldnames)
        
        if not processed_ids:
            writer.writeheader()

        for video_file in video_files:
            video_id = os.path.splitext(video_file)[0]

            if video_id in processed_ids:
                print(f"Skipping already processed video: {video_id}")
                continue

            video_path = os.path.join(video_folder, video_file)
            print(f"Processing video: {video_path}")

            frames = extract_frames(video_path)
            captions = generate_captions(frames, processor, model, device)
            combined_text = ", ".join(captions)
            summarized_text = summarize_text(combined_text, summarization_tokenizer, summarization_model)
            translated_caption = translate_to_russian(summarized_text, translation_tokenizer, translation_model)

            writer.writerow({'video_id': video_id, 'translated_description': translated_caption})


In [8]:
video_folder = "/home/user1/hak/video/videos_2"
output_csv_path = "video_captions_translations.csv"

process_videos_and_generate_translations(
    video_folder=video_folder, 
    output_csv_path=output_csv_path, 
    processor=processor, 
    model=model, 
    summarization_model=summarization_model, 
    translation_model=translation_model, 
    device=device
)


Skipping already processed video: 76ec80b7a2aafcf5a33b61cc9c298cb2
Skipping already processed video: bfb52b8a50a2034704dd08473c17ec6b
Skipping already processed video: 1371a8d3b78c2e724e4a10064e0c14b1
Skipping already processed video: 4dddf7e1d2686d40ecc9b4c8a7a2b1c2
Skipping already processed video: dc64cde6c0c15ba71ce648e778e9490f
Skipping already processed video: fdab98598407b22bdf49891fe3c02a60
Skipping already processed video: 975f32a24cb025fdbb39c71a9bce4d6f
Skipping already processed video: c02e68f64c25f55afb78ba5f7c8b66bc
Skipping already processed video: a0fc183f8d20199569310a7b1e34208e
Skipping already processed video: 72ad592130deb113cae8478fe8b74e78
Skipping already processed video: f9f04e67e34a955e458f93092a9f1efb
Skipping already processed video: ded35ffe65135e06e72a196c950bc595
Skipping already processed video: f4f357cd4342b49c06edae02c307cd37
Skipping already processed video: fa7bdaa6ec0bcb973b7c013e7caaabf7
Skipping already processed video: 05f167f9777b73e552b23fdd0101

In [9]:
import pandas as pd

# Загрузка данных из файлов
summarized_df = pd.read_csv('video_captions_translations.csv')
filtered_df = pd.read_csv('/home/user1/hak/filtered_train_data_categories_test.csv')

# Объединение данных по 'id' и 'video_id'
merged_df = pd.merge(summarized_df, filtered_df, left_on='video_id', right_on='video_id')

# Выбираем только нужные колонки и сохраняем в новый DataFrame
result_df = merged_df[['video_id', 'translated_description', 'tags']]

# Сохранение результата в новый CSV файл
result_df.to_csv('merged_data.csv', index=False)

print("Новый CSV-файл успешно создан!")


Новый CSV-файл успешно создан!
