In [None]:
import csv
import requests
from urllib.parse import urlencode
from tqdm import tqdm

# Public Yandex.Disk share links; links[i] is saved locally as filenames[i].
links = [
    "https://disk.yandex.ru/d/PEOGKbj5qJmGlQ",
    "https://disk.yandex.ru/d/1sqxBbA1hRumDQ",
    "https://disk.yandex.ru/d/Be8jLxVcQZ70lQ",
]

filenames = [
    "labels_json.zip",
    "data_train_short.zip",
    "data_test_short.zip",
]

# (connect, read) timeouts in seconds, so a stalled connection fails
# instead of hanging the cell forever.
REQUEST_TIMEOUT = (10, 60)

# Endpoint that resolves a public share link into a direct download URL.
base_url = 'https://cloud-api.yandex.net/v1/disk/public/resources/download?'

actual_links = []
for link in links:
    final_url = base_url + urlencode(dict(public_key=link))
    response = requests.get(final_url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of a confusing KeyError on 'href'.
    response.raise_for_status()
    actual_links.append(response.json()['href'])

chunk_size = 1024 * 1024

for filename, link in zip(filenames, actual_links):
    # The context manager releases the streamed connection even if writing fails.
    with requests.get(link, stream=True, timeout=REQUEST_TIMEOUT) as response:
        # Without this check an HTML/JSON error page would be saved as the .zip.
        response.raise_for_status()
        file_size = int(response.headers.get("Content-Length", 0))

        with tqdm(total=file_size, unit="B", unit_scale=True, desc="Скачивание") as pbar, \
                open(filename, "wb") as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)
                    pbar.update(len(chunk))

    print(f"Файл '{filename}' успешно скачан.")

Скачивание: 100%|██████████| 4.34k/4.34k [00:00<00:00, 1.83MB/s]


Файл 'labels_json.zip' успешно скачан.


Скачивание: 100%|██████████| 20.0G/20.0G [17:54<00:00, 18.6MB/s]


Файл 'data_train_short.zip' успешно скачан.


Скачивание: 100%|██████████| 12.9G/12.9G [12:35<00:00, 17.1MB/s]

Файл 'data_test_short.zip' успешно скачан.





In [None]:
!unzip /content/labels_json.zip
!unzip /content/data_train_short.zip
!unzip /content/data_test_short.zip

Archive:  /content/labels_json.zip
  inflating: labels_json/test_labels.json  
  inflating: labels_json/train_labels.json  
Archive:  /content/data_train_short.zip
   creating: data_train_short/
   creating: data_train_short/-220020068_456255414/
  inflating: data_train_short/-220020068_456255414/-220020068_456255414.mp4  
   creating: data_train_short/-220020068_456249693/
  inflating: data_train_short/-220020068_456249693/-220020068_456249693.mp4  
   creating: data_train_short/-220020068_456255339/
  inflating: data_train_short/-220020068_456255339/-220020068_456255339.mp4  
   creating: data_train_short/-220020068_456241755/
  inflating: data_train_short/-220020068_456241755/-220020068_456241755.mp4  
   creating: data_train_short/-220020068_456241671/
  inflating: data_train_short/-220020068_456241671/-220020068_456241671.mp4  
   creating: data_train_short/-220020068_456255340/
  inflating: data_train_short/-220020068_456255340/-220020068_456255340.mp4  
   creating: data_train_s

In [None]:
!pip install pytorch-crf
!pip install av

In [11]:
import torch
import torch.nn as nn
from transformers import Wav2Vec2Model, Wav2Vec2Processor
from timm import create_model
from torchcrf import CRF
from torchvision import transforms
from torchvision.io import read_video
from torch.cuda.amp import autocast
from einops import rearrange
import numpy as np
import librosa
from torch.utils.data import DataLoader
from torch.utils.data import Dataset, DataLoader

In [2]:
class IntroDetectionModel(nn.Module):
    """Per-time-step sequence tagger that fuses image and audio features.

    Each time step is described by one image (encoded with a timm ViT) and one
    audio segment (encoded with Wav2Vec2, mean-pooled over time). Both are
    projected to 512 dimensions, concatenated, fused, passed through a
    transformer encoder over the time axis and scored by a linear layer whose
    outputs are used as CRF emissions.

    All sequence tensors are time-major: ``(T, B, ...)``.
    """

    def __init__(self, num_classes=2, vit_name='vit_base_patch16_224', wav2vec_name='facebook/wav2vec2-base'):
        super().__init__()

        # num_classes=0 makes timm return pooled features instead of logits.
        self.vit = create_model(vit_name, pretrained=True, num_classes=0)
        self.vit_fc = nn.Linear(self.vit.num_features, 512)

        self.wav2vec = Wav2Vec2Model.from_pretrained(wav2vec_name)
        self.audio_processor = Wav2Vec2Processor.from_pretrained(wav2vec_name)
        # Read the width from the loaded encoder so non-"base" checkpoints work
        # too (768 for the default checkpoint).
        self.audio_fc = nn.Linear(self.wav2vec.config.hidden_size, 512)

        self.fusion_fc = nn.Linear(512 * 2, 512)

        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=512, nhead=8, dim_feedforward=2048),
            num_layers=2
        )

        self.crf = CRF(num_classes)
        self.classifier = nn.Linear(512, num_classes)

    def forward(self, images, audio_input, attention_mask=None, labels=None):
        """Compute CRF emissions and either the training loss or decoded tags.

        Args:
            images: tensor of shape ``(T, B, C, H, W)``.
            audio_input: tensor of shape ``(T, B, num_samples)``.
            attention_mask: optional mask with the same leading ``(T, B)``
                dimensions as ``audio_input``; ``None`` means no masking.
            labels: optional ``(T, B)`` tag tensor. When given, the negative
                CRF log-likelihood is returned instead of decoded tags.

        Returns:
            ``(loss, emissions)`` when ``labels`` is given, otherwise
            ``(predicted_tags, emissions)``; ``emissions`` is
            ``(T, B, num_classes)``.

        Raises:
            ValueError: if ``images`` is not 5-dimensional.
        """
        if images.dim() != 5:
            raise ValueError(
                f"images must have shape (T, B, C, H, W), got {tuple(images.shape)}"
            )
        T, B = images.shape[:2]

        # reshape (not view): batches produced by transposing a batch-first
        # stack are non-contiguous, and view() raises on them when B > 1.
        images = images.reshape(T * B, *images.shape[2:])
        audio_input = audio_input.reshape(T * B, -1)
        if attention_mask is not None:
            attention_mask = attention_mask.reshape(T * B, -1)

        # The ViT runs without gradient tracking, so its weights are never updated.
        with torch.no_grad():
            image_features = self.vit(images)
        image_features = self.vit_fc(image_features)

        audio_hidden = self.wav2vec(audio_input, attention_mask=attention_mask).last_hidden_state
        audio_features = self.audio_fc(audio_hidden.mean(dim=1))

        # Restore the time-major layout expected by the transformer and the CRF.
        image_features = image_features.reshape(T, B, -1)
        audio_features = audio_features.reshape(T, B, -1)

        fused = torch.cat([image_features, audio_features], dim=-1)
        features = self.fusion_fc(fused)

        time_features = self.transformer(features)

        emissions = self.classifier(time_features)

        if labels is not None:
            # The CRF returns a log-likelihood; negate it to obtain a loss.
            loss = -self.crf(emissions, labels, reduction='mean')
            return loss, emissions

        predicted_tags = self.crf.decode(emissions)
        return predicted_tags, emissions

    def predict(self, images, audio_input, attention_mask=None):
        """Return only the decoded tag sequences for the given inputs."""
        tags, _ = self.forward(images, audio_input, attention_mask)
        return tags

In [3]:
# Build the model with its default backbones; the constructor loads pretrained
# ViT and Wav2Vec2 weights (create_model(pretrained=True) / from_pretrained).
model = IntroDetectionModel()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
def format_params(num):
    """Render a count compactly, e.g. ``187788123 -> "187M 788K"``.

    Values of at least one million are shown as whole millions, followed by
    whole thousands when the remainder reaches 1000. Values of at least one
    thousand are shown as whole thousands. Anything smaller is returned as
    ``str(num)``.
    """
    if num >= 1_000_000:
        millions = int(num / 1_000_000)
        remainder = num % 1_000_000
        if remainder >= 1000:
            return f"{millions}M {remainder // 1000}K"
        return f"{millions}M"

    if num >= 1000:
        return f"{num // 1000}K"

    return str(num)

# Count the elements of every parameter flagged with requires_grad.
trainable_param_count = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print(f"Trainable params: {format_params(trainable_param_count)}")

Trainable params: 187M 788K


In [5]:
def extract_frames_and_audio(video_path, fps=1, start=None, duration=None):
    """Decode a video window into per-second frames and 16 kHz mono audio.

    Args:
        video_path: path to the video file.
        fps: currently unused; one frame is produced per second of the window.
        start: optional window start in seconds.
        duration: optional window length in seconds. When omitted, the number
            of seconds is derived from the decoded frame count and video fps.

    Returns:
        A ``(frames, audio_segments, sample_rate)`` tuple:
        ``frames`` is a float tensor ``(T, 3, 224, 224)``, resized and
        normalised; ``audio_segments`` is a float32 tensor
        ``(T, sample_rate)`` with one second of mono audio per row
        (zero-padded at the end if the decoded audio is too short);
        ``sample_rate`` is always 16000.

    Raises:
        ValueError: if no video frames or no audio samples were decoded.
    """
    kwargs = {}
    if start is not None or duration is not None:
        kwargs['pts_unit'] = 'sec'
        if start is not None:
            kwargs['start_pts'] = start
        if duration is not None:
            kwargs['end_pts'] = start + duration if start is not None else duration

    video_tensor, audio_tensor, info = read_video(video_path, **kwargs)

    video_fps = info.get('video_fps', 25)
    audio_fps = info.get('audio_fps', 48000)
    total_frames = video_tensor.shape[0]

    if total_frames == 0:
        # Otherwise the frame indexing below fails with an opaque index error.
        raise ValueError(
            f"No video frames decoded from {video_path!r} (start={start}, duration={duration})"
        )

    if duration is not None:
        T = max(1, int(duration))
    else:
        # video_fps is typically a float; linspace needs an integer step count.
        T = max(1, int(total_frames // video_fps))

    # Pick T frames spread evenly over the decoded window.
    selected_indices = torch.linspace(0, total_frames - 1, T).long()

    frames = video_tensor[selected_indices].float() / 255.0
    frames = rearrange(frames, 't h w c -> t c h w')

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    frames = transform(frames)

    if audio_tensor.numel() == 0:
        raise ValueError("No audio track in this video")

    # Down-mix the channels to mono.
    audio = audio_tensor.mean(dim=0).numpy().astype(np.float32)
    target_sr = 16000

    if audio_fps != target_sr:
        audio = librosa.resample(audio, orig_sr=audio_fps, target_sr=target_sr)

    # read_video has already trimmed the audio to the requested
    # [start, start + duration] window, so the segment starts at sample 0.
    # Offsetting by `start` again would discard real audio and leave padding.
    required_samples = T * target_sr
    audio_segment = audio[:required_samples]

    if len(audio_segment) < required_samples:
        pad_len = required_samples - len(audio_segment)
        audio_segment = np.pad(audio_segment, (0, pad_len), mode='constant')

    # One row per second, aligned with the T selected frames.
    audio_segments = torch.tensor(audio_segment.reshape(T, target_sr))

    return frames, audio_segments, target_sr

def process_audio(audio, sample_rate):
    """Return ``audio`` at 16 kHz, resampling with librosa only when needed.

    Audio that is already at 16 kHz is returned unchanged (same object).
    """
    target_rate = 16000
    if sample_rate == target_rate:
        return audio
    return librosa.resample(audio, orig_sr=sample_rate, target_sr=target_rate)

In [7]:
# Smoke test: run an untrained model on a 5-second window of one training video.
video_path = "/content/data_train_short/-220020068_456239859/-220020068_456239859.mp4"
frames, audio, sr = extract_frames_and_audio(video_path, start=4, duration=5)
audio = process_audio(audio, sr)

frames = frames.unsqueeze(1) # (frames, batch, channel, h, w)
# as_tensor avoids the copy-construct warning when `audio` is already a tensor.
audio_input = torch.as_tensor(audio).unsqueeze(1) # (frames, batch, seq_len)

attention_mask = torch.ones_like(audio_input).bool()

print("Аудио:", audio_input.shape)
print("Картинки:", frames.shape)

model = IntroDetectionModel().eval()

# torch.autocast replaces the deprecated torch.cuda.amp.autocast; it is only
# enabled when CUDA is actually available, matching the previous behaviour.
with torch.no_grad(), torch.autocast(device_type="cuda", enabled=torch.cuda.is_available()):
    predicted_tags, emissions = model(frames, audio_input, attention_mask)

print("Предсказанные метки:", predicted_tags)
print(len(predicted_tags), emissions.shape)

  audio_input = torch.tensor(audio).unsqueeze(1) # (frames, batch, seq_len)


Аудио: torch.Size([5, 1, 16000])
Картинки: torch.Size([5, 1, 3, 224, 224])


  with torch.no_grad(), autocast():


Предсказанные метки: [[1, 1, 1, 1, 1]]
1 torch.Size([5, 1, 2])


In [None]:
import json
import os

# Collect the sub-directories of the training root.
root_dir = "/content/data_train_short"
all_items = os.listdir(root_dir)
folders = [entry for entry in all_items if os.path.isdir(os.path.join(root_dir, entry))]

with open("/content/labels_json/train_labels.json", "r") as f:
    labels_data = json.load(f)

# Show the annotated start/end value of every labelled entry.
for name, annotation in labels_data.items():
    print(annotation['start'], annotation['end'])

# 5-second fragments, up to 3 minutes

In [16]:
class IntroDetectionDataset(Dataset):
    """Fixed-length video fragments with per-second intro labels.

    Every sub-directory ``<name>`` of ``root_dir`` is expected to contain
    ``<name>.mp4``. Each video contributes
    ``max_total_duration // fragment_duration`` consecutive fragments of
    ``fragment_duration`` seconds, starting at second 0.

    ``labels_path`` is a JSON file mapping the folder name to a dict with
    ``"start"`` and ``"end"`` strings in ``H:M:S`` format.
    """

    def __init__(self, root_dir, labels_path, fps=1, fragment_duration=5, max_total_duration=180):
        self.root_dir = root_dir
        self.fps = fps
        self.fragment_duration = fragment_duration
        self.max_total_duration = max_total_duration

        # os.listdir order is arbitrary; sort so that index -> fragment is stable.
        self.folder_names = sorted(
            item for item in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, item))
        )

        with open(labels_path, "r") as f:
            self.labels_data = json.load(f)

    @property
    def fragments_per_video(self):
        """Number of fragments taken from every video."""
        return self.max_total_duration // self.fragment_duration

    @staticmethod
    def _time_to_sec(time_str):
        """Convert an ``H:M:S`` string into a number of seconds."""
        h, m, s = map(int, time_str.split(":"))
        return h * 3600 + m * 60 + s

    def _fragment_labels(self, folder_name, start_sec):
        """Return one 0/1 label per second of the fragment starting at ``start_sec``.

        A second is labelled 1 when ``intro_start <= second < intro_end``.
        """
        annotation = self.labels_data[folder_name]
        intro_start_sec = self._time_to_sec(annotation["start"])
        intro_end_sec = self._time_to_sec(annotation["end"])

        labels_list = [
            1 if intro_start_sec <= sec < intro_end_sec else 0
            for sec in range(start_sec, start_sec + self.fragment_duration)
        ]
        return torch.tensor(labels_list, dtype=torch.long)

    def __len__(self):
        return len(self.folder_names) * self.fragments_per_video

    def __getitem__(self, idx):
        """Load the frames, audio and labels of fragment ``idx``.

        Returns a dict with keys ``"frames"``, ``"audio"`` and ``"labels"``.
        """
        # Fragments are laid out video by video.
        folder_idx, fragment_idx = divmod(idx, self.fragments_per_video)

        folder_name = self.folder_names[folder_idx]
        video_path = os.path.join(self.root_dir, folder_name, f"{folder_name}.mp4")

        start_sec = fragment_idx * self.fragment_duration

        frames, audio_segments, _ = extract_frames_and_audio(
            video_path,
            fps=self.fps,
            start=start_sec,
            duration=self.fragment_duration
        )

        return {
            "frames": frames,
            "audio": audio_segments,
            "labels": self._fragment_labels(folder_name, start_sec)
        }

def collate_fn(batch):
    """Collate dataset items into time-major ``(T, B, ...)`` batch tensors.

    Each item is a dict with ``"frames"``, ``"audio"`` and ``"labels"``
    tensors whose first dimension is time. The samples are stacked along
    dim 1, which gives the same values as a batch-first stack followed by
    ``transpose(0, 1)``. Unlike that transposed view, the result is
    contiguous, so it can safely be flattened with ``.view`` downstream.
    """
    return {
        key: torch.stack([sample[key] for sample in batch], dim=1)
        for key in ("frames", "audio", "labels")
    }

In [18]:
# Build the training dataset and look at the tensor shapes of the first batch.
dataset = IntroDetectionDataset("/content/data_train_short", "/content/labels_json/train_labels.json")
loader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn)

for batch in loader:
    for title, key in (("Фреймы:", "frames"), ("Аудио:", "audio"), ("Метки:", "labels")):
        print(title, batch[key].shape)
    break  # only the first batch is inspected

Фреймы: torch.Size([5, 2, 3, 224, 224])
Аудио: torch.Size([5, 2, 16000])
Метки: torch.Size([5, 2])


In [None]:
def train(model, dataloader, optimizer, device):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: module called as
            ``model(frames, audio, attention_mask=..., labels=...)`` and
            returning a ``(loss, emissions)`` tuple.
        dataloader: iterable of dicts with ``"frames"``, ``"audio"`` and
            ``"labels"`` tensors.
        optimizer: optimizer that updates the model parameters.
        device: device the batch tensors are moved to.

    Returns:
        The average loss over all batches of the epoch (also printed).

    Raises:
        ValueError: if the dataloader yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for batch in tqdm(dataloader, desc="Training"):
        # contiguous(): time-major batches built by transposing a batch-first
        # stack are non-contiguous and cannot be flattened with .view().
        frames = batch["frames"].to(device).contiguous()
        audio = batch["audio"].to(device).contiguous()
        labels = batch["labels"].to(device).contiguous()

        # Segments have a fixed length, so every sample is attended to.
        attention_mask = torch.ones_like(audio, dtype=torch.long)

        loss, _ = model(frames, audio, attention_mask=attention_mask, labels=labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("dataloader yielded no batches")

    avg_loss = total_loss / num_batches
    print(f"Epoch Loss: {avg_loss:.4f}")
    return avg_loss

# Training configuration.
LEARNING_RATE = 3e-4
TRAIN_BATCH_SIZE = 4
NUM_EPOCHS = 15

# Use the GPU when one is available.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

model = IntroDetectionModel().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loader = DataLoader(
    dataset,
    batch_size=TRAIN_BATCH_SIZE,
    collate_fn=collate_fn,
    shuffle=True,
)

for epoch in range(NUM_EPOCHS):
    train(model, loader, optimizer, device)