In [6]:
import mediapipe as mp
import cv2
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from ultralytics import YOLO
from transformers import CLIPProcessor
from models.models import EmotionMamba, PersonalityMamba, FusionTransformer
from data_loading.feature_extractor import PretrainedImageEmbeddingExtractor
from utils.config_loader import ConfigLoader

def draw_box(image, box, color=(255, 0, 255), line_width=2):
    """Draw a rectangle outline on the image in place.

    Args:
        image: Array the rectangle is drawn onto; forwarded to ``cv2.rectangle``.
        box: Sequence whose first four items are x1, y1, x2, y2 (each cast to int).
        color: Colour tuple forwarded to ``cv2.rectangle``.
        line_width: Outline thickness in pixels. Pass ``None`` (or 0) to derive
            the thickness from the image dimensions, never thinner than 2.
    """
    # A falsy line_width selects the size-proportional fallback.
    lw = line_width or max(round(sum(image.shape) / 2 * 0.003), 2)
    top_left = (int(box[0]), int(box[1]))
    bottom_right = (int(box[2]), int(box[3]))
    cv2.rectangle(image, top_left, bottom_right, color, thickness=lw, lineType=cv2.LINE_AA)

def image_processing(image, image_processor, device="cuda"):
    """Run an image through the processor and return its pixel values.

    Args:
        image: Image (or crop) handed to the processor via ``images=``.
        image_processor: Callable accepting ``images=`` and ``return_tensors=``
            whose result supports ``.to(device)`` and ``['pixel_values']``.
        device: Device the processed batch is moved to. Defaults to ``"cuda"``,
            which is what this helper previously hard-coded.

    Returns:
        The ``'pixel_values'`` entry of the processed batch.
    """
    batch = image_processor(images=image, return_tensors="pt").to(device)
    return batch['pixel_values']

def preprocess_face(face_roi: np.ndarray) -> np.ndarray:
    """Preprocess a face crop: resize to 112x112 and scale values by 1/255.

    Returns a float32 array; for 8-bit input the values land in [0, 1].
    """
    resized = cv2.resize(face_roi, (112, 112))
    return resized.astype('float32') / 255.0

def preprocess_body(body_roi: np.ndarray) -> np.ndarray:
    """Preprocess a body crop: resize to 224x224 and scale values by 1/255.

    Returns a float32 array; for 8-bit input the values land in [0, 1].
    """
    resized = cv2.resize(body_roi, (224, 224))
    return resized.astype('float32') / 255.0

def select_uniform_frames(frames, N):
    """Pick at most ``N`` items from ``frames``, evenly spread over its length.

    When ``frames`` has ``N`` items or fewer it is returned unchanged (the same
    object). Otherwise ``N`` positions are spaced linearly between the first
    and last index, truncated to integers, and the items at those positions
    are returned as a new list.
    """
    total = len(frames)
    if total > N:
        positions = np.linspace(0, total - 1, num=N, dtype=int)
        return [frames[pos] for pos in positions]
    return frames

def get_fusion_model(config, device):
    """Assemble the emotion + personality fusion network in eval mode.

    Builds an ``EmotionMamba`` and a ``PersonalityMamba`` from the ``*_emo`` /
    ``*_per`` config entries, then wraps both in a ``FusionTransformer``
    configured from the un-suffixed entries. Every module is moved to
    ``device`` and switched to eval mode.

    No weights are loaded here: the sub-models keep their initial parameters
    until the caller loads a state dict into the returned model.

    NOTE(review): ``mamba_d_model`` is filled from the config's
    ``mamba_d_state*`` values — confirm that mapping is intended.
    """
    # Arguments every one of the three networks receives.
    shared = dict(
        input_dim_emotion=config.image_embedding_dim,
        input_dim_personality=config.image_embedding_dim,
        dropout=config.dropout,
        num_emotions=7,
        num_traits=5,
        device=device,
    )
    # The two single-task encoders additionally share the sequence length.
    encoder_shared = dict(shared, len_seq=config.counter_need_frames)

    emo_model = EmotionMamba(
        hidden_dim=config.hidden_dim_emo,
        out_features=config.out_features_emo,
        tr_layer_number=config.tr_layer_number_emo,
        num_transformer_heads=config.num_transformer_heads_emo,
        positional_encoding=config.positional_encoding_emo,
        mamba_d_model=config.mamba_d_state_emo,
        mamba_layer_number=config.mamba_layer_number_emo,
        **encoder_shared,
    )
    emo_model = emo_model.to(device).eval()

    # Hyper-parameters here are those of the best personality model.
    per_model = PersonalityMamba(
        hidden_dim=config.hidden_dim_per,
        out_features=config.out_features_per,
        per_activation=config.best_per_activation,
        tr_layer_number=config.tr_layer_number_per,
        num_transformer_heads=config.num_transformer_heads_per,
        positional_encoding=config.positional_encoding_per,
        mamba_d_model=config.mamba_d_state_per,
        mamba_layer_number=config.mamba_layer_number_per,
        **encoder_shared,
    )
    per_model = per_model.to(device).eval()

    fusion = FusionTransformer(
        emo_model=emo_model,
        per_model=per_model,
        hidden_dim=config.hidden_dim,
        out_features=config.out_features,
        per_activation=config.per_activation,
        tr_layer_number=config.tr_layer_number,
        num_transformer_heads=config.num_transformer_heads,
        positional_encoding=config.positional_encoding,
        mamba_d_model=config.mamba_d_state,
        mamba_layer_number=config.mamba_layer_number,
        **shared,
    )
    return fusion.to(device).eval()

def transform_matrix(matrix):
    """Binarise columns 1.. of ``matrix``, gated by column 0.

    Rows whose first column is at least ``1 - 1/7`` produce an all-zero row.
    In every other row each remaining column becomes 1 when its value is at
    least ``1/7`` and 0 otherwise. The result has one column fewer than the
    input and keeps the input's dtype.
    """
    gate_cutoff = 1 - 1/7
    active_cutoff = 1/7

    tail = matrix[:, 1:]
    gated_off = matrix[:, 0] >= gate_cutoff
    keep = np.logical_not(gated_off)

    out = np.zeros_like(tail)
    out[keep] = (tail[keep] >= active_cutoff).astype(int)
    return out

def process_predictions(pred_emo):
    """Convert emotion logits into the nested list produced by ``transform_matrix``.

    A softmax is applied along dimension 1, the result is moved to CPU as a
    numpy array, passed through ``transform_matrix`` and returned via
    ``tolist()``.
    """
    probabilities = torch.softmax(pred_emo, dim=1)
    as_array = probabilities.detach().cpu().numpy()
    return transform_matrix(as_array).tolist()

def _roi_to_pixels(roi, image_processor):
    """Return processor pixel values for a crop, or None when the crop is empty."""
    if roi.size == 0:
        return None
    return image_processing(roi, image_processor)


def _tracker_id(box):
    """Return the integer track id of a detection box, or -1 when it has none."""
    if box.id is None:
        return -1
    return box.id.int().cpu().item()


def _embed_batches(batches, image_feature_extractor, device):
    """Concatenate per-crop batches and embed them; None when nothing was collected."""
    if not batches:
        return None
    return image_feature_extractor.extract(torch.cat(batches, dim=0)).to(device)


def get_metadata(video_path: str, segment_length: int, image_processor, image_feature_extractor, device) -> "tuple":
    """Collect face/body boxes and embeddings for uniformly sampled video frames.

    Relies on the module-level ``face_detector`` and ``body_detector``.
    Up to ``segment_length`` frames are sampled uniformly over the video. On
    each sampled frame every detected face is paired with the first tracked
    body box containing the face centre; when a frame has no faces, the
    largest body box is used on its own. Each annotated frame is shown with
    matplotlib.

    Args:
        video_path: Path of the video to read.
        segment_length: Maximum number of frames to sample.
        image_processor: Processor passed to ``image_processing`` for each crop.
        image_feature_extractor: Object whose ``extract`` embeds a batch of
            processed crops.
        device: Device the extracted features are moved to.

    Returns:
        ``(df, body_feature, face_feature)``. ``df`` has one row per detected
        face (or per face-less frame with a body) holding the video name,
        frame index, track id (-1 when unknown) and face/body box coordinates
        (``None`` where missing). ``body_feature`` / ``face_feature`` are the
        embeddings of all collected crops, or ``None`` when no crop of that
        kind was found.

    NOTE(review): crops are handed to the processor in the channel order
    returned by ``cv2.VideoCapture`` — confirm this matches what the embedding
    model was trained on.
    """
    if hasattr(body_detector.predictor, 'trackers'):
        body_detector.predictor.trackers[0].reset()

    cap = cv2.VideoCapture(video_path)
    video_name = os.path.basename(video_path)

    rows = []
    body_list = []
    face_list = []

    try:
        w, h, total_frames = (
            int(cap.get(prop))
            for prop in (cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT, cv2.CAP_PROP_FRAME_COUNT)
        )
        # A set keeps the per-frame membership test O(1).
        need_frames = set(select_uniform_frames(list(range(total_frames)), segment_length))

        counter = 0
        while True:
            ret, im0 = cap.read()
            if not ret:
                break

            if counter in need_frames:
                # Detect all faces
                face_results = face_detector.process(cv2.cvtColor(im0, cv2.COLOR_BGR2RGB))
                # Detect (and track) all bodies
                body_results = body_detector.track(im0, persist=True, imgsz=640, conf=0.01, iou=0.5,
                                                   augment=False, device=0, verbose=False)
                boxes = body_results[0].boxes if body_results else None
                has_bodies = boxes is not None and len(boxes) > 0

                # Boxes are drawn only after every crop of this frame has been
                # processed, so the overlay never leaks into the model input.
                overlays = []

                # Case 1: faces present - handle each of them
                if face_results.detections:
                    for detection in face_results.detections:
                        # Face coordinates, clamped to the frame
                        bbox = detection.location_data.relative_bounding_box
                        x1, y1 = max(int(bbox.xmin * w), 0), max(int(bbox.ymin * h), 0)
                        x2 = min(int((bbox.xmin + bbox.width) * w), w)
                        y2 = min(int((bbox.ymin + bbox.height) * h), h)
                        face_center = ((x1 + x2) // 2, (y1 + y2) // 2)

                        # Find the first body box containing the face centre
                        body_bbox = None
                        body_id = -1
                        if has_bodies:
                            for box in boxes:
                                box_coords = box.xyxy.int().cpu().numpy()[0]
                                if (box_coords[0] <= face_center[0] <= box_coords[2] and
                                        box_coords[1] <= face_center[1] <= box_coords[3]):
                                    body_bbox = box_coords
                                    body_id = _tracker_id(box)
                                    break

                        face_pixels = _roi_to_pixels(im0[y1:y2, x1:x2], image_processor)
                        overlays.append((x1, y1, x2, y2))

                        body_pixels = None
                        if body_bbox is not None:
                            body_roi = im0[body_bbox[1]:body_bbox[3], body_bbox[0]:body_bbox[2]]
                            body_pixels = _roi_to_pixels(body_roi, image_processor)
                            overlays.append(body_bbox)

                        has_body = body_bbox is not None
                        rows.append([
                            video_name, counter, body_id,
                            x1, y1, x2, y2,
                            body_bbox[0] if has_body else None,
                            body_bbox[1] if has_body else None,
                            body_bbox[2] if has_body else None,
                            body_bbox[3] if has_body else None,
                        ])
                        if body_pixels is not None:
                            body_list.append(body_pixels)
                        if face_pixels is not None:
                            face_list.append(face_pixels)

                # Case 2: no faces - take the largest body
                elif has_bodies:
                    largest_body = max(
                        boxes,
                        key=lambda box: (box.xyxy[0, 2] - box.xyxy[0, 0]) * (box.xyxy[0, 3] - box.xyxy[0, 1])
                    )
                    body_coords = largest_body.xyxy.int().cpu().numpy()[0]
                    body_id = _tracker_id(largest_body)

                    # Use the same processor as in the face branch so that all
                    # entries of body_list can be concatenated later.
                    body_roi = im0[body_coords[1]:body_coords[3], body_coords[0]:body_coords[2]]
                    body_pixels = _roi_to_pixels(body_roi, image_processor)

                    rows.append([
                        video_name, counter, body_id,
                        None, None, None, None,  # no face
                        body_coords[0], body_coords[1], body_coords[2], body_coords[3],
                    ])
                    if body_pixels is not None:
                        body_list.append(body_pixels)

                for overlay in overlays:
                    draw_box(im0, overlay)
                plt.imshow(cv2.cvtColor(im0, cv2.COLOR_BGR2RGB))
                plt.show()

            counter += 1
            torch.cuda.empty_cache()
    finally:
        # Release the capture even when detection or preprocessing raises.
        cap.release()

    body_feature = _embed_batches(body_list, image_feature_extractor, device)
    face_feature = _embed_batches(face_list, image_feature_extractor, device)

    df = pd.DataFrame(rows, columns=[
        "video_name", "frame", "person_id",
        "face_x1", "face_y1", "face_x2", "face_y2",
        "body_x1", "body_y1", "body_x2", "body_y2",
    ])
    return df, body_feature, face_feature

In [7]:
# Detectors: MediaPipe for faces and a YOLO checkpoint for bodies. Both are
# read as notebook-level globals by get_metadata().
mp_face_detection = mp.solutions.face_detection
face_detector = mp_face_detection.FaceDetection(model_selection=1, min_detection_confidence=0.6)
body_detector = YOLO('extractors/body/best.pt')
# CLIP processor used by image_processing() to turn crops into pixel values.
image_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
# Separate inference configs for the body and the face fusion models.
config_body = ConfigLoader("inference_config_body.toml")
config_face = ConfigLoader("inference_config_face.toml")
# image_feature_extractor = PretrainedImageEmbeddingExtractor(config_body)
image_feature_extractor = PretrainedImageEmbeddingExtractor(device="cuda")
# Models can be downloaded from https://drive.google.com/drive/folders/1APMtC4LXjuW9behd2TxVXz0DsjQKAgRR?usp=sharing

# Build both fusion models (randomly initialised at this point) on the GPU.
body_model = get_fusion_model(config_body, 'cuda')
face_model = get_fusion_model(config_face, 'cuda')
# Training run the body checkpoint comes from:
# results_clip_body_true_mamba_fusiontransformer_2025-06-27_16-10-57/metrics_by_epoch/metrics_epochlog_FusionTransformer_num_transformer_heads_16_20250627_183039_timestamp/best_model_dev.pt
body_fusion_model_path = 'extractors/body/clip_body_mamba_transformer_fusion_model.pt'
# Training run the face checkpoint comes from:
# results_fusiontransformer_2025-07-03_09-41-13/metrics_by_epoch/metrics_epochlog_FusionTransformer_tr_layer_number_3_20250703_124848_timestamp/best_model_dev.pt
face_fusion_model_path = 'extractors/face/clip_face_mamba_transformer_fusion_model.pt'

# NOTE(review): the recorded run of this cell fails in load_state_dict with
# size mismatches (checkpoint tensors are 1024-wide, the freshly built model is
# 256-wide), so the hidden size in the loaded config presumably does not match
# the checkpoint - confirm the inference_config_*.toml values.
body_state = torch.load(body_fusion_model_path, map_location='cuda')
body_model.load_state_dict(body_state)

face_state = torch.load(face_fusion_model_path, map_location='cuda')
face_model.load_state_dict(face_state)

RuntimeError: Error(s) in loading state_dict for FusionTransformer:
	size mismatch for emo_proj.0.weight: copying a param with shape torch.Size([1024, 256]) from checkpoint, the shape in current model is torch.Size([256, 256]).
	size mismatch for emo_proj.0.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for emo_proj.1.weight: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for emo_proj.1.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for per_proj.0.weight: copying a param with shape torch.Size([1024, 1024]) from checkpoint, the shape in current model is torch.Size([256, 1024]).
	size mismatch for per_proj.0.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for per_proj.1.weight: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for per_proj.1.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for emotion_to_personality_attn.0.self_attention.in_proj_weight: copying a param with shape torch.Size([3072, 1024]) from checkpoint, the shape in current model is torch.Size([768, 256]).
	size mismatch for emotion_to_personality_attn.0.self_attention.in_proj_bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([768]).
	size mismatch for emotion_to_personality_attn.0.self_attention.out_proj.weight: copying a param with shape torch.Size([1024, 1024]) from checkpoint, the shape in current model is torch.Size([256, 256]).
	size mismatch for emotion_to_personality_attn.0.self_attention.out_proj.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for emotion_to_personality_attn.0.feed_forward.layer_1.weight: copying a param with shape torch.Size([1024, 1024]) from checkpoint, the shape in current model is torch.Size([256, 256]).
	size mismatch for emotion_to_personality_attn.0.feed_forward.layer_1.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for emotion_to_personality_attn.0.feed_forward.layer_2.weight: copying a param with shape torch.Size([1024, 1024]) from checkpoint, the shape in current model is torch.Size([256, 256]).
	size mismatch for emotion_to_personality_attn.0.feed_forward.layer_2.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for emotion_to_personality_attn.0.add_norm_after_attention.norm.weight: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for emotion_to_personality_attn.0.add_norm_after_attention.norm.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for emotion_to_personality_attn.0.add_norm_after_ff.norm.weight: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for emotion_to_personality_attn.0.add_norm_after_ff.norm.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for emotion_to_personality_attn.0.positional_encoding.pe: copying a param with shape torch.Size([5000, 1024]) from checkpoint, the shape in current model is torch.Size([5000, 256]).
	size mismatch for personality_to_emotion_attn.0.self_attention.in_proj_weight: copying a param with shape torch.Size([3072, 1024]) from checkpoint, the shape in current model is torch.Size([768, 256]).
	size mismatch for personality_to_emotion_attn.0.self_attention.in_proj_bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([768]).
	size mismatch for personality_to_emotion_attn.0.self_attention.out_proj.weight: copying a param with shape torch.Size([1024, 1024]) from checkpoint, the shape in current model is torch.Size([256, 256]).
	size mismatch for personality_to_emotion_attn.0.self_attention.out_proj.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for personality_to_emotion_attn.0.feed_forward.layer_1.weight: copying a param with shape torch.Size([1024, 1024]) from checkpoint, the shape in current model is torch.Size([256, 256]).
	size mismatch for personality_to_emotion_attn.0.feed_forward.layer_1.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for personality_to_emotion_attn.0.feed_forward.layer_2.weight: copying a param with shape torch.Size([1024, 1024]) from checkpoint, the shape in current model is torch.Size([256, 256]).
	size mismatch for personality_to_emotion_attn.0.feed_forward.layer_2.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for personality_to_emotion_attn.0.add_norm_after_attention.norm.weight: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for personality_to_emotion_attn.0.add_norm_after_attention.norm.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for personality_to_emotion_attn.0.add_norm_after_ff.norm.weight: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for personality_to_emotion_attn.0.add_norm_after_ff.norm.bias: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for personality_to_emotion_attn.0.positional_encoding.pe: copying a param with shape torch.Size([5000, 1024]) from checkpoint, the shape in current model is torch.Size([5000, 256]).
	size mismatch for emotion_personality_fc_out.0.weight: copying a param with shape torch.Size([128, 2048]) from checkpoint, the shape in current model is torch.Size([128, 512]).
	size mismatch for personality_emotion_fc_out.0.weight: copying a param with shape torch.Size([128, 2048]) from checkpoint, the shape in current model is torch.Size([128, 512]).

In [33]:
import pickle

# Path to the .pickle file (CMU-MOSEI test subset, average_features=True per the file name)
pickle_path = "../features/cmu_mosei_test_seed_42_subset_size_2_average_features_True_feature_norm_False.pickle"

# NOTE: pickle.load can execute arbitrary code - only open files you trust.
with open(pickle_path, "rb") as f:
    data = pickle.load(f)

# Look at the first element
print("🔍 Первый элемент:")
item = data[0]
print(item)



🔍 Первый элемент:
{'sample_name': '-6rXp3zJ3kc_14.4680_22.8820', 'video_path': 'E:/CMU-MOSEI//video/test/-6rXp3zJ3kc_14.4680_22.8820.mp4', 'audio_path': 'E:/CMU-MOSEI//audio/test/-6rXp3zJ3kc_14.4680_22.8820.wav', 'features': {'body': {'emotion_logits': tensor([ 0.4595,  1.5333, -0.9546,  0.3222, -0.7833,  1.2438, -0.9155]), 'personality_scores': tensor([0.3668, 0.6629, 0.3182, 0.5780, 0.5149]), 'last_emo_encoder_features': tensor([ 1.6797,  1.1995,  0.0361,  ...,  1.3864,  5.1510, -0.0212]), 'last_per_encoder_features': tensor([-0.1187,  3.0620, -1.4489,  ..., -0.2055, -0.5260,  0.7611])}, 'face': {'emotion_logits': tensor([ 0.6361,  1.7097, -0.3265, -0.9575, -1.2868,  0.9896, -0.9342]), 'personality_scores': tensor([0.4642, 0.6192, 0.3096, 0.5205, 0.4690]), 'last_emo_encoder_features': tensor([-8.2230e+00, -6.9223e+00, -5.7896e+00, -3.3821e+00, -3.7589e+00,
         2.9955e+00,  3.7664e+00,  1.5413e+00, -1.1885e-01,  6.8728e-01,
        -2.7760e+00, -4.4210e+00,  2.4823e+00,  3.9500e+

In [34]:
import torch
import numpy as np
# Additionally: print the feature shapes for every modality
print("\n🔎 Шейпы признаков:")
modalities = item.get("features", {})
for mod_name, features in modalities.items():
    print(f"\n[{mod_name.upper()}]")
    for feat_name, feat_val in features.items():
        # Tensors and numpy arrays report their shape; anything else its type.
        has_shape = isinstance(feat_val, (torch.Tensor, np.ndarray))
        summary = feat_val.shape if has_shape else f"not a tensor ({type(feat_val)})"
        print(f"  {feat_name}: {summary}")


🔎 Шейпы признаков:

[BODY]
  emotion_logits: torch.Size([7])
  personality_scores: torch.Size([5])
  last_emo_encoder_features: torch.Size([1024])
  last_per_encoder_features: torch.Size([1024])

[FACE]
  emotion_logits: torch.Size([7])
  personality_scores: torch.Size([5])
  last_emo_encoder_features: torch.Size([512])
  last_per_encoder_features: torch.Size([512])

[SCENE]
  emotion_logits: torch.Size([7])
  personality_scores: torch.Size([5])
  last_emo_encoder_features: torch.Size([768])
  last_per_encoder_features: torch.Size([768])

[AUDIO]
  emotion_logits: torch.Size([7])
  personality_scores: torch.Size([5])
  last_emo_encoder_features: torch.Size([256])
  last_per_encoder_features: torch.Size([256])

[TEXT]
  emotion_logits: torch.Size([7])
  personality_scores: torch.Size([5])
  last_emo_encoder_features: torch.Size([256])
  last_per_encoder_features: torch.Size([256])


In [35]:
import pickle

# Path to the .pickle file (CMU-MOSEI test subset, average_features=False per the file name)
pickle_path = "../features/cmu_mosei_test_seed_42_subset_size_2_average_features_False_feature_norm_False.pickle"

# NOTE: pickle.load can execute arbitrary code - only open files you trust.
with open(pickle_path, "rb") as f:
    data = pickle.load(f)

# Look at the first element
print("🔍 Первый элемент:")
item = data[0]
print(item)



🔍 Первый элемент:
{'sample_name': '-6rXp3zJ3kc_14.4680_22.8820', 'video_path': 'E:/CMU-MOSEI//video/test/-6rXp3zJ3kc_14.4680_22.8820.mp4', 'audio_path': 'E:/CMU-MOSEI//audio/test/-6rXp3zJ3kc_14.4680_22.8820.wav', 'features': {'body': {'emotion_logits': tensor([ 0.4595,  1.5333, -0.9546,  0.3222, -0.7833,  1.2438, -0.9155]), 'personality_scores': tensor([0.3668, 0.6629, 0.3182, 0.5780, 0.5149]), 'last_emo_encoder_features': tensor([[ 0.8910,  1.9902,  0.0095,  ...,  1.1676,  5.9335,  0.1280],
        [ 2.5225,  0.8325,  0.3014,  ...,  1.2802,  5.2854, -0.2099],
        [ 1.9918,  1.4558,  0.7890,  ...,  1.4387,  4.8233,  0.6186],
        ...,
        [ 2.1812,  1.0188,  0.6352,  ...,  1.4023,  5.2032,  0.4299],
        [ 1.3845,  1.0713,  0.4135,  ...,  2.0127,  5.1946,  0.1978],
        [ 1.1576,  0.4487, -0.5617,  ...,  1.3371,  5.4190, -0.4237]]), 'last_per_encoder_features': tensor([[ 0.1147,  3.0628, -1.2588,  ..., -0.4051, -0.6193,  0.7605],
        [ 0.0674,  3.0821, -1.4463,  ..

In [32]:
import pickle

# Path to the .pickle file (First Impressions V2 test subset, average_features=False per the file name)
pickle_path = "../features/fiv2_test_seed_42_subset_size_2_average_features_False_feature_norm_False.pickle"

# NOTE: pickle.load can execute arbitrary code - only open files you trust.
with open(pickle_path, "rb") as f:
    data = pickle.load(f)

# Look at the first element
print("🔍 Первый элемент:")
item = data[0]
print(item)



🔍 Первый элемент:
{'sample_name': 'htH89DBizno.004', 'video_path': 'E:/FirstImpressionsV2//video/test/htH89DBizno.004.mp4', 'audio_path': 'E:/FirstImpressionsV2//audio/test/htH89DBizno.004.wav', 'features': {'body': {'emotion_logits': tensor([-1.3183, -0.8238, -0.6552, -0.6306,  1.2407, -0.7158,  1.3891]), 'personality_scores': tensor([0.5326, 0.5365, 0.4849, 0.6321, 0.5423]), 'last_emo_encoder_features': tensor([[-6.0984, -1.2685, -2.9987,  ..., -4.1940, -0.9488, -2.8461],
        [-4.2881, -1.9723, -2.6279,  ..., -4.4866, -0.6987, -2.1308],
        [-4.4905, -2.9841, -2.6212,  ..., -4.8182, -0.5226, -2.0801],
        ...,
        [-3.2482, -2.0572, -2.8292,  ..., -3.3157, -1.0039, -1.2865],
        [-3.4654, -3.4890, -2.1004,  ..., -3.6636, -0.5618, -0.3304],
        [-3.5330, -3.4800, -2.3680,  ..., -3.8267, -0.4109,  0.1079]]), 'last_per_encoder_features': tensor([[ 4.5091,  1.5342,  0.8601,  ...,  0.1812, -1.9097, -0.6636],
        [ 4.4139,  1.5332,  0.4595,  ...,  0.0348, -2.183

In [29]:
import torch
import numpy as np
# Additionally: print the feature shapes for every modality
print("\n🔎 Шейпы признаков:")
modalities = item.get("features", {})
for mod_name, features in modalities.items():
    print(f"\n[{mod_name.upper()}]")
    for feat_name, feat_val in features.items():
        # Tensors and numpy arrays report their shape; anything else its type.
        has_shape = isinstance(feat_val, (torch.Tensor, np.ndarray))
        summary = feat_val.shape if has_shape else f"not a tensor ({type(feat_val)})"
        print(f"  {feat_name}: {summary}")


🔎 Шейпы признаков:

[BODY]
  emotion_logits: torch.Size([7])
  personality_scores: torch.Size([5])
  last_emo_encoder_features: torch.Size([30, 1024])
  last_per_encoder_features: torch.Size([30, 1024])

[FACE]
  emotion_logits: torch.Size([7])
  personality_scores: torch.Size([5])
  last_emo_encoder_features: torch.Size([30, 512])
  last_per_encoder_features: torch.Size([30, 512])

[SCENE]
  emotion_logits: torch.Size([7])
  personality_scores: torch.Size([5])
  last_emo_encoder_features: torch.Size([30, 768])
  last_per_encoder_features: torch.Size([30, 768])

[AUDIO]
  emotion_logits: torch.Size([7])
  personality_scores: torch.Size([5])
  last_emo_encoder_features: torch.Size([421, 256])
  last_per_encoder_features: torch.Size([421, 256])

[TEXT]
  emotion_logits: torch.Size([7])
  personality_scores: torch.Size([5])
  last_emo_encoder_features: torch.Size([33, 256])
  last_per_encoder_features: torch.Size([33, 256])


In [2]:
# Report the container type of the loaded pickle and how many samples it holds.
print(type(data))
print(f"Элементов в списке: {len(data)}")

<class 'list'>
Элементов в списке: 10


In [3]:
import pandas as pd
import os


def clean_csv_to_copy(csv_path: str, output_path: str):
    """Write a cleaned copy of a CSV file to ``output_path``.

    Columns whose name starts with ``"Unnamed"`` or is blank are dropped, and
    the text after the last dot of every ``video_name`` value is removed via
    ``os.path.splitext``. The source file is left untouched.

    NOTE(review): ``splitext`` also strips a trailing numeric part such as
    ``.004`` from names that carry no real file extension, so running this on
    already-cleaned names shortens them again.

    Raises:
        ValueError: If the file has no ``video_name`` column.
    """
    frame = pd.read_csv(csv_path)

    # Drop unnamed columns (typically a stray index column)
    anonymous = [name for name in frame.columns if name.startswith("Unnamed") or not name.strip()]
    if anonymous:
        print(f"Удаляем колонку/и: {anonymous}")
        frame = frame.drop(columns=anonymous)

    if 'video_name' not in frame.columns:
        raise ValueError("Файл без 'video_name'. Ты что, решил пошутить?")

    def strip_extension(value):
        # Non-string values are stringified first.
        stem, _ext = os.path.splitext(str(value))
        return stem

    frame['video_name'] = frame['video_name'].map(strip_extension)

    # Save to the new file without the pandas index
    frame.to_csv(output_path, index=False)
    print(f"Готово. Чистый CSV сохранён сюда: {output_path}")
    
# Usage example: write a cleaned copy of the dev split annotations
csv_path = "E:/FirstImpressionsV2/dev_FIv2.csv"
output_path = "E:/FirstImpressionsV2/dev_full.csv"
clean_csv_to_copy(csv_path, output_path)


Удаляем колонку/и: ['Unnamed: 0']
Готово. Чистый CSV сохранён сюда: E:/FirstImpressionsV2/dev_full.csv


In [4]:
# Same clean-up for the test split annotations
csv_path = "E:/FirstImpressionsV2/test_FIv2.csv"
output_path = "E:/FirstImpressionsV2/test_full.csv"
clean_csv_to_copy(csv_path, output_path)

Удаляем колонку/и: ['Unnamed: 0']
Готово. Чистый CSV сохранён сюда: E:/FirstImpressionsV2/test_full.csv


In [5]:
# Same clean-up for the train split annotations
csv_path = "E:/FirstImpressionsV2/train_FIv2.csv"
output_path = "E:/FirstImpressionsV2/train_full.csv"
clean_csv_to_copy(csv_path, output_path)

Удаляем колонку/и: ['Unnamed: 0']
Готово. Чистый CSV сохранён сюда: E:/FirstImpressionsV2/train_full.csv


In [16]:
import torch

# Load the saved text fusion checkpoint onto the CPU to inspect its contents.
checkpoint = torch.load("../modalities/text/checkpoints/Mamba_Transformer_bge-small_fusion.pt", map_location="cpu")

# List the stored entry names (the recorded output shows an ordered dict of parameter keys).
print(checkpoint.keys())

odict_keys(['emo_model.emo_proj.0.weight', 'emo_model.emo_proj.0.bias', 'emo_model.emo_proj.1.weight', 'emo_model.emo_proj.1.bias', 'emo_model.emotion_encoder.0.in_proj.weight', 'emo_model.emotion_encoder.0.in_proj.bias', 'emo_model.emotion_encoder.0.s_B.weight', 'emo_model.emotion_encoder.0.s_B.bias', 'emo_model.emotion_encoder.0.s_C.weight', 'emo_model.emotion_encoder.0.s_C.bias', 'emo_model.emotion_encoder.0.out_proj.weight', 'emo_model.emotion_encoder.0.out_proj.bias', 'emo_model.emotion_encoder.0.norm.weight', 'emo_model.emotion_encoder.0.norm.bias', 'emo_model.emotion_encoder.1.in_proj.weight', 'emo_model.emotion_encoder.1.in_proj.bias', 'emo_model.emotion_encoder.1.s_B.weight', 'emo_model.emotion_encoder.1.s_B.bias', 'emo_model.emotion_encoder.1.s_C.weight', 'emo_model.emotion_encoder.1.s_C.bias', 'emo_model.emotion_encoder.1.out_proj.weight', 'emo_model.emotion_encoder.1.out_proj.bias', 'emo_model.emotion_encoder.1.norm.weight', 'emo_model.emotion_encoder.1.norm.bias', 'emo_mod