# Import dependencies

In [1]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import time

# Import library with current code functions
sys.path.append(os.path.join("..", "lib"))
import manual_labeler_functions as man_lab_fun, automatic_labeler_functions as aut_lab_fun, general_functions as gf, files_paths as fp

## Get the list of measure files to be labeled and the list of available seeds

In [15]:
# Paths returned by gf.find_files_in_all_subdirectories for the VD_MEASURE_L0
# file name under the two *_NOT_NORMALIZED dataset roots (videos to be labeled).
FILE_LIST_VD_MEASURE = gf.find_files_in_all_subdirectories(
    [fp.DATASET_YT_NOT_NORMALIZED, fp.DATASET_LOCAL_NOT_NORMALIZED],
    fp.VD_MEASURE_L0,
)

# Paths of the VD_LABELED_L0 files found under the seed dataset root; each one
# is used as a labeled seed by the automatic labeler cells further down.
FILE_LIST_LABELED_SEED = gf.find_files_in_all_subdirectories(
    [fp.DATASET_SEED_NOT_NORMALIZED],
    fp.VD_LABELED_L0,
)

## Remove all VD_LABELED_L0 files

In [18]:
def remove_file(FILE_LIST_PATH, file_name):
    """Delete `file_name` from the directory of every path in `FILE_LIST_PATH`.

    For each entry, the target is `file_name` placed in the same directory as
    that entry. One line is printed per entry, stating whether the target was
    removed or was not there; both messages include the target path so the
    output identifies which folders were affected.

    Args:
        FILE_LIST_PATH: iterable of file paths; only their directory part is used.
        file_name: name of the file to delete inside each of those directories.

    Returns:
        None.
    """
    for current_path in FILE_LIST_PATH:
        vd_labeled_path = os.path.join(os.path.dirname(current_path), file_name)
        # Attempt the removal directly instead of checking for existence first:
        # this avoids a race between the check and the delete.
        try:
            os.remove(vd_labeled_path)
        except FileNotFoundError:
            print(f"File not found: {vd_labeled_path}")
        else:
            print(f"The file was removed: {vd_labeled_path}")

In [19]:
# WARNING: destructive — deletes the VD_LABELED_L0 file that sits next to every
# measure file in FILE_LIST_VD_MEASURE.
remove_file(FILE_LIST_PATH=FILE_LIST_VD_MEASURE, file_name=fp.VD_LABELED_L0)

File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.
File not found.


## Select reference SEED

In [20]:
# Paths of the VD_INFO files found under the seed dataset root.
FILE_LIST_SEED_VD_INFO = gf.find_files_in_all_subdirectories(
    [fp.DATASET_SEED_NOT_NORMALIZED],
    fp.VD_INFO,
)

## Check the number of frames and instances per class

In [21]:
def _true_run_lengths(mask):
    """Return an array with the length of each run of consecutive True values in `mask`."""
    flags = mask.to_numpy(dtype=bool)
    # Pad with False on both sides so every run has a detectable start and end.
    padded = np.concatenate(([False], flags, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    # Edges alternate: run start, run end (exclusive), run start, ...
    return edges[1::2] - edges[::2]


def count_class_totals_and_instances(FILE_LIST_PATH, file_name, column_name, conditions, length_threshold=30):
    """Print per-folder and overall frame/instance counts for each class.

    For every path in `FILE_LIST_PATH`, the CSV named `file_name` in the same
    directory is read. For each entry of `conditions`, rows whose `column_name`
    value contains that text (case-insensitive, literal match) are counted as
    frames; each run of consecutive matching rows counts as one instance, and
    runs shorter than `length_threshold` rows are counted as short instances.

    Files that do not exist are reported and skipped; files without
    `column_name` are skipped silently; any error while processing a file is
    reported and the file is skipped.

    Args:
        FILE_LIST_PATH: iterable of file paths; only their directory part is used.
        file_name: name of the labeled CSV expected in each directory.
        column_name: column holding the class labels.
        conditions: list of class names to look for.
        length_threshold: runs with fewer rows than this are counted as short.

    Returns:
        None. Results are printed.
    """
    class_totals = {condition: 0 for condition in conditions}      # total frames per class
    class_instances = {condition: 0 for condition in conditions}   # total instances per class
    short_instances = {condition: 0 for condition in conditions}   # instances shorter than the threshold
    results_per_folder = {condition: [] for condition in conditions}

    for current_path in FILE_LIST_PATH:
        folder_path = os.path.dirname(current_path)
        vd_labeled_path = os.path.join(folder_path, file_name)

        # Skip folders that have no labeled file.
        if not os.path.exists(vd_labeled_path):
            print(f"Arquivo não encontrado: {vd_labeled_path}")
            continue

        # Best-effort: one unreadable or malformed file must not abort the whole report.
        try:
            df = pd.read_csv(vd_labeled_path)

            # Nothing to count if the label column is absent.
            if column_name not in df.columns:
                continue

            folder_name = os.path.basename(folder_path)

            for condition in conditions:
                # regex=False: class names are plain text, so characters such as
                # "+" or "(" must not be interpreted as regex syntax.
                mask = df[column_name].str.contains(condition, case=False, na=False, regex=False)
                run_lengths = _true_run_lengths(mask)

                total_frames = int(mask.sum())
                total_instances = len(run_lengths)
                short_instance_count = int((run_lengths < length_threshold).sum())

                results_per_folder[condition].append(
                    (folder_name, total_frames, total_instances, short_instance_count)
                )
                class_totals[condition] += total_frames
                class_instances[condition] += total_instances
                short_instances[condition] += short_instance_count
        except Exception as e:
            print(f"Erro ao processar o arquivo {vd_labeled_path}: {e}")

    # Print the per-folder lines followed by the overall total for each class.
    for condition in conditions:
        print(f"\nResultados para '{condition}':")
        for folder_name, total_frames, total_instances, short_count in results_per_folder[condition]:
            print(
                f"{folder_name} - {total_frames} frames, {total_instances} instâncias, "
                f"{short_count} instâncias com tamanho < {length_threshold}"
            )
        print(
            f"Total geral: {class_totals[condition]} frames, {class_instances[condition]} instâncias, "
            f"{short_instances[condition]} instâncias com tamanho < {length_threshold}"
        )

# Path configuration.
# NOTE(review): this rebinds FILE_LIST_VD_MEASURE, which an earlier cell built
# from the *_NOT_NORMALIZED dataset roots; the labeler cells below iterate
# whichever list was assigned last — confirm that this is intended.
FILE_LIST_VD_MEASURE = gf.find_files_in_all_subdirectories(
    [fp.DATASET_YT, fp.DATASET_LOCAL],
    fp.VD_MEASURE_L0,
)

# Classes whose frames and instances are counted.
CLASSES_TO_COUNT = ["OTHERS", "HAPPY", "NEUTRAL"]

# Run the count over the labeled file that sits next to each measure file.
count_class_totals_and_instances(
    FILE_LIST_PATH=FILE_LIST_VD_MEASURE,
    file_name=fp.VD_LABELED_L0,
    column_name="label_measures",
    conditions=CLASSES_TO_COUNT,
)



Resultados para 'OTHERS':
VD_D_0000000043 - 133 frames, 4 instâncias, 0 instâncias com tamanho < 30
VD_D_0000000044 - 244 frames, 8 instâncias, 1 instâncias com tamanho < 30
VD_D_0000000045 - 200 frames, 7 instâncias, 2 instâncias com tamanho < 30
VD_D_0000000046 - 854 frames, 28 instâncias, 8 instâncias com tamanho < 30
VD_D_0000000047 - 196 frames, 7 instâncias, 2 instâncias com tamanho < 30
VD_D_0000000048 - 348 frames, 11 instâncias, 2 instâncias com tamanho < 30
VD_D_0000000049 - 283 frames, 9 instâncias, 1 instâncias com tamanho < 30
VD_D_0000000050 - 239 frames, 8 instâncias, 2 instâncias com tamanho < 30
VD_D_0000000051 - 794 frames, 28 instâncias, 10 instâncias com tamanho < 30
VD_D_0000000052 - 536 frames, 18 instâncias, 5 instâncias com tamanho < 30
VD_D_0000000053 - 60 frames, 2 instâncias, 0 instâncias com tamanho < 30
VD_D_0000000054 - 281 frames, 10 instâncias, 3 instâncias com tamanho < 30
VD_D_0000000055 - 91 frames, 3 instâncias, 0 instâncias com tamanho < 30
VD_D_00

In [22]:
# Count the seed videos of each emotion, based on the emotion keyword that
# appears in each row's `link_video` value of every seed VD_INFO file.
count_happy = count_neutral = count_others = 0

for vd_info_file in FILE_LIST_SEED_VD_INFO:
    vd_info_df = pd.read_csv(vd_info_file)

    # Test each link on its own. Testing str(vd_info_df['link_video']) would
    # check the text of the whole column, so every row of a file would be
    # counted as soon as any single row contained the keyword.
    links = [str(link) for link in vd_info_df['link_video']]

    count_happy += sum('happy' in link for link in links)
    count_neutral += sum('neutral' in link for link in links)
    count_others += sum('others' in link for link in links)

print("Number of happy seeds:", count_happy)
print("Number of neutral seeds:", count_neutral)
print("Number of other seeds:", count_others)

Number of happy seeds: 54
Number of neutral seeds: 84
Number of other seeds: 73


In [23]:
# Display the list of labeled seed file paths as a visual sanity check.
FILE_LIST_LABELED_SEED

['..\\Dataset_not_normalized\\REF-Gold-Label_not_normalized\\VD_R_0000000001\\VD_LABELED_L0.CSV',
 '..\\Dataset_not_normalized\\REF-Gold-Label_not_normalized\\VD_R_0000000002\\VD_LABELED_L0.CSV',
 '..\\Dataset_not_normalized\\REF-Gold-Label_not_normalized\\VD_R_0000000003\\VD_LABELED_L0.CSV',
 '..\\Dataset_not_normalized\\REF-Gold-Label_not_normalized\\VD_R_0000000004\\VD_LABELED_L0.CSV',
 '..\\Dataset_not_normalized\\REF-Gold-Label_not_normalized\\VD_R_0000000005\\VD_LABELED_L0.CSV',
 '..\\Dataset_not_normalized\\REF-Gold-Label_not_normalized\\VD_R_0000000006\\VD_LABELED_L0.CSV',
 '..\\Dataset_not_normalized\\REF-Gold-Label_not_normalized\\VD_R_0000000007\\VD_LABELED_L0.CSV',
 '..\\Dataset_not_normalized\\REF-Gold-Label_not_normalized\\VD_R_0000000008\\VD_LABELED_L0.CSV',
 '..\\Dataset_not_normalized\\REF-Gold-Label_not_normalized\\VD_R_0000000009\\VD_LABELED_L0.CSV',
 '..\\Dataset_not_normalized\\REF-Gold-Label_not_normalized\\VD_R_0000000010\\VD_LABELED_L0.CSV',
 '..\\Dataset_not_no

## Automatic Labeler

In [24]:
# Automatic labeling: for every labeled seed, search every measure file for
# occurrences of the seed's class and record, per seed and per video, the
# [start frame, distance, occurrence length] of each match.
current_labeling_class = 0 # Change for the correspondent number of your labeling class
frame_distance_threshold = 2
euclidean_distance_threshold = 1.2
current_total_saved_series = 0

matches_memory = []
all_matches_memory = []
all_mass_memory = []
all_idxs_match_frame_seq_memory = []
all_seeds_occurrences_len = []

# all_data_memory[seed][video] -> list of [frame, dist, occurrence_len]
all_data_memory = []

for j, labeled_file in enumerate(FILE_LIST_LABELED_SEED[:]):
    RESUME_DT = pd.DataFrame()
    print(f'\nSearch with seed: {os.path.basename(os.path.dirname(labeled_file))}\n')

    current_seed_matches_memory = []
    current_seed_all_matches_memory = []
    current_seed_all_mass_memory = []

    current_seed_data_memory = []

    # Everything below depends only on the seed, not on the video being
    # searched, so it is loaded once per seed instead of once per video.
    path_dir = os.path.dirname(labeled_file)
    vd_labeled_path = os.path.join(path_dir, fp.VD_LABELED_L0)
    vd_labeled = pd.read_csv(vd_labeled_path)
    vd_labeled.drop(columns=['Unnamed: 0'], inplace=True)

    all_class = man_lab_fun.GET_ALL_CLASSES(vd_labeled)
    label_name = all_class[current_labeling_class]

    reference_measures = man_lab_fun.GET_MEASURES_FROM_CLASS(vd_labeled, label_name)

    frames = man_lab_fun.GET_FRAMES_FROM_CLASS(vd_labeled, label_name)

    # NOTE(review): .loc slices by label and includes both endpoints, so with a
    # default integer index this selects len(frames) + 1 rows starting at row 0
    # — confirm that this is the intended seed interval.
    all_measures_in_frame_interval = vd_labeled.loc[0:len(frames)]

    for i, current_path_location in enumerate(FILE_LIST_VD_MEASURE):

        # Built per call so that each call receives its own objects.
        selected_measures_in_frame_interval = all_measures_in_frame_interval[reference_measures]

        dict_label_parameters = {'label_name': label_name, 'reference_measures': reference_measures}

        RESUME_DT, matches, all_matches, all_mass, idxs_match_frame_seq, occurrences_len = aut_lab_fun.label_current_series(
            current_path_location,
            RESUME_DT,
            selected_measures_in_frame_interval,
            dict_label_parameters,
            os.path.dirname(labeled_file),
            LABELED_FILE_NAME=fp.VD_LABELED_L0,
            distance_threshold=euclidean_distance_threshold,
            frame_threshold=frame_distance_threshold,
        )

        # One entry per video, in the same order as FILE_LIST_VD_MEASURE.
        current_video_data = []
        current_seed_data_memory.append(current_video_data)

        for k, (frame, dist) in enumerate(idxs_match_frame_seq):
            frame_dist_len_data = [frame, dist, occurrences_len[k]]
            current_video_data.append(frame_dist_len_data)

    all_data_memory.append(current_seed_data_memory)

    # RESUME_DT is still the empty DataFrame (no 'final' column) when
    # FILE_LIST_VD_MEASURE is empty; count zero occurrences in that case.
    final_sum = RESUME_DT['final'].sum() if 'final' in RESUME_DT.columns else 0
    current_total_saved_series += final_sum
    print(f'Number of occurrences found for the current seed: {final_sum}') 
    print(f'Total of occurrences: {current_total_saved_series}') 


Search with seed: VD_R_0000000001

Number of occurrences found for the current seed: 4
Total of occurrences: 4

Search with seed: VD_R_0000000002

Number of occurrences found for the current seed: 2
Total of occurrences: 6

Search with seed: VD_R_0000000003

Number of occurrences found for the current seed: 2
Total of occurrences: 8

Search with seed: VD_R_0000000004



KeyboardInterrupt: 

In [None]:
# Automatic labeling restricted to the "others" class: seeds that do not list
# that class are skipped, and the labeler is called with the file name
# "VD_LABELED_L0_others".
current_labeling_class_others = 0  # Re-assigned for each seed to the index of its "others" class
frame_distance_threshold_others = 2
euclidean_distance_threshold_others = 1.2
current_total_saved_series_others = 0

matches_memory_others = []
all_matches_memory_others = []
all_mass_memory_others = []
all_idxs_match_frame_seq_memory_others = []
all_seeds_occurrences_len_others = []

# all_data_memory_others[seed][video] -> list of [frame, dist, occurrence_len]
all_data_memory_others = []

for j_others, labeled_file_others in enumerate(FILE_LIST_LABELED_SEED[:]):
    RESUME_DT_others = pd.DataFrame()

    # Load the labeled seed DataFrame.
    path_dir_others = os.path.dirname(labeled_file_others)
    vd_labeled_path_others = os.path.join(path_dir_others, fp.VD_LABELED_L0)
    vd_labeled_others = pd.read_csv(vd_labeled_path_others)
    vd_labeled_others.drop(columns=['Unnamed: 0'], inplace=True)

    # Get every class available in this seed.
    all_class_others = man_lab_fun.GET_ALL_CLASSES(vd_labeled_others)

    # Skip seeds that do not contain the "others" class.
    # NOTE(review): this is an exact, case-sensitive membership test, while the
    # class-count cell of this notebook queries upper-case labels ("OTHERS") —
    # confirm the spelling GET_ALL_CLASSES returns, otherwise every seed is skipped.
    if "others" not in all_class_others:
        print(f"Skipping seed: {os.path.basename(os.path.dirname(labeled_file_others))} (class 'others' not found)")
        continue

    # Point the class index at "others".
    current_labeling_class_others = all_class_others.index("others")
    label_name_others = all_class_others[current_labeling_class_others]

    print(f'\nProcessing seed: {os.path.basename(os.path.dirname(labeled_file_others))} with class "others"\n')

    current_seed_matches_memory_others = []
    current_seed_all_matches_memory_others = []
    current_seed_all_mass_memory_others = []
    current_seed_data_memory_others = []

    # Measures and frames of the "others" class depend only on the seed, so
    # they are looked up once per seed instead of once per video.
    reference_measures_others = man_lab_fun.GET_MEASURES_FROM_CLASS(vd_labeled_others, label_name_others)
    frames_others = man_lab_fun.GET_FRAMES_FROM_CLASS(vd_labeled_others, label_name_others)

    # NOTE(review): .loc slices by label and includes both endpoints, so with a
    # default integer index this selects len(frames_others) + 1 rows starting
    # at row 0 — confirm that this is the intended seed interval.
    all_measures_in_frame_interval_others = vd_labeled_others.loc[0:len(frames_others)]

    for i_others, current_path_location_others in enumerate(FILE_LIST_VD_MEASURE):
        # Built per call so that each call receives its own objects.
        selected_measures_in_frame_interval_others = all_measures_in_frame_interval_others[reference_measures_others]

        dict_label_parameters_others = {'label_name': label_name_others, 'reference_measures': reference_measures_others}

        # Process the current series.
        RESUME_DT_others, matches_others, all_matches_others, all_mass_others, idxs_match_frame_seq_others, occurrences_len_others = aut_lab_fun.label_current_series(
            current_path_location_others, RESUME_DT_others, selected_measures_in_frame_interval_others, dict_label_parameters_others, 
            os.path.dirname(labeled_file_others), LABELED_FILE_NAME="VD_LABELED_L0_others",  # File name specific to this run
            distance_threshold=euclidean_distance_threshold_others, frame_threshold=frame_distance_threshold_others
        )

        # Store the results: one entry per video, in the order of FILE_LIST_VD_MEASURE.
        current_video_data_others = []
        current_seed_data_memory_others.append(current_video_data_others)

        for k_others, (frame_others, dist_others) in enumerate(idxs_match_frame_seq_others):
            frame_dist_len_data_others = [frame_others, dist_others, occurrences_len_others[k_others]]
            current_video_data_others.append(frame_dist_len_data_others)

    all_data_memory_others.append(current_seed_data_memory_others)

    # RESUME_DT_others is still the empty DataFrame (no 'final' column) when
    # FILE_LIST_VD_MEASURE is empty; count zero occurrences in that case.
    final_sum_others = RESUME_DT_others['final'].sum() if 'final' in RESUME_DT_others.columns else 0
    current_total_saved_series_others += final_sum_others
    print(f'Number of occurrences found for the current seed: {final_sum_others}')
    print(f'Total of occurrences: {current_total_saved_series_others}')


Skipping seed: VD_R_0000000001 (class 'others' not found)
Skipping seed: VD_R_0000000002 (class 'others' not found)
Skipping seed: VD_R_0000000003 (class 'others' not found)
Skipping seed: VD_R_0000000004 (class 'others' not found)
Skipping seed: VD_R_0000000005 (class 'others' not found)
Skipping seed: VD_R_0000000006 (class 'others' not found)
Skipping seed: VD_R_0000000007 (class 'others' not found)
Skipping seed: VD_R_0000000008 (class 'others' not found)
Skipping seed: VD_R_0000000009 (class 'others' not found)
Skipping seed: VD_R_0000000010 (class 'others' not found)
Skipping seed: VD_R_0000000011 (class 'others' not found)
Skipping seed: VD_R_0000000012 (class 'others' not found)
Skipping seed: VD_R_0000000013 (class 'others' not found)
Skipping seed: VD_R_0000000014 (class 'others' not found)
Skipping seed: VD_R_0000000015 (class 'others' not found)
Skipping seed: VD_R_0000000016 (class 'others' not found)
Skipping seed: VD_R_0000000017 (class 'others' not found)
Skipping seed:

KeyboardInterrupt: 

## True matches

In [47]:
# Report every stored occurrence whose recorded length (third field) is not 30,
# using 1-based seed and video numbers, then print how many were found.
expected_occurrence_len = 30
number_irregulars = 0
for seed_number, seed in enumerate(all_data_memory, start=1):
    for video_number, video in enumerate(seed, start=1):
        for occurrence in video:
            if occurrence[2] == expected_occurrence_len:
                continue
            print(f"Seed: {seed_number}, Video: {video_number}, Frame start: {occurrence[0]}, euclidean_dist: {occurrence[1]}, len occurrence: {occurrence[2]}")
            number_irregulars += 1
print("Number irregulars:", number_irregulars)

Seed: 5, Video: 3, Frame start: 325, euclidean_dist: 4.45922929946909, len occurrence: 16
Seed: 5, Video: 14, Frame start: 6700, euclidean_dist: 1.9169064294619393, len occurrence: 24
Seed: 7, Video: 57, Frame start: 2211, euclidean_dist: 2.101794128426182, len occurrence: 28
Seed: 20, Video: 8, Frame start: 355, euclidean_dist: 5.0711307549862354, len occurrence: 18
Seed: 20, Video: 14, Frame start: 6296, euclidean_dist: 5.808902011147796, len occurrence: 16
Seed: 21, Video: 20, Frame start: 3488, euclidean_dist: 4.334903618668159, len occurrence: 21
Seed: 21, Video: 34, Frame start: 4750, euclidean_dist: 3.7323127715735898, len occurrence: 12
Seed: 22, Video: 5, Frame start: 1819, euclidean_dist: 4.355284369910563, len occurrence: 19
Seed: 23, Video: 4, Frame start: 4858, euclidean_dist: 3.6295397083061522, len occurrence: 26
Seed: 23, Video: 4, Frame start: 356, euclidean_dist: 5.469529143955719, len occurrence: 26
Seed: 23, Video: 5, Frame start: 2435, euclidean_dist: 3.58814250223