# Import dependencies

In [2]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import time

# Import library with current code functions
sys.path.append(os.path.join("..", "lib"))
import manual_labeler_functions as man_lab_fun, automatic_labeler_functions as aut_lab_fun, general_functions as gf, files_paths as fp

## Getting the list of measure files to be labeled and the list of available seeds

In [6]:
# Look up the measure files under the fp.DATASET_YT / fp.DATASET_LOCAL roots
# and the labeled files under the fp.DATASET_SEED root.
# NOTE(review): presumably returns a list of file paths - confirm in general_functions.
FILE_LIST_VD_MEASURE = gf.find_files_in_all_subdirectories(
    [fp.DATASET_YT, fp.DATASET_LOCAL],
    fp.VD_MEASURE_L0,
)
FILE_LIST_LABELED_SEED = gf.find_files_in_all_subdirectories(
    [fp.DATASET_SEED],
    fp.VD_LABELED_L0,
)

# Inclusive bounds for the number at the end of each parent folder name.
start_range = 71
end_range = 100

# Keep only the measure files whose parent folder name, after its last "_",
# parses to an integer inside [start_range, end_range].
filtered_vd_measure_files = list(
    filter(
        lambda measure_path: (
            start_range
            <= int(os.path.basename(os.path.dirname(measure_path)).split('_')[-1])
            <= end_range
        ),
        FILE_LIST_VD_MEASURE,
    )
)

# Notebook display of the filtered list.
filtered_vd_measure_files

['..\\Dataset\\DD-Local\\VD_D_0000000071\\VD_MEASURE_L0.CSV',
 '..\\Dataset\\DD-Local\\VD_D_0000000072\\VD_MEASURE_L0.CSV',
 '..\\Dataset\\DD-Local\\VD_D_0000000073\\VD_MEASURE_L0.CSV',
 '..\\Dataset\\DD-Local\\VD_D_0000000074\\VD_MEASURE_L0.CSV',
 '..\\Dataset\\DD-Local\\VD_D_0000000075\\VD_MEASURE_L0.CSV',
 '..\\Dataset\\DD-Local\\VD_D_0000000076\\VD_MEASURE_L0.CSV',
 '..\\Dataset\\DD-Local\\VD_D_0000000077\\VD_MEASURE_L0.CSV',
 '..\\Dataset\\DD-Local\\VD_D_0000000078\\VD_MEASURE_L0.CSV',
 '..\\Dataset\\DD-Local\\VD_D_0000000079\\VD_MEASURE_L0.CSV',
 '..\\Dataset\\DD-Local\\VD_D_0000000080\\VD_MEASURE_L0.CSV',
 '..\\Dataset\\DD-Local\\VD_D_0000000081\\VD_MEASURE_L0.CSV',
 '..\\Dataset\\DD-Local\\VD_D_0000000082\\VD_MEASURE_L0.CSV',
 '..\\Dataset\\DD-Local\\VD_D_0000000083\\VD_MEASURE_L0.CSV',
 '..\\Dataset\\DD-Local\\VD_D_0000000084\\VD_MEASURE_L0.CSV',
 '..\\Dataset\\DD-Local\\VD_D_0000000085\\VD_MEASURE_L0.CSV',
 '..\\Dataset\\DD-Local\\VD_D_0000000086\\VD_MEASURE_L0.CSV',
 '..\\Da

## Remove all VD_LABELED_L0 files

In [5]:
def remove_file(FILE_LIST_PATH, file_name):
    """Delete the sibling file *file_name* next to every path in FILE_LIST_PATH.

    For each entry, the target is ``file_name`` inside the entry's own
    directory. The target is removed when it exists; either way one status
    line is printed per entry.

    Args:
        FILE_LIST_PATH: Iterable of file paths; only their directory is used.
        file_name: Name of the file to delete inside each of those directories.
    """
    for reference_path in FILE_LIST_PATH:
        target = os.path.join(os.path.dirname(reference_path), file_name)

        # Nothing to delete for this entry: report it and move on.
        if not os.path.exists(target):
            print("File not found.")
            continue

        os.remove(target)
        print(f"The file was removed: {target}")

In [None]:
# WARNING: destructive. Deletes the fp.VD_LABELED_L0 file from the folder of
# every path in FILE_LIST_VD_MEASURE (the full list, not the filtered one).
remove_file(FILE_LIST_VD_MEASURE, fp.VD_LABELED_L0)

## Select reference SEED

In [7]:
# Look up the fp.VD_INFO files under the fp.DATASET_SEED root.
# NOTE(review): presumably a list of file paths - confirm in general_functions.
FILE_LIST_SEED_VD_INFO = gf.find_files_in_all_subdirectories([fp.DATASET_SEED], fp.VD_INFO)

## Check the number of frames and instances of each class

In [16]:
import os
import re
import pandas as pd

def _run_lengths(mask):
    """Return the length of each consecutive run of truthy values in *mask*."""
    lengths = []
    current = 0
    for flag in mask:
        if flag:
            current += 1
        elif current:
            lengths.append(current)
            current = 0
    # A run that reaches the end of the sequence is still an instance.
    if current:
        lengths.append(current)
    return lengths


def count_class_totals_and_instances(FILE_LIST_PATH, file_name, column_name, conditions, length_threshold=30):
    """Print frame and instance statistics for each class label.

    For every path in FILE_LIST_PATH, the CSV named *file_name* in the same
    directory is read. For each condition, rows whose *column_name* value
    contains the condition (case-insensitive) are treated as frames of that
    class, and each consecutive run of such rows is one instance.

    Every instance falls into at most one bucket, tested in this order:

    * ``length < length_threshold``: "short";
    * ``29 <= length <= 32``: "between 29 and 32";
    * ``length > 32``: "greater than 32".

    Because "short" is tested first, with the default threshold of 30 a run
    of 29 frames counts as short. An instance matching none of the three
    tests (possible only when ``length_threshold < 29``) is left out of the
    buckets but still counts towards the totals and the average length.

    Args:
        FILE_LIST_PATH: Iterable of file paths; only their directory is used.
        file_name: Name of the labeled CSV to read inside each directory.
        column_name: Column holding the label text. Files without this column
            are skipped silently.
        conditions: Class names to look for. They are handed to
            ``Series.str.contains`` and are therefore interpreted as regular
            expressions.
        length_threshold: Instances shorter than this count as "short".

    Returns:
        None. All results are printed.
    """
    mid_low, mid_high = 29, 32  # bounds quoted verbatim in the printed report
    folder_cutoff = 70          # paths are listed only for VD_D_<n> with n above this

    class_totals = {condition: 0 for condition in conditions}
    class_instances = {condition: 0 for condition in conditions}
    short_instances = {condition: 0 for condition in conditions}
    instances_between_29_32 = {condition: 0 for condition in conditions}
    instances_greater_32 = {condition: 0 for condition in conditions}
    instance_lengths = {condition: [] for condition in conditions}
    results_per_folder = {condition: [] for condition in conditions}
    paths_greater_32 = {condition: set() for condition in conditions}
    paths_greater_70 = {condition: set() for condition in conditions}

    for current_path in FILE_LIST_PATH:
        folder_path = os.path.dirname(current_path)
        vd_labeled_path = os.path.join(folder_path, file_name)

        if not os.path.exists(vd_labeled_path):
            print(f"Arquivo não encontrado: {vd_labeled_path}")
            continue

        # Best effort per file: one unreadable or malformed CSV is reported
        # and must not abort the whole survey.
        try:
            df = pd.read_csv(vd_labeled_path)
            if column_name not in df.columns:
                continue

            folder_name = os.path.basename(folder_path)
            match = re.search(r'VD_D_(\d+)', folder_name)
            folder_number = int(match.group(1)) if match else -1

            for condition in conditions:
                mask = df[column_name].str.contains(condition, case=False, na=False)
                total_frames = int(mask.sum())
                lengths = _run_lengths(mask)
                total_instances = len(lengths)

                short_instance_count = 0
                between_29_32_count = 0
                greater_32_count = 0
                for length in lengths:
                    if length < length_threshold:
                        short_instance_count += 1
                    elif mid_low <= length <= mid_high:
                        between_29_32_count += 1
                    elif length > mid_high:
                        # Explicit test: a bare `else` would also swallow
                        # lengths in [length_threshold, 29) and report them
                        # as "> 32" whenever length_threshold < 29.
                        greater_32_count += 1

                if greater_32_count:
                    paths_greater_32[condition].add(vd_labeled_path)
                    if folder_number > folder_cutoff:
                        paths_greater_70[condition].add(vd_labeled_path)

                instance_lengths[condition].extend(lengths)
                results_per_folder[condition].append(
                    (folder_name, total_frames, total_instances, short_instance_count, between_29_32_count, greater_32_count)
                )
                class_totals[condition] += total_frames
                class_instances[condition] += total_instances
                short_instances[condition] += short_instance_count
                instances_between_29_32[condition] += between_29_32_count
                instances_greater_32[condition] += greater_32_count

        except Exception as e:
            print(f"Erro ao processar o arquivo {vd_labeled_path}: {e}")

    # Print the results.
    for condition in conditions:
        print(f"\nResultados para '{condition}':")
        for folder_name, total_frames, total_instances, short_count, between_29_32_count, greater_32_count in results_per_folder[condition]:
            print(
                f"{folder_name} - {total_frames} frames, {total_instances} instâncias, "
                f"{short_count} instâncias < {length_threshold}, "
                f"{between_29_32_count} instâncias entre 29 e 32, "
                f"{greater_32_count} instâncias > 32"
            )

        total_inst = class_instances[condition]
        short_perc = (short_instances[condition] / total_inst * 100) if total_inst else 0
        between_29_32_perc = (instances_between_29_32[condition] / total_inst * 100) if total_inst else 0
        greater_32_perc = (instances_greater_32[condition] / total_inst * 100) if total_inst else 0
        avg_length = sum(instance_lengths[condition]) / len(instance_lengths[condition]) if instance_lengths[condition] else 0

        print(
            f"Total geral: {class_totals[condition]} frames, {total_inst} instâncias\n"
            f"{short_perc:.2f}% das instâncias são < {length_threshold}\n"
            f"{between_29_32_perc:.2f}% das instâncias estão entre 29 e 32\n"
            f"{greater_32_perc:.2f}% das instâncias são > 32\n"
            f"Média do tamanho das instâncias: {avg_length:.2f}"
        )

        # Print the paths of files holding instances longer than 32 frames
        # whose folder number is above the cutoff.
        total_greater_32 = len(paths_greater_32[condition])
        total_greater_70 = len(paths_greater_70[condition])

        if total_greater_32 > 0:
            print(f"\nDe todos os {total_greater_32} vídeos com instâncias > 32 para '{condition}',")
            print(f"{total_greater_70} vídeos possuem VD_D_XX > 70.")

            print(f"\nArquivos contendo instâncias > 32 para '{condition}' (somente se VD_D_XX > 70):")
            # Sorted so the listing is reproducible between runs.
            for path in sorted(paths_greater_70[condition]):
                print(path)

# Path setup: rebuild the full list of measure files.
FILE_LIST_VD_MEASURE = gf.find_files_in_all_subdirectories(
    [fp.DATASET_YT, fp.DATASET_LOCAL],
    fp.VD_MEASURE_L0,
)

# Run the survey over the labeled file that sits next to each measure file,
# using the default length threshold.
count_class_totals_and_instances(
    FILE_LIST_VD_MEASURE,
    fp.VD_LABELED_L0,
    "label_measures",
    ["OTHERS", "HAPPY", "NEUTRAL"],
)



Resultados para 'OTHERS':
VD_D_0000000043 - 226 frames, 7 instâncias, 0 instâncias < 30, 6 instâncias entre 29 e 32, 1 instâncias > 32
VD_D_0000000044 - 407 frames, 12 instâncias, 1 instâncias < 30, 8 instâncias entre 29 e 32, 3 instâncias > 32
VD_D_0000000045 - 343 frames, 11 instâncias, 2 instâncias < 30, 6 instâncias entre 29 e 32, 3 instâncias > 32
VD_D_0000000046 - 1017 frames, 33 instâncias, 8 instâncias < 30, 20 instâncias entre 29 e 32, 5 instâncias > 32
VD_D_0000000047 - 227 frames, 8 instâncias, 2 instâncias < 30, 6 instâncias entre 29 e 32, 0 instâncias > 32
VD_D_0000000048 - 696 frames, 22 instâncias, 2 instâncias < 30, 18 instâncias entre 29 e 32, 2 instâncias > 32
VD_D_0000000049 - 446 frames, 14 instâncias, 1 instâncias < 30, 10 instâncias entre 29 e 32, 3 instâncias > 32
VD_D_0000000050 - 449 frames, 14 instâncias, 2 instâncias < 30, 7 instâncias entre 29 e 32, 5 instâncias > 32
VD_D_0000000051 - 1289 frames, 43 instâncias, 9 instâncias < 30, 30 instâncias entre 29 e 3

In [None]:
# Count, over every seed VD_INFO file, how many 'link_video' entries contain
# each emotion keyword (case-sensitive substring match).
count_happy = count_neutral = count_others = 0

for vd_info_file in FILE_LIST_SEED_VD_INFO:
    vd_info_df = pd.read_csv(vd_info_file)

    # Test each entry on its own. Testing str(vd_info_df['link_video']) would
    # check the printed form of the whole column, so every row would be
    # counted as soon as any single row contained the keyword.
    count_happy += sum('happy' in str(link) for link in vd_info_df['link_video'])
    count_neutral += sum('neutral' in str(link) for link in vd_info_df['link_video'])
    count_others += sum('others' in str(link) for link in vd_info_df['link_video'])

print("Number of happy seeds:", count_happy)
print("Number of neutral seeds:", count_neutral)
print("Number of others seeds:", count_others)

In [None]:
# Notebook display of the labeled seed files gathered earlier.
FILE_LIST_LABELED_SEED

## Automatic Labeler

In [8]:
# Automatic labeling run: for every labeled seed, search each filtered measure
# file for occurrences of one of the seed's classes and record, per seed and
# per video, the [frame, distance, length] of every match returned.

# Index into the seed's class list (see GET_ALL_CLASSES below); change it to
# the number that corresponds to your labeling class.
current_labeling_class = 0 # Change for the correspondent number of your labeling class
# Thresholds forwarded to label_current_series as frame_threshold / distance_threshold.
frame_distance_threshold = 2
euclidean_distance_threshold = 1.2
# Running sum of RESUME_DT['final'] over all seeds processed so far.
current_total_saved_series = 0

# NOTE(review): the five lists below are never written to in the visible code.
matches_memory = []
all_matches_memory = []
all_mass_memory = []
all_idxs_match_frame_seq_memory = []
all_seeds_occurrences_len = []

# all_data_memory[seed][video] -> list of [frame, dist, occurrence_len].
all_data_memory = []

for j, labeled_file in enumerate(FILE_LIST_LABELED_SEED[:]):
    # Fresh summary frame per seed; it is threaded through every
    # label_current_series call for this seed.
    RESUME_DT = pd.DataFrame()
    print(f'\nSearch with seed: {os.path.basename(os.path.dirname(labeled_file))}\n')

    # NOTE(review): these three lists are not used in the visible code.
    current_seed_matches_memory = []
    current_seed_all_matches_memory = []
    current_seed_all_mass_memory = []

    # One inner list per target video, in filtered_vd_measure_files order.
    current_seed_data_memory = []
    
    for i, current_path_location in enumerate(filtered_vd_measure_files):

        # NOTE(review): everything from here down to dict_label_parameters
        # depends only on labeled_file, yet it is recomputed (and the seed CSV
        # re-read) for every target video. Hoisting is safe only if
        # label_current_series does not mutate its inputs - confirm.
        path_dir = os.path.dirname(labeled_file)
        vd_labeled_path = os.path.join(path_dir, fp.VD_LABELED_L0)
        vd_labeled = pd.read_csv(vd_labeled_path)
        # Raises KeyError if the CSV has no 'Unnamed: 0' column.
        vd_labeled.drop(columns=['Unnamed: 0'], inplace=True)

        all_class = man_lab_fun.GET_ALL_CLASSES(vd_labeled)
        label_name = all_class[current_labeling_class]

        reference_measures = man_lab_fun.GET_MEASURES_FROM_CLASS(vd_labeled, label_name)

        frames = man_lab_fun.GET_FRAMES_FROM_CLASS(vd_labeled, label_name)

        # NOTE(review): .loc slicing includes both endpoints, so this selects
        # index labels 0..len(frames), i.e. len(frames) + 1 rows, starting at
        # the top of the file rather than at the class's own frames - confirm
        # this is the intended seed window.
        all_measures_in_frame_interval = vd_labeled.loc[0:len(frames)]

        # Keep only the columns named by reference_measures.
        selected_measures_in_frame_interval = all_measures_in_frame_interval[reference_measures]

        dict_label_parameters = {'label_name': label_name, 'reference_measures': reference_measures}

        RESUME_DT, matches, all_matches, all_mass, idxs_match_frame_seq, occurrences_len = aut_lab_fun.label_current_series(current_path_location, RESUME_DT, selected_measures_in_frame_interval, dict_label_parameters, os.path.dirname(labeled_file), LABELED_FILE_NAME=fp.VD_LABELED_L0, distance_threshold=euclidean_distance_threshold, frame_threshold=frame_distance_threshold)  

        # Slot i of current_seed_data_memory belongs to the current video.
        current_seed_data_memory.append([])

        # Pair every (frame, dist) match with the occurrence length at the same position.
        for k, (frame, dist) in enumerate(idxs_match_frame_seq):
            frame_dist_len_data = []
            frame_dist_len_data.append(frame)
            frame_dist_len_data.append(dist)
            frame_dist_len_data.append(occurrences_len[k])

            current_seed_data_memory[i].append(frame_dist_len_data)

    all_data_memory.append(current_seed_data_memory)

    # NOTE(review): the 'final' column is presumably filled in by
    # label_current_series; if filtered_vd_measure_files is empty, RESUME_DT is
    # still the empty frame created above and this lookup raises KeyError.
    final_sum = RESUME_DT['final'].sum()
    current_total_saved_series += final_sum
    print(f'Number of occurrences found for the current seed: {final_sum}') 
    print(f'Total of occurrences: {current_total_saved_series}') 


Search with seed: VD_R_0000000001

Number of occurrences found for the current seed: 27
Total of occurrences: 27

Search with seed: VD_R_0000000002

Number of occurrences found for the current seed: 8
Total of occurrences: 35

Search with seed: VD_R_0000000003

Number of occurrences found for the current seed: 6
Total of occurrences: 41

Search with seed: VD_R_0000000004

Number of occurrences found for the current seed: 25
Total of occurrences: 66

Search with seed: VD_R_0000000005

Number of occurrences found for the current seed: 42
Total of occurrences: 108

Search with seed: VD_R_0000000006

Number of occurrences found for the current seed: 42
Total of occurrences: 150

Search with seed: VD_R_0000000007

Number of occurrences found for the current seed: 42
Total of occurrences: 192

Search with seed: VD_R_0000000008

Number of occurrences found for the current seed: 34
Total of occurrences: 226

Search with seed: VD_R_0000000009

Number of occurrences found for the current seed: 

In [None]:
# Automatic labeling run restricted to the "others" class: seeds that do not
# contain that class are skipped, and the search covers every file in
# FILE_LIST_VD_MEASURE (not the filtered subset).

current_labeling_class_others = 0  # This value is set dynamically below, from the position of "others" in the seed's classes
# Thresholds forwarded to label_current_series as frame_threshold / distance_threshold.
frame_distance_threshold_others = 2
euclidean_distance_threshold_others = 1.2
# Running sum of RESUME_DT_others['final'] over all seeds processed so far.
current_total_saved_series_others = 0

# NOTE(review): the five lists below are never written to in the visible code.
matches_memory_others = []
all_matches_memory_others = []
all_mass_memory_others = []
all_idxs_match_frame_seq_memory_others = []
all_seeds_occurrences_len_others = []

# all_data_memory_others[seed][video] -> list of [frame, dist, occurrence_len];
# skipped seeds contribute no entry.
all_data_memory_others = []

for j_others, labeled_file_others in enumerate(FILE_LIST_LABELED_SEED[:]):
    # Fresh summary frame per seed; it is threaded through every
    # label_current_series call for this seed.
    RESUME_DT_others = pd.DataFrame()

    # Load the labeled DataFrame
    path_dir_others = os.path.dirname(labeled_file_others)
    vd_labeled_path_others = os.path.join(path_dir_others, fp.VD_LABELED_L0)
    vd_labeled_others = pd.read_csv(vd_labeled_path_others)
    # Raises KeyError if the CSV has no 'Unnamed: 0' column.
    vd_labeled_others.drop(columns=['Unnamed: 0'], inplace=True)

    # Get all classes available in the seed
    all_class_others = man_lab_fun.GET_ALL_CLASSES(vd_labeled_others)

    # Check whether the "others" class is present (exact, case-sensitive match)
    if "others" not in all_class_others:
        print(f"Skipping seed: {os.path.basename(os.path.dirname(labeled_file_others))} (class 'others' not found)")
        continue

    # Point the class index at "others"
    current_labeling_class_others = all_class_others.index("others")
    label_name_others = all_class_others[current_labeling_class_others]

    print(f'\nProcessing seed: {os.path.basename(os.path.dirname(labeled_file_others))} with class "others"\n')

    # NOTE(review): the first three lists are not used in the visible code.
    current_seed_matches_memory_others = []
    current_seed_all_matches_memory_others = []
    current_seed_all_mass_memory_others = []
    # One inner list per target video, in FILE_LIST_VD_MEASURE order.
    current_seed_data_memory_others = []

    for i_others, current_path_location_others in enumerate(FILE_LIST_VD_MEASURE):
        # Get the measures and frames associated with the "others" class
        # NOTE(review): these depend only on the seed, not on the current
        # video, yet they are recomputed on every iteration.
        reference_measures_others = man_lab_fun.GET_MEASURES_FROM_CLASS(vd_labeled_others, label_name_others)
        frames_others = man_lab_fun.GET_FRAMES_FROM_CLASS(vd_labeled_others, label_name_others)

        # Select the frames of interest and the corresponding measures
        # NOTE(review): .loc slicing includes both endpoints, so this selects
        # index labels 0..len(frames_others), i.e. len(frames_others) + 1 rows,
        # starting at the top of the file - confirm this is the intended window.
        all_measures_in_frame_interval_others = vd_labeled_others.loc[0:len(frames_others)]
        selected_measures_in_frame_interval_others = all_measures_in_frame_interval_others[reference_measures_others]

        dict_label_parameters_others = {'label_name': label_name_others, 'reference_measures': reference_measures_others}

        # Process the current series
        RESUME_DT_others, matches_others, all_matches_others, all_mass_others, idxs_match_frame_seq_others, occurrences_len_others = aut_lab_fun.label_current_series(
            current_path_location_others, RESUME_DT_others, selected_measures_in_frame_interval_others, dict_label_parameters_others, 
            os.path.dirname(labeled_file_others), LABELED_FILE_NAME="VD_LABELED_L0_OTHERS",  # Adjusted file name (hard-coded literal, not a files_paths constant)
            distance_threshold=euclidean_distance_threshold_others, frame_threshold=frame_distance_threshold_others
        )

        # Store the results; slot i_others belongs to the current video
        current_seed_data_memory_others.append([])

        # Pair every (frame, dist) match with the occurrence length at the same position.
        for k_others, (frame_others, dist_others) in enumerate(idxs_match_frame_seq_others):
            frame_dist_len_data_others = []
            frame_dist_len_data_others.append(frame_others)
            frame_dist_len_data_others.append(dist_others)
            frame_dist_len_data_others.append(occurrences_len_others[k_others])

            current_seed_data_memory_others[i_others].append(frame_dist_len_data_others)

    all_data_memory_others.append(current_seed_data_memory_others)

    # NOTE(review): the 'final' column is presumably filled in by
    # label_current_series; if FILE_LIST_VD_MEASURE is empty, RESUME_DT_others
    # is still the empty frame created above and this lookup raises KeyError.
    final_sum_others = RESUME_DT_others['final'].sum()
    current_total_saved_series_others += final_sum_others
    print(f'Number of occurrences found for the current seed: {final_sum_others}')
    print(f'Total of occurrences: {current_total_saved_series_others}')


## True matches

In [None]:
# Walk all_data_memory (seed -> video -> occurrence) and report every
# occurrence whose stored length (element 2) is not 30, numbering seeds and
# videos from 1 in the printed line.
number_irregulars = 0
for i, seed in enumerate(all_data_memory):
    for j, video in enumerate(seed):
        for occurrence in video:
            # Occurrences of exactly 30 are regular: nothing to report.
            if occurrence[2] == 30:
                continue
            number_irregulars += 1
            print(f"Seed: {i+1}, Video: {j+1}, Frame start: {occurrence[0]}, euclidean_dist: {occurrence[1]}, len occurrence: {occurrence[2]}")
print("Number irregulars:", number_irregulars)