# Data Analysis

## Development environment setup

In [1]:
%%sx
#@title Colab development environment settings
# Only acts when COLAB_RELEASE_TAG is non-empty (expected to be the case on
# Colab): clones the project, upgrades pip, installs the requirements and
# finally kills the parent process of this shell.
if [ -n "${COLAB_RELEASE_TAG}" ]; then
    git clone -b main https://github.com/MigeoDaSelva/MachineLApplVioDeSurVideos.git;
    python3 -m pip install --upgrade pip;
    python3 -m pip install -r /content/MachineLApplVioDeSurVideos/requirements.txt;
    # NOTE(review): presumably this forces a runtime restart so the freshly
    # installed packages are picked up -- confirm. Only the parent PID is
    # passed: the former extra literal argument `pid` is not a process ID and,
    # depending on the shell, produced an error or aborted the command before
    # the parent was signalled.
    kill -9 "${PPID}";
fi

[]

In [2]:
#@title Imports libraries
from random import randint, sample
from statistics import mean
from pprint import pprint
from pathlib import Path
import sys
import os

In [3]:
#@title System path manager
%reload_ext autoreload
%autoreload 2

project_name = "MachineLApplVioDeSurVideos"

working_directory = os.popen("echo $PWD").read().rstrip()

project_root_path = working_directory \
    if project_name in working_directory \
        else os.popen(
            f"readlink -f $(find -name {project_name} \
                -not -path '*/drive/*')"
        ).read().rstrip()

if str(project_root_path) not in sys.path:
    sys.path.insert(0, str(project_root_path))

pprint(f"Environment paths: {sys.path}")

("Environment paths: ['/workspaces/MachineLApplVioDeSurVideos', "
 "'/usr/lib/python311.zip', '/usr/lib/python3.11', "
 "'/usr/lib/python3.11/lib-dynload', '', "
 "'/usr/local/lib/python3.11/dist-packages', '/usr/lib/python3/dist-packages']")


In [4]:
#@title Imports project stuff
from src.data_handler.strategies.video_creator import OpenCVVideoCreator, DecordVideoCreator
from src.data_handler.strategies.class_names_finder import UniqueClassNamesFinder
from src.data_handler.strategies.file_path_finder import (
    RecursiveFilePathFinder,
    FilePathFinderByLoad
    )
from src.data_handler.data_splitter import DataSplitter
from configs import settings

In [5]:
#@title Colab resource settings
# The google.colab package is imported only behind the COLAB_ENV flag, so the
# notebook does not require it when that flag is falsy.
if settings.COLAB_ENV:
    from google.colab import drive, output

    output.enable_custom_widget_manager()
    drive.mount("/content/drive", force_remount=True)

## Dataset attribute analysis

In [6]:
file_extensions = ["avi", "mp4"]

file_path_finder = RecursiveFilePathFinder(file_extensions=file_extensions)
class_finder = UniqueClassNamesFinder()
# The OpenCV-based creator is used for the analysis below; DecordVideoCreator()
# is the other creator imported by this notebook.
video_creator = OpenCVVideoCreator()

# Directory of the configured dataset, built from the project settings.
dataset_directory = Path(
    os.path.join(settings.DATASETS_PATH, f"{settings.DATASET_NAME}/")
)

file_paths = file_path_finder.finds(dataset_directory)
total_of_videos = len(file_paths)
video_extensions = {path.suffix for path in file_paths}
video_classes = class_finder.finds(file_paths)

# A video is attributed to a class when its parent directory carries the
# class name.
video_per_class = {
    label: sum(1 for path in file_paths if path.parent.name == label)
    for label in video_classes
}

summary_lines = [
    f"Total of videos: {total_of_videos}",
    f"Extentions: {', '.join(video_extensions)}",
    f"Video classes: {', '.join(video_classes)}",
    f"Total of classes: {len(video_classes)}",
]
print(*summary_lines, sep="\n")

pprint({"Total of videos per class": video_per_class})

def gets_length(path: Path) -> int:
    """Open ``path`` with the notebook-level ``video_creator`` and return its total length.

    The creator is looked up when the function is called, so rebinding
    ``video_creator`` in another cell changes which creator is used here.

    Args:
        path: Location of the video file to open.

    Returns:
        The value of ``video_creator.gets_total_length()`` after opening
        ``path`` (used by this notebook as the number of frames).
    """
    video_creator.opens(path)
    total_length = video_creator.gets_total_length()
    return total_length

# Probe every video exactly once. The per-class averages further down reuse
# these values instead of opening each file a second time.
frames_per_video = [gets_length(path) for path in file_paths]

# NOTE: despite the "frame rate" wording, these are statistics over the
# per-video lengths returned by gets_length (mean / max / min).
overall_frame_rate = round(mean(frames_per_video))
higher_amount_of_frames = max(frames_per_video)
lowest_amount_of_frames = min(frames_per_video)

# Mean length per class; a video belongs to the class named by its parent
# directory. zip keeps each path paired with the length measured above.
overall_frame_rate_per_class = {
    label: round(
        mean(
            frame_count
            for path, frame_count in zip(file_paths, frames_per_video)
            if path.parent.name == label
        )
    )
    for label in video_classes
}

print(
    f"Overall frame rate: {overall_frame_rate}",
    ("Higher and lowest amount of frames: "
        f"{higher_amount_of_frames}, "
        f"{lowest_amount_of_frames}"),
    sep="\n"
)
pprint(
    {"Overall frame rate per class": overall_frame_rate_per_class},
)

Total of videos: 1000
Extentions: .mp4
Video classes: Abuse, Arrest, Arson, Assault, Burglary, Explosion, Fighting, Normal, RoadAccidents, Robbery, Shooting, Shoplifting, Stealing, Vandalism
Total of classes: 14
{'Total of videos per class': {'Abuse': 50,
                               'Arrest': 50,
                               'Arson': 50,
                               'Assault': 50,
                               'Burglary': 100,
                               'Explosion': 50,
                               'Fighting': 50,
                               'Normal': 50,
                               'RoadAccidents': 150,
                               'Robbery': 150,
                               'Shooting': 50,
                               'Shoplifting': 50,
                               'Stealing': 100,
                               'Vandalism': 50}}
Overall frame rate: 3791
Higher and lowest amount of frames: 141900, 104
{'Overall frame rate per class': {'Abuse': 3870,
     

## Single random sample analysis

In [7]:
# Pick one video uniformly at random from the paths currently in file_paths.
index = randint(0, len(file_paths) - 1)
selected_path = file_paths[index]

# Decord-based creator with required_length=60;
# OpenCVVideoCreator(required_length=60) is the alternative.
# Note that this rebinds the notebook-level `video_creator`.
video_creator = DecordVideoCreator(required_length=60)

video = video_creator.creates(selected_path)

print(video)

Name: Burglary008_x264.mp4
Length: 60
Label: Burglary
Path: /workspaces/MachineLApplVioDeSurVideos/src/assets/violence_detection_datasets/UCF-Crime/Burglary/Burglary008_x264.mp4
Array Shape: (60, 240, 320, 3)


## Cross validation analysis

In [None]:
#@title Selects the top ten
# Rank the classes by their number of videos, largest first, then keep the
# names of the first ten in alphabetical order.
ranked_class_counts = sorted(
    video_per_class.items(), key=lambda item: item[1], reverse=True
)
sorted_classes = dict(ranked_class_counts)
top_10_classes = sorted(label for label, _ in ranked_class_counts[:10])

# The dataset directory is scanned again so the selection starts from the
# finder's full result rather than from whatever file_paths currently holds.
file_paths = file_path_finder.finds(Path(os.path.join(settings.DATASETS_PATH, f"{settings.DATASET_NAME}/")))

print(f"Total of videos before selecting the top 10 classes with the largest sample: {len(file_paths)}\n")

# Keep only the videos whose path matches "*/<label>/*" for a selected class,
# grouped class by class in the order of top_10_classes.
dataset_paths = [
    path
    for label in top_10_classes
    for path in file_paths
    if path.match(f"*/{label}/*")
]

# From here on the notebook works on the reduced selection.
file_paths = dataset_paths
video_classes = top_10_classes

pprint({"Top 10 classes": top_10_classes})
print(f"\nTotal of videos after selection: {len(dataset_paths)}")

In [8]:
#@title Class balancing

sample_size = 50

# Draw `sample_size` videos without replacement from every class so that all
# classes contribute the same number of files. random.sample raises a
# ValueError when a class holds fewer than `sample_size` videos.
new_file_paths = []
for label in video_classes:
    class_paths = [path for path in file_paths if path.match(f"*/{label}/*")]
    new_file_paths += sample(class_paths, sample_size)

# file_paths now refers to the balanced selection.
file_paths = new_file_paths

In [9]:
#@title Splitation

# Parameters handed to DataSplitter, gathered in one place for readability.
split_settings = {
    "k_folds": 5,
    "n_iterations": 10,
    "train_size": 0.8,
    "validation_size": 0.3,
}

data_splitter = DataSplitter(file_paths, **split_settings)

# NOTE(review): presumably this writes the per-iteration/fold pickle files
# that the next cell loads from settings.CROSS_VALIDATION_PATH -- confirm.
data_splitter.splits()

In [10]:
def _loads_subset_paths(iteration: int, fold: int, subset: str):
    """Load the file paths stored for one subset of a given iteration and fold.

    Args:
        iteration: Index of the cross-validation iteration.
        fold: Index of the fold inside that iteration.
        subset: Subset name used in the file name ("train", "test" or
            "validation" in this notebook).

    Returns:
        Whatever ``FilePathFinderByLoad().finds`` returns for
        ``<CROSS_VALIDATION_PATH>/<iteration>_<fold>_<subset>.pickle``.
    """
    return FilePathFinderByLoad().finds(
        Path(f'{settings.CROSS_VALIDATION_PATH}/{iteration}_{fold}_{subset}.pickle')
    )


def _counts_videos_per_class(paths, labels) -> dict:
    """Count, for every label, the paths whose parent directory has that name.

    Args:
        paths: Path objects of the videos to count.
        labels: Class names to report; they become the keys of the result.

    Returns:
        A dict mapping each label to its number of matching paths.
    """
    return {
        label: sum(1 for path in paths if path.parent.name == label)
        for label in labels
    }


# Inspect one randomly chosen (iteration, fold) combination of the split.
iteration = randint(0, data_splitter.n_iterations-1)
fold = randint(0, data_splitter.k_folds-1)

train_file_paths = _loads_subset_paths(iteration, fold, "train")
test_file_paths = _loads_subset_paths(iteration, fold, "test")
validation_file_paths = _loads_subset_paths(iteration, fold, "validation")

train_classes = UniqueClassNamesFinder().finds(train_file_paths)
test_classes = UniqueClassNamesFinder().finds(test_file_paths)
validation_classes = UniqueClassNamesFinder().finds(validation_file_paths)

video_per_class_train = _counts_videos_per_class(train_file_paths, train_classes)
video_per_class_test = _counts_videos_per_class(test_file_paths, test_classes)
video_per_class_validation = _counts_videos_per_class(
    validation_file_paths, validation_classes
)

print(
    f"Random observation\nIteration: {iteration}\nFold: {fold}\n",
    f"Train size: {len(train_file_paths)}",
    f"Test size: {len(test_file_paths)}",
    f"Validation size: {len(validation_file_paths)}",
    f"Total: {len(train_file_paths)+len(test_file_paths)+len(validation_file_paths)}\n",
    sep="\n"
)

pprint(
    {"Videos per class in train set": video_per_class_train,
    "Videos per class in test set": video_per_class_test,
    "Videos per class in validation set": video_per_class_validation}
)

Random observation
Iteration: 7
Fold: 4

Train size: 392
Test size: 140
Validation size: 168
Total: 700

{'Videos per class in test set': {'Abuse': 10,
                                  'Arrest': 10,
                                  'Arson': 10,
                                  'Assault': 10,
                                  'Burglary': 10,
                                  'Explosion': 10,
                                  'Fighting': 10,
                                  'Normal': 10,
                                  'RoadAccidents': 10,
                                  'Robbery': 10,
                                  'Shooting': 10,
                                  'Shoplifting': 10,
                                  'Stealing': 10,
                                  'Vandalism': 10},
 'Videos per class in train set': {'Abuse': 28,
                                   'Arrest': 28,
                                   'Arson': 28,
                                   'Assault': 28,
 