# Datasets Analysis

In [None]:
%load_ext autoreload
%autoreload 2

from collections import Counter

import pandas as pd 
from project_paths import paths
from tqdm import tqdm 
import matplotlib.pyplot as plt 
plt.rcParams['figure.figsize'] = [10, 7]
plt.style.use('seaborn-v0_8')
import seaborn as sns
sns.set(style="darkgrid")
from ipywidgets import interact, IntSlider


from lane_detection_medium.datasets.file_datasets import DatasetMode
from lane_detection_medium.utils.fs import read_yolo_labels, read_image
from lane_detection_medium.utils.load import get_label_map
from lane_detection_medium.types.detection_types import ImageDetections
from lane_detection_medium.utils.viz import render_bbox


In [None]:
# Mapping used throughout the notebook; later cells iterate it as
# (label name -> label id) pairs when resolving detection ids to names.
label_map = get_label_map()
# Bare expression so the notebook displays the mapping.
label_map

## Data Loading

In [None]:
# Root directory of the dataset snapshot analysed here; later cells reuse it
# to locate individual image and label files.
data_dpath = paths.yolo_dpath / "data" / "2023_07_10"

columns = ["img_fname", "lbl_fname", "data_mode", "height", "width"]

# One row per image. Rows are gathered in a plain list and turned into a
# DataFrame once, after all dataset modes have been scanned.
metadata_rows = []
for data_mode in DatasetMode:
    mode_dpath = data_dpath / data_mode.name

    img_dpath = mode_dpath / "images"
    lbl_dpath = mode_dpath / "labels"

    images = list(img_dpath.glob("*.PNG"))
    labels = list(lbl_dpath.glob("*.txt"))

    # Images that have no label file with a matching stem.
    n_background = 0

    for img_fpath in tqdm(images, desc=f"{data_mode.name} processing"):
        # A label file is matched to its image by stem; images without one
        # are recorded with lbl_fname=None.
        lbl_fpath = lbl_dpath / f"{img_fpath.stem}.txt"
        if lbl_fpath.is_file():
            lbl_fname = lbl_fpath.name
        else:
            lbl_fname = None
            n_background += 1

        # The image is read only for its shape: the first two entries of
        # img.shape are stored in the "height" and "width" columns.
        img = read_image(str(img_fpath))

        metadata_rows.append(
            [img_fpath.name, lbl_fname, data_mode.name, *img.shape[:2]]
        )

    print(f"--- {data_mode.name.upper()} ---")
    print(f"\tThe number of images: {len(images)}")
    print(f"\tThe number of labels: {len(labels)}")
    # Counted per image rather than as len(images) - len(labels), so label
    # files without a matching image cannot skew the number.
    print(f"\tThe number of background images: {n_background}")

    # Occurrences of each class id across all label files of this mode. The
    # first column of every label row is taken as the class id. Starting from
    # an empty Counter means a mode without labels prints Counter(), not None.
    lbl_counter = Counter()
    for lbl_fpath in labels:
        class_ids = read_yolo_labels(lbl_fpath)[:, 0].astype(int)
        # tolist() yields plain Python ints, which keeps the printed Counter
        # free of NumPy scalar reprs.
        lbl_counter.update(class_ids.tolist())
    print(f"\t{lbl_counter}")

metadata = pd.DataFrame(metadata_rows, columns=columns)

In [None]:
# Per-class annotation counts for every dataset split. The numbers are
# hard-coded in this cell.
# NOTE(review): presumably copied from the Counter output printed by the
# data-loading cell - confirm they are still current.
label_info = pd.DataFrame(
    {
        "label": ["solid_white", "break_white"] * 3,
        "count": [18892, 6824, 5569, 1610, 5480, 1582],
        "mode": ["train", "train", "valid", "valid", "test", "test"],
    }
)

fig, ax = plt.subplots(figsize=(8, 4))

# Grouped bars: one group per class, one bar per split.
sns.barplot(data=label_info, x="label", y="count", hue="mode", ax=ax)

# Write the exact count above every bar.
for bars in ax.containers:
    ax.bar_label(bars)

ax.set_xlabel("Класс разметки")
ax.set_ylabel("Количество")

ax.legend(title="Датасет")
ax.set_title("Распределение экземпляров разметки")

ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Same per-class, per-split counts as a long-format table, plotted with
# English captions. The numbers are hard-coded in this cell.
# NOTE(review): presumably taken from the Counter output of the data-loading
# cell - confirm they are still current.
split_names = ["train", "valid", "test"]
class_names = ["solid_white", "break_white"]
class_counts = [18892, 6824, 5569, 1610, 5480, 1582]

label_info = pd.DataFrame(
    {
        "label": class_names * len(split_names),
        "count": class_counts,
        # Each split name repeated once per class: train, train, valid, ...
        "mode": [split for split in split_names for _ in class_names],
    }
)

fig, ax = plt.subplots(figsize=(8, 4))

sns.barplot(data=label_info, x="label", y="count", hue="mode", ax=ax)

# Annotate every bar with its value.
for bars in ax.containers:
    ax.bar_label(bars)

ax.set(
    xlabel="Annotation Label",
    ylabel="Count",
    title="The annotation classes distribution",
)
ax.legend(title="Dataset")

ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Add a textual "(height, width)" column so identical image sizes can be
# grouped and counted as a single category.
height_txt = metadata["height"].astype(str)
width_txt = metadata["width"].astype(str)
metadata["shape"] = "(" + height_txt + ", " + width_txt + ")"

## Shape 

In [None]:
# Absolute and relative frequency of every distinct image shape, shown side
# by side as a two-column table.
shape_series = metadata["shape"]
shape_cnts = shape_series.value_counts()
shape_prcs = shape_series.value_counts(normalize=True)

pd.concat([shape_cnts, shape_prcs], axis=1)

## Annotation Example

In [None]:
# Rows of the "train" mode that have a label file.
with_labels_df = metadata.loc[(metadata["lbl_fname"].notnull()) & (metadata["data_mode"] == "train")]

# Bounding-box colour per class name, handed to render_bbox.
color_map = {
    "solid_white": (255, 0, 0),
    "break_white": (0, 0, 255),
}

# Position within with_labels_df of the example to render.
index = 50
row = with_labels_df.iloc[index]

img_fpath = data_dpath / row["data_mode"] / "images" / row["img_fname"]
test_image = read_image(str(img_fpath))

lbl_fpath = data_dpath / row["data_mode"] / "labels" / row["lbl_fname"]
lbl_np = read_yolo_labels(lbl_fpath)
# The first two entries of the image shape are passed along with the labels.
gt_detections = ImageDetections.from_yolo_labels(lbl_np, *test_image.shape[:2])

# Work on a copy so test_image itself is left unmodified. render_bbox's return
# value is ignored below, so it presumably draws onto the array in place.
canva = test_image.copy()

plt.figure(figsize=(12, 12), frameon=False)

for det in gt_detections:
    # Resolve the class name for this detection: first label_map entry whose
    # id equals the detection's id.
    label_name = next(
        (name for name, label_id in label_map.items() if label_id == det.label_id),
        None,
    )
    if label_name is None:
        # Fail loudly instead of silently reusing the previous detection's
        # name (or hitting a NameError on the very first detection).
        raise KeyError(f"label id {det.label_id!r} is not present in label_map")

    render_bbox(
        canva,
        det.bbox,
        label=label_name,
        line_thickness=6,
        color=color_map[label_name],
        font_color=(255, 255, 255),
        font_size=1.5,
    )

plt.grid(False)

plt.imshow(canva)
plt.tight_layout()
# Saved relative to the current working directory.
plt.savefig("exp.jpg")

plt.show()

In [None]:
# Share of images per distinct "(height, width)" string; the same quantity
# is already computed as shape_prcs in the Shape section.
metadata["shape"].value_counts(normalize=True)

In [None]:
# Bar chart with the number of images for every distinct shape string.
# sns.countplot returns the Axes it drew on.
_fig = sns.countplot(data=metadata, x="shape")

In [None]:
# Preview the first rows of the per-image metadata table (file names, data
# mode, height, width and the derived "shape" column).
metadata.head()

In [None]:
# NOTE(review): scratch arithmetic - presumably the image counts of the three
# dataset modes summed into a total; confirm the source of these numbers.
10562+2136+2136