## Introduction
Whether you're working on image classification, object detection, or semantic segmentation, DataGradients helps you gain insights and analyze your datasets effectively.

This tutorial walks you through the features and functionalities of DataGradients and shows you how to run a comprehensive data analysis for computer vision projects.

With DataGradients, you can:

- Analyze image features such as color distribution, brightness, and size.
- Profile object detection datasets with metrics like bounding box area, intersection, and class frequency.
- Understand segmentation datasets using object area, width, height, and class frequency.
- Visualize samples for a better understanding.
- And much more.

Profiling your datasets has never been easier!

## Imports - libraries


In [None]:
!pip install data-gradients

# for displaying pdfs as images in notebook
!pip install pdf2image
!apt-get -y install poppler-utils

# for pretty printing json
!pip install Pygments

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [None]:

import seaborn as sns  # library for visualization

sns.set_style("darkgrid")
import matplotlib.pyplot as plt  # library for visualization
%matplotlib inline

from tqdm import tqdm
tqdm.pandas()
import os
from glob import glob
import random
from datetime import datetime
import pandas

from typing import List, Tuple, Dict, Union

from concurrent.futures import ThreadPoolExecutor, as_completed
import pickle
import warnings
import re

## Set up Google Drive

In [None]:
# Toggle for Google Drive usage; the path-configuration cell reads this flag.
SET_UP_GOOGLE_DRIVE = True

if SET_UP_GOOGLE_DRIVE:
    # Import and mount only when requested, so that setting the flag to False
    # skips the Colab-specific import and the Drive mount altogether.
    from google.colab import drive

    drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Resolve the dataset root and the EDA output directory, then locate the
# dataset version analysed in this notebook.
if SET_UP_GOOGLE_DRIVE:
    DATASETS_DIR_ROOT_PATH = r"/content/gdrive/MyDrive/KESKIA Drive Mlamali/datasets"
    EDA_DATAGRADIENT_OUTPUTS_PATH = r"/content/gdrive/MyDrive/KESKIA Drive Mlamali/CDuPropreMantes/outputs/eda-datagradients"
else:
    # DATASETS_DIR_ROOT_PATH must be defined in this branch too, otherwise the
    # print/listdir calls below fail with NameError.
    # NOTE(review): "/datasets" is a placeholder mirroring the outputs path —
    # point it at the real local dataset directory.
    DATASETS_DIR_ROOT_PATH = "/datasets"
    EDA_DATAGRADIENT_OUTPUTS_PATH = "/outputs/eda-datagradients"

# Show the root and the dataset versions available under it
print(DATASETS_DIR_ROOT_PATH)
print(os.listdir(DATASETS_DIR_ROOT_PATH))

MY_DATASET_PATH = os.path.join(DATASETS_DIR_ROOT_PATH, 'taco-2gb-updated-2023121718')
if not os.path.exists(MY_DATASET_PATH):
    # The path is missing (not "already existing"), so FileNotFoundError is
    # the matching exception; include the path to make the failure actionable.
    raise FileNotFoundError(f"Dataset directory not found: {MY_DATASET_PATH}")

# Display the resolved dataset path as the cell output
MY_DATASET_PATH

/content/gdrive/MyDrive/KESKIA Drive Mlamali/datasets
['taco-2gb', 'taco-2gb-updated', 'taco-2gb-updated-2023121620', 'taco-2gb-updated-2023121621', 'taco-2gb-updated-2023121718']


'/content/gdrive/MyDrive/KESKIA Drive Mlamali/datasets/taco-2gb-updated-2023121718'

In [None]:
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing import image
from sklearn.manifold import TSNE
import os

# Load the VGG16 model with pre-trained ImageNet weights; include_top=False
# is passed so the model is used as a feature extractor.
model = VGG16(
    include_top=False,
    weights='imagenet',
)

# Load a single image and preprocess it for VGG16
def load_and_preprocess_img(img_path):
    """Read one image file and prepare it as a single-item VGG16 batch.

    Args:
        img_path: Path of the image file to load.

    Returns:
        A ``(batch, resized_img)`` tuple where ``batch`` is the image array
        with a leading axis added and passed through ``preprocess_input``,
        and ``resized_img`` is the image returned by ``image.load_img``
        (loaded with ``target_size=(224, 224)``).
    """
    resized_img = image.load_img(img_path, target_size=(224, 224))
    pixel_array = image.img_to_array(resized_img)
    # Add a leading axis so the array forms a batch of one image
    batch = preprocess_input(np.expand_dims(pixel_array, axis=0))
    return batch, resized_img

# Load all images of a dataset folder
def load_dataset_images(dataset_path):
    """Load and preprocess every ``.jpg``/``.png`` image found in a directory.

    Files are processed in sorted name order so that the row order of the
    returned array (and of the raw image list) is the same on every run;
    ``os.listdir`` alone returns entries in arbitrary order.

    Args:
        dataset_path: Directory containing the image files (not recursive).

    Returns:
        A ``(images, raw_images)`` tuple: ``images`` is the vertical stack of
        the preprocessed batches returned by ``load_and_preprocess_img``, and
        ``raw_images`` is the list of the matching raw images, in the same
        order.

    Raises:
        ValueError: If the directory contains no ``.jpg`` or ``.png`` file.
    """
    image_files = sorted(
        file_name
        for file_name in os.listdir(dataset_path)
        # str.endswith accepts a tuple of suffixes; matching is case-insensitive
        if file_name.lower().endswith(('.jpg', '.png'))
    )
    if not image_files:
        # np.vstack([]) would also raise ValueError, but with a message that
        # does not mention the directory; fail early with a clear one instead.
        raise ValueError(f"No .jpg or .png images found in {dataset_path}")

    images = []
    raw_images = []
    for img_file in image_files:
        img_path = os.path.join(dataset_path, img_file)
        processed_img, raw_img = load_and_preprocess_img(img_path)
        images.append(processed_img)
        raw_images.append(raw_img)
    return np.vstack(images), raw_images

# Path to the training images folder (<dataset>/train/images)
train_split_dir = os.path.join(MY_DATASET_PATH, "train")
train_images_path = os.path.join(train_split_dir, "images")

# Load the training images: preprocessed array plus the raw images for display
train_images, raw_train_images = load_dataset_images(train_images_path)

In [None]:
# Extract features with VGG16
features = model.predict(train_images)

# Flatten the features to one vector per image, as t-SNE expects 2-D input
flattened_features = features.reshape(features.shape[0], -1)

# Dimensionality reduction with t-SNE.
# A fixed random_state makes the 2-D embedding reproducible between runs;
# without it every execution produces a different layout.
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(flattened_features)



In [None]:

# Visualize the results: draw every image thumbnail at its t-SNE coordinates
from matplotlib.offsetbox import OffsetImage, AnnotationBbox

fig, ax = plt.subplots(figsize=(50, 50))
# Wrap the list itself (not the enumerate object) so tqdm can read its length
# and display a real progress bar with total and ETA.
for i, img in enumerate(tqdm(raw_train_images)):
    x, y = tsne_results[i, 0], tsne_results[i, 1]
    im = OffsetImage(img, zoom=0.125)  # Adjust the zoom if necessary
    ab = AnnotationBbox(im, (x, y), xycoords='data', frameon=False)
    ax.add_artist(ab)
# add_artist does not update the data limits, so set the axes range explicitly
ax.set_xlim(tsne_results[:, 0].min() - 1, tsne_results[:, 0].max() + 1)
ax.set_ylim(tsne_results[:, 1].min() - 1, tsne_results[:, 1].max() + 1)
plt.show()

In [None]:
import yaml

# Location of the dataset's YAML description file
yaml_file_path = os.path.join(MY_DATASET_PATH, "data.yaml")

# Parse the YAML file into a Python dictionary
with open(yaml_file_path, 'r') as yaml_file:
    data_yaml = yaml.safe_load(yaml_file)

# Display the parsed content as the cell output
data_yaml

{'train': '../train/images',
 'val': '../val/images',
 'nc': 59,
 'names': {0: 'Aluminium foil',
  1: 'Battery',
  2: 'Aluminium blister pack',
  3: 'Carded blister pack',
  4: 'Other plastic bottle',
  5: 'Clear plastic bottle',
  6: 'Glass bottle',
  7: 'Plastic bottle cap',
  8: 'Metal bottle cap',
  9: 'Broken glass',
  10: 'Food Can',
  11: 'Aerosol',
  12: 'Drink can',
  13: 'Toilet tube',
  14: 'Other carton',
  15: 'Egg carton',
  16: 'Drink carton',
  17: 'Corrugated carton',
  18: 'Meal carton',
  19: 'Pizza box',
  20: 'Paper cup',
  21: 'Disposable plastic cup',
  22: 'Foam cup',
  23: 'Glass cup',
  24: 'Other plastic cup',
  25: 'Food waste',
  26: 'Glass jar',
  27: 'Plastic lid',
  28: 'Metal lid',
  29: 'Other plastic',
  30: 'Magazine paper',
  31: 'Tissues',
  32: 'Wrapping paper',
  33: 'Normal paper',
  34: 'Paper bag',
  35: 'Plastic film',
  36: 'Six pack rings',
  37: 'Garbage bag',
  38: 'Other plastic wrapper',
  39: 'Single-use carrier bag',
  40: 'Polypropyl