In [1]:
# /data/home/eak/learning/nganga_ai/AminiCocoa/Amini-Cocoa-Contamination-Challenge/data_exploration.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import cv2
from tqdm import tqdm
from IPython.display import display

In [2]:
# --- Configuration ---
DATA_DIR = "dataset"
TRAIN_CSV_PATH = os.path.join(DATA_DIR, "Train_df.csv")
TRAIN_IMAGE_DIR = os.path.join(DATA_DIR, "images", "train")
OUTPUT_DIR = "exploration_outputs"  # Directory to save plots

In [3]:
# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Load Data ---
print(f"Loading data from {TRAIN_CSV_PATH}...")
try:
    df = pd.read_csv(TRAIN_CSV_PATH)
    print("Data loaded successfully.")
    print(f"Total number of bounding box entries: {len(df)}")
    print(f"Number of unique images: {df['Image_ID'].nunique()}")
except FileNotFoundError:
    print(
        f"Error: {TRAIN_CSV_PATH} not found. Please ensure the dataset is correctly placed."
    )
    exit()

Loading data from dataset/Train_df.csv...
Data loaded successfully.
Total number of bounding box entries: 9294
Number of unique images: 5252


In [4]:
# --- Basic Info ---
print("\n--- Basic DataFrame Info ---")
display(df.info())
print("\n--- First 5 Rows ---")
display(df.head())
print("\n--- Missing Values ---")
display(df.isnull().sum())  # Check for missing values


--- Basic DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9294 entries, 0 to 9293
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Image_ID    9294 non-null   object 
 1   class       9294 non-null   object 
 2   confidence  9294 non-null   float64
 3   ymin        9294 non-null   float64
 4   xmin        9294 non-null   float64
 5   ymax        9294 non-null   float64
 6   xmax        9294 non-null   float64
 7   class_id    9294 non-null   int64  
 8   ImagePath   9294 non-null   object 
dtypes: float64(5), int64(1), object(3)
memory usage: 653.6+ KB


None


--- First 5 Rows ---


Unnamed: 0,Image_ID,class,confidence,ymin,xmin,ymax,xmax,class_id,ImagePath
0,ID_nBgcAR.jpg,healthy,1.0,75.0,15.0,162.0,195.0,2,dataset/images/train/ID_nBgcAR.jpg
1,ID_nBgcAR.jpg,healthy,1.0,58.0,1.0,133.0,171.0,2,dataset/images/train/ID_nBgcAR.jpg
2,ID_nBgcAR.jpg,healthy,1.0,42.0,29.0,377.0,349.0,2,dataset/images/train/ID_nBgcAR.jpg
3,ID_Kw2v8A.jpg,healthy,1.0,112.0,124.0,404.0,341.0,2,dataset/images/train/ID_Kw2v8A.jpg
4,ID_Kw2v8A.jpg,healthy,1.0,148.0,259.0,413.0,412.0,2,dataset/images/train/ID_Kw2v8A.jpg



--- Missing Values ---


Image_ID      0
class         0
confidence    0
ymin          0
xmin          0
ymax          0
xmax          0
class_id      0
ImagePath     0
dtype: int64

In [5]:
# --- Class Distribution ---
print("\n--- Class Distribution ---")
class_counts = df["class"].value_counts()
print(class_counts)

plt.figure(figsize=(10, 6))
sns.barplot(x=class_counts.index, y=class_counts.values, palette="viridis")
plt.title("Class Distribution")
plt.xlabel("Class Name")
plt.ylabel("Number of Bounding Boxes")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "class_distribution.png"))
print(
    f"Saved class distribution plot to {os.path.join(OUTPUT_DIR, 'class_distribution.png')}"
)
# plt.show() # Optionally display plot interactively
plt.close()


--- Class Distribution ---
class
healthy        4061
cssvd          3071
anthracnose    2162
Name: count, dtype: int64
Saved class distribution plot to exploration_outputs/class_distribution.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=class_counts.index, y=class_counts.values, palette="viridis")


In [6]:
# --- Bounding Box Analysis ---
print("\n--- Bounding Box Analysis ---")
# Calculate width, height, area, aspect ratio
df["bbox_width"] = df["xmax"] - df["xmin"]
df["bbox_height"] = df["ymax"] - df["ymin"]
df["bbox_area"] = df["bbox_width"] * df["bbox_height"]
# Avoid division by zero for aspect ratio
df["aspect_ratio"] = df["bbox_width"] / (df["bbox_height"] + 1e-6)

print("\n--- Bounding Box Statistics (Width, Height, Area, Aspect Ratio) ---")
print(df[["bbox_width", "bbox_height", "bbox_area", "aspect_ratio"]].describe())


--- Bounding Box Analysis ---

--- Bounding Box Statistics (Width, Height, Area, Aspect Ratio) ---
        bbox_width  bbox_height     bbox_area  aspect_ratio
count  9294.000000  9294.000000  9.294000e+03   9294.000000
mean   1014.680439  1331.383150  1.905404e+06      0.923712
std     808.609728  1020.951386  2.498296e+06      0.652592
min      16.000000     3.000000  9.600000e+01      0.133588
25%     433.000000   554.250000  2.760908e+05      0.547031
50%     794.500000  1080.000000  8.530610e+05      0.717549
75%    1337.000000  1856.000000  2.403886e+06      1.096320
max    4098.000000  4128.000000  1.249744e+07     19.736111


In [7]:
# Plot distributions
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle("Bounding Box Distributions")

sns.histplot(df["bbox_width"], kde=True, ax=axes[0, 0])
axes[0, 0].set_title("Width Distribution")

sns.histplot(df["bbox_height"], kde=True, ax=axes[0, 1])
axes[0, 1].set_title("Height Distribution")

sns.histplot(df["bbox_area"], kde=True, ax=axes[1, 0])
axes[1, 0].set_title("Area Distribution")
axes[1, 0].set_yscale("log")  # Use log scale for potentially large range

sns.histplot(df["aspect_ratio"], kde=True, ax=axes[1, 1])
axes[1, 1].set_title("Aspect Ratio Distribution")
# axes[1, 1].set_xscale('log') # Consider log scale if skewed

plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust layout to prevent title overlap
plt.savefig(os.path.join(OUTPUT_DIR, "bbox_distributions.png"))
print(
    f"Saved bounding box distribution plots to {os.path.join(OUTPUT_DIR, 'bbox_distributions.png')}"
)
# plt.show()
plt.close()

Saved bounding box distribution plots to exploration_outputs/bbox_distributions.png


In [8]:
# --- Bounding Boxes per Image ---
print("\n--- Bounding Boxes per Image ---")
bboxes_per_image = df.groupby("Image_ID").size()
print(bboxes_per_image.describe())

plt.figure(figsize=(10, 6))
sns.histplot(
    bboxes_per_image, kde=False, bins=max(1, bboxes_per_image.max())
)  # Adjust bins
plt.title("Number of Bounding Boxes per Image")
plt.xlabel("Number of Bounding Boxes")
plt.ylabel("Number of Images")
plt.yscale("log")  # Often useful for counts
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "bboxes_per_image.png"))
print(
    f"Saved bboxes per image plot to {os.path.join(OUTPUT_DIR, 'bboxes_per_image.png')}"
)
# plt.show()
plt.close()


--- Bounding Boxes per Image ---
count    5252.000000
mean        1.769612
std         1.232706
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max        13.000000
dtype: float64
Saved bboxes per image plot to exploration_outputs/bboxes_per_image.png


In [9]:
# --- Visualize Sample Images with Bounding Boxes ---
print("\n--- Visualizing Sample Images ---")
num_samples_to_show = 5
sample_image_ids = (
    df["Image_ID"].drop_duplicates().sample(num_samples_to_show, random_state=42)
)


--- Visualizing Sample Images ---


In [10]:
# Define colors for different classes (add more if needed)
class_colors = {
    "Healthy": (0, 255, 0),  # Green
    "Diseased": (0, 0, 255),  # Red
    "Contaminated": (255, 0, 0),  # Blue
    # Add other classes and their BGR colors here
}
default_color = (255, 255, 255)  # White for unknown classes

In [11]:
for image_id in tqdm(sample_image_ids, desc="Drawing samples"):
    image_path = os.path.join(TRAIN_IMAGE_DIR, image_id)
    if not os.path.exists(image_path):
        print(f"Warning: Image not found at {image_path}, skipping visualization.")
        continue

    img = cv2.imread(image_path)
    if img is None:
        print(f"Warning: Could not read image {image_path}, skipping visualization.")
        continue

    bboxes = df[df["Image_ID"] == image_id]

    for _, row in bboxes.iterrows():
        xmin, ymin, xmax, ymax = (
            int(row["xmin"]),
            int(row["ymin"]),
            int(row["xmax"]),
            int(row["ymax"]),
        )
        class_name = row["class"]
        confidence = row[
            "confidence"
        ]  # Although confidence is usually for predictions, it's in the train CSV
        color = class_colors.get(class_name, default_color)

        # Draw bounding box
        cv2.rectangle(img, (xmin, ymin), (xmax, ymax), color, 2)  # Thickness 2

        # Put label text
        label = f"{class_name} ({confidence:.2f})"
        (w, h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 1)
        # Ensure text background doesn't go out of bounds
        text_ymin = max(ymin - h - 4, 0)
        text_ymax = max(ymin, h + 4)  # Use max to handle cases where ymin is small
        cv2.rectangle(
            img, (xmin, text_ymin), (xmin + w, text_ymax), color, -1
        )  # Filled background
        cv2.putText(
            img,
            label,
            (xmin, ymin - 4),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.6,
            (255, 255, 255),
            1,
            cv2.LINE_AA,
        )  # White text

    output_path = os.path.join(OUTPUT_DIR, f"sample_{image_id}")
    cv2.imwrite(output_path, img)
    # print(f"Saved sample image with boxes to {output_path}") # Can be verbose

print(
    f"\nSaved {num_samples_to_show} sample images with bounding boxes to {OUTPUT_DIR}"
)
print("\n--- Data Exploration Complete ---")

Drawing samples: 100%|██████████| 5/5 [00:00<00:00, 10.28it/s]


Saved 5 sample images with bounding boxes to exploration_outputs

--- Data Exploration Complete ---





# data_exploration_report.md
# Data Exploration Report: Cocoa Contamination Detection

This report summarizes the initial exploration of the cocoa bean dataset provided for the contamination detection challenge.

## 1. Dataset Overview

*   **Training Data CSV:** `dataset/Train_df.csv`
*   **Training Images:** `dataset/images/train/`
*   **Training Labels:** `dataset/labels/train/` (Note: Primary analysis used the CSV)

The `Train_df.csv` contains bounding box annotations for objects within the training images.

## 2. Initial Analysis (from `Train_df.csv`)

*   **Total Bounding Box Entries:** [Insert Number from Script Output]
*   **Number of Unique Images:** [Insert Number from Script Output]
*   **Columns:** `Image_ID`, `class`, `confidence`, `ymin`, `xmin`, `ymax`, `xmax`, `class_id`, `ImagePath`
*   **Missing Values:** Checked for missing values. [Report if any were found, e.g., "No missing values found." or specify columns/counts].

## 3. Class Distribution

*   The distribution of bounding boxes per class was analyzed.
*   **Counts:**
    *   [Class 1 Name]: [Count]
    *   [Class 2 Name]: [Count]
    *   [Class 3 Name]: [Count]
    *   ... (List all classes and their counts from script output)
*   **Observations:** [Comment on class balance/imbalance, e.g., "The dataset shows a significant imbalance, with 'Healthy' beans being the most frequent class."]

![Class Distribution Plot](exploration_outputs/class_distribution.png)

## 4. Bounding Box Analysis

Bounding box dimensions (width, height, area) and aspect ratios were calculated and analyzed.

*   **Summary Statistics:**
    *   **Width:** Mean=[...], Std=[...], Min=[...], Max=[...]
    *   **Height:** Mean=[...], Std=[...], Min=[...], Max=[...]
    *   **Area:** Mean=[...], Std=[...], Min=[...], Max=[...]
    *   **Aspect Ratio:** Mean=[...], Std=[...], Min=[...], Max=[...]
    *(Fill in the values from the script's `.describe()` output)*
*   **Observations:** [Comment on the typical size and shape of bounding boxes, e.g., "Bounding boxes vary significantly in size.", "Most boxes have an aspect ratio close to 1, suggesting roughly square objects, but there's a long tail.", "The area distribution is heavily skewed, suggesting many small objects."]

![Bounding Box Distributions Plot](exploration_outputs/bbox_distributions.png)

## 5. Bounding Boxes per Image

The number of bounding boxes annotated in each image was analyzed.

*   **Summary Statistics:** Mean=[...], Std=[...], Min=[...], Max=[...]
    *(Fill in the values from the script's `.describe()` output for `bboxes_per_image`)*
*   **Observations:** [Comment on the density of objects per image, e.g., "Most images contain a small number of bounding boxes (e.g., 1-5), but some images have a very high density of objects.", "The distribution is right-skewed."]

![Bounding Boxes per Image Plot](exploration_outputs/bboxes_per_image.png)

## 6. Sample Visualizations

Several sample images were visualized with their corresponding bounding boxes.

*   *(Optionally embed or link to a few sample images saved in `exploration_outputs/`)*
    *   `exploration_outputs/sample_[image_id_1].jpg`
    *   `exploration_outputs/sample_[image_id_2].jpg`
    *   ...
*   **Observations:** [Comment on visual aspects, e.g., "Visual inspection confirms annotations generally align with visible objects.", "Some images are crowded.", "Lighting conditions vary.", "Object appearance (e.g., color, texture) differs between classes."]

## 7. Next Steps & Considerations

*   **Class Imbalance:** Address the class imbalance during training (e.g., using weighted loss, over/undersampling, data augmentation).
*   **Bounding Box Size Variance:** The wide range of bounding box sizes suggests the need for a model robust to scale variations (e.g., using Feature Pyramid Networks).
*   **Anchor Boxes:** The distribution of aspect ratios and sizes can inform the selection or generation of appropriate anchor boxes if using anchor-based detectors.
*   **Data Augmentation:** Apply relevant data augmentation techniques (e.g., scaling, rotation, color jitter) to improve model generalization.
*   **Confidence Column:** The `confidence` column in the training data is unusual. Investigate its meaning or potentially ignore it if it represents annotation confidence rather than model prediction confidence. If it's annotation confidence, it could potentially be used for weighting samples.