# 1. Import Packages

In [None]:
# !pip install pycocotools

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pycocotools.coco import COCO
from collections import defaultdict

# 2. Data Description

In [None]:
# COCO 2017 annotation files, each paired with a one-line note on its purpose
_file_descriptions = (
    ("instances_train2017.json", "Training set of things. (person, dog, cat)"),
    ("instances_val2017.json", "Validation set of things. (person, dog, cat)"),
    ("person_keypoints_train2017.json", "Keypoint annotations for human pose estimation in training set."),
    ("person_keypoints_val2017.json", "Keypoint annotations for human pose estimation in validation set."),
    ("captions_train2017.json", "Image captions for NLP tasks in training set."),
    ("captions_val2017.json", "Image captions for NLP tasks in validation set."),
    ("image_info_test2017.json", "Image metadata for test set without annotations (for evaluation)."),
    ("image_info_unlabeled2017.json", "Image metadata for additional unlabeled images in COCO 2017 dataset."),
    ("stuff_train2017.json", "Annotations for background stuff segmentation (sky, grass, etc.) in training set."),
    ("stuff_val2017.json", "Annotations for background stuff segmentation (sky, grass, etc.) in validation set."),
)

# One record per file, keyed by the column headers of the overview table
annotation_files = [
    {"File Name": file_name, "Description": description}
    for file_name, description in _file_descriptions
]

# Lift the column-width limit so the descriptions are not truncated when rendered
pd.set_option('display.max_colwidth', None)

# Build the overview table
table = pd.DataFrame(annotation_files)

# Display table
table

# 3. Display Class Labels

In [None]:
# Path to the annotations. The directory and the file name are passed as
# separate components so os.path.join inserts the separator itself; given a
# single pre-joined string it would simply return that string unchanged.
ann_file = os.path.join('annotations', 'instances_train2017.json')

# Initialize COCO API
coco = COCO(ann_file)

# Get all category names, in the same order as the category IDs.
# loadCats takes the whole ID list, so one call replaces one call per ID.
cat_ids = coco.getCatIds()
categories = [cat['name'] for cat in coco.loadCats(cat_ids)]

# Lay the names out as a grid with num_cols columns. The row count is the
# ceiling of len(categories) / num_cols, so a final partial row is kept.
num_cols = 16
num_rows = (len(categories) + num_cols - 1) // num_cols
category_matrix = [categories[i * num_cols:(i + 1) * num_cols] for i in range(num_rows)]
df = pd.DataFrame(category_matrix)

# Display table
df

# 4. Display Class Information

## Class Distributions

In [None]:
# Count, for every class, the images that getImgIds returns for it.
# These are image counts (an image with three dogs counts once for "dog");
# the per-annotation counts are a different quantity.
class_distribution = {}
for cat_id in cat_ids:
    cat_name = coco.loadCats(cat_id)[0]['name']
    img_ids = coco.getImgIds(catIds=cat_id)
    class_distribution[cat_name] = len(img_ids)

# Sort the class distribution from the most to the least frequent class
sorted_class_distribution = dict(sorted(class_distribution.items(), key=lambda item: item[1], reverse=True))

# Plot
plt.figure(figsize=(20, 10))
plt.bar(list(sorted_class_distribution.keys()), list(sorted_class_distribution.values()))
plt.xticks(rotation=90, fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('Class', fontsize=15)
# The bars measure images, so the axis is labelled accordingly
plt.ylabel('Number of Images', fontsize=15)
plt.title('Class Distribution in COCO 2017 Dataset (Images Count)', fontsize=20)
plt.show()

In [None]:
# Tally annotations per class: the number of annotation IDs returned for a
# category is used as that class's count.
class_distribution = {}
for cat_id in cat_ids:
    label = coco.loadCats(cat_id)[0]['name']
    class_distribution[label] = len(coco.getAnnIds(catIds=cat_id))

# Rank the classes by annotation count, largest first
ranked_items = sorted(class_distribution.items(), key=lambda pair: pair[1], reverse=True)
sorted_class_distribution = dict(ranked_items)

# Plot
plt.figure(figsize=(20, 10))
plt.bar(sorted_class_distribution.keys(), sorted_class_distribution.values())
plt.title('Class Distribution in COCO 2017 Dataset (Annotation Count)', fontsize=20)
plt.xlabel('Class', fontsize=15)
plt.ylabel('Number of Annotations', fontsize=15)
plt.xticks(rotation=90, fontsize=12)
plt.yticks(fontsize=12)
plt.show()

## Heatmap of Class Co-occurrence

In [None]:
# Lookup table: category ID -> class name
categories = {}
for cat_id in cat_ids:
    categories[cat_id] = coco.loadCats(cat_id)[0]['name']

# Create object co-occurrence matrix:
# co_occurrence[a][b] is the number of images in which classes a and b both appear
co_occurrence = defaultdict(lambda: defaultdict(int))
img_ids = coco.getImgIds()
for img_id in img_ids:
    image_anns = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
    # A class is considered once per image, however many annotations it has
    present_cats = sorted({categories[ann['category_id']] for ann in image_anns})

    for row_cat in present_cats:
        for col_cat in present_cats:
            if row_cat == col_cat:
                continue  # a class is never paired with itself
            co_occurrence[row_cat][col_cat] += 1

# Convert to DataFrame; passing the same sorted labels as index and columns
# fixes the ordering on both axes, and pairs never seen together become 0
category_labels = sorted(co_occurrence)
co_occurrence_df = pd.DataFrame(co_occurrence, index=category_labels, columns=category_labels).fillna(0)

# Plot Object Co-occurrence Heatmap
plt.figure(figsize=(15, 10))
ax = sns.heatmap(co_occurrence_df, cmap='Blues', xticklabels=category_labels, yticklabels=category_labels)
ax.set_xticklabels(category_labels, rotation=90, fontsize=8)
ax.set_yticklabels(category_labels, fontsize=8)
plt.title("Object Co-occurrence Heatmap")
plt.xlabel("Class")
plt.ylabel("Class")
plt.show()

# 5. Display Category Information

## Category Distributions

In [None]:
# Define category groups: every group is listed once together with its member
# classes, and the flat class -> group lookup is derived from that listing.
_group_members = {
    "People": ["person"],
    "Vehicles": ["bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat"],
    "Road Objects": ["traffic light", "fire hydrant", "stop sign", "parking meter", "bench"],
    "Animals": ["bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe"],
    "Accessories": ["backpack", "umbrella", "handbag", "tie", "suitcase"],
    "Sports": ["frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket"],
    "Kitchenware": ["bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl"],
    "Food": ["banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake"],
    "Furniture": ["chair", "couch", "potted plant", "bed", "dining table", "toilet"],
    "Electronics": ["tv", "laptop", "mouse", "remote", "keyboard", "cell phone"],
    "Appliances": ["microwave", "oven", "toaster", "sink", "refrigerator"],
    "Objects": ["book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"],
}

# Flatten to {class name: group name}, keeping the listing order above
group_mapping = {
    class_name: group_name
    for group_name, class_names in _group_members.items()
    for class_name in class_names
}

# Count the images in which each category group appears.
# Image IDs are pooled into one set per group, so an image containing several
# classes of the same group (e.g. both "car" and "truck") is counted once.
# Adding up the per-class image counts would count it once per class.
group_image_ids = defaultdict(set)
for cat_id in cat_ids:
    cat_name = coco.loadCats(cat_id)[0]['name']
    img_ids = coco.getImgIds(catIds=cat_id)
    category_group = group_mapping.get(cat_name, "Other")  # classes without a group go to "Other"
    group_image_ids[category_group].update(img_ids)

category_distribution = defaultdict(int)
for category_group, grouped_ids in group_image_ids.items():
    category_distribution[category_group] = len(grouped_ids)

# Sort the group distribution from the most to the least frequent group
sorted_category_distribution = dict(sorted(category_distribution.items(), key=lambda item: item[1], reverse=True))

# Plot
plt.figure(figsize=(12, 6))
plt.bar(list(sorted_category_distribution.keys()), list(sorted_category_distribution.values()), color='skyblue')
plt.xticks(rotation=45, ha='right')
plt.xlabel('Category Group')
# The bars measure images, so the axis is labelled accordingly
plt.ylabel('Number of Images')
plt.title('Grouped Class Distribution in COCO 2017 Dataset (Images Count)')
plt.show()

In [None]:
# Total annotations per category group: every class adds its annotation count
# to the group it maps to ("Other" when the class has no group).
category_distribution = defaultdict(int)
for cat_id in cat_ids:
    class_name = coco.loadCats(cat_id)[0]['name']
    n_annotations = len(coco.getAnnIds(catIds=cat_id))
    category_distribution[group_mapping.get(class_name, "Other")] += n_annotations

# Rank the groups by annotation count, largest first
ranked_groups = sorted(category_distribution.items(), key=lambda pair: pair[1], reverse=True)
sorted_category_distribution = dict(ranked_groups)

# Plot
plt.figure(figsize=(12, 6))
plt.bar(sorted_category_distribution.keys(), sorted_category_distribution.values(), color='skyblue')
plt.title('Grouped Class Distribution in COCO 2017 Dataset (Annotation Count)')
plt.xlabel('Category Group')
plt.ylabel('Number of Annotations')
plt.xticks(rotation=45, ha='right')
plt.show()

## Heatmap of Category Co-occurrence

In [None]:
# Get all category names, keyed by category ID
categories = {cat_id: coco.loadCats(cat_id)[0]['name'] for cat_id in cat_ids}

# Create category co-occurrence matrix:
# category_co_occurrence[a][b] counts the images in which groups a and b both appear.
category_co_occurrence = defaultdict(lambda: defaultdict(int))
img_ids = coco.getImgIds()
for img_id in img_ids:
    # Only this image's annotations are needed here; nothing inside the loop
    # should reassign img_ids or depend on a category ID from another cell.
    ann_ids = coco.getAnnIds(imgIds=img_id)
    anns = coco.loadAnns(ann_ids)
    # Reduce the image to the set of groups it contains so each group counts
    # once per image; classes without a group are filed under "Other".
    present_categories = sorted({group_mapping.get(categories[ann['category_id']], "Other") for ann in anns})

    for cat1 in present_categories:
        for cat2 in present_categories:
            if cat1 != cat2:  # a group is never paired with itself
                category_co_occurrence[cat1][cat2] += 1

# Convert to DataFrame and ensure consistent ordering on both axes;
# group pairs that never appear together become 0
category_labels = sorted(category_co_occurrence)
category_co_occurrence_df = pd.DataFrame(category_co_occurrence, index=category_labels, columns=category_labels).fillna(0)

# Plot Category Co-occurrence Heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(category_co_occurrence_df, cmap='Blues', xticklabels=True, yticklabels=True)
plt.title("Category Group Co-occurrence Heatmap")
plt.xlabel("Category Group")
plt.ylabel("Category Group")
plt.show()

# 6. Class & Category Information After Dropping Images That Contain "person"

In [None]:
# Images to keep: every image in the dataset minus those returned for the "person" class
person_cat_ids = coco.getCatIds(catNms=['person'])
person_img_ids = set(coco.getImgIds(catIds=person_cat_ids))
all_img_ids = set(coco.getImgIds())
filtered_img_ids = list(all_img_ids.difference(person_img_ids))

# For each class, count the kept images in which it appears
image_counts = defaultdict(int)
for img_id in filtered_img_ids:
    image_anns = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
    # Unique class names in this image; annotations with an unknown category ID are ignored
    classes_in_image = {
        categories[ann['category_id']]
        for ann in image_anns
        if ann['category_id'] in categories
    }
    # Each class adds one per image, no matter how many annotations it has there
    for class_name in classes_in_image:
        image_counts[class_name] += 1

# Series sorted from the most to the least frequent class, ready for plotting
image_counts_series = pd.Series(image_counts).sort_values(ascending=False)

# Bar plot for individual classes (using image counts)
plt.figure(figsize=(12, 6))
plt.bar(image_counts_series.index, image_counts_series.values, color='skyblue')
plt.title("Bar Plot: Image Count per Individual Class (Image Counts)")
plt.xlabel("Individual Classes")
plt.ylabel("Image Count")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

########################## Annotations Count ##########################
# Every annotation of a known class in a kept image adds one to that class's total
object_counts = defaultdict(int)
for img_id in filtered_img_ids:
    for ann in coco.loadAnns(coco.getAnnIds(imgIds=img_id)):
        category_id = ann['category_id']
        if category_id not in categories:
            continue  # annotation refers to a category ID outside the lookup table
        object_counts[categories[category_id]] += 1

# Series sorted from the most to the least frequent class, ready for plotting
object_counts_series = pd.Series(object_counts).sort_values(ascending=False)

# Bar plot for individual classes
plt.figure(figsize=(12, 6))
plt.bar(object_counts_series.index, object_counts_series.values, color='skyblue')
plt.title("Bar Plot: Object Count per Individual Class (Annotations Count)")
plt.xlabel("Individual Classes")
plt.ylabel("Object Count")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

# Co-occurrence matrix for individual classes over the kept images:
# co_occurrence[a][b] is the number of images in which a and b both appear
co_occurrence = defaultdict(lambda: defaultdict(int))
for img_id in filtered_img_ids:
    image_anns = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
    # Sorted unique class names in this image (unknown category IDs are ignored)
    present_cats = sorted(set(
        categories[ann['category_id']]
        for ann in image_anns
        if ann['category_id'] in categories
    ))
    # Visit every unordered pair once and record it in both directions,
    # which keeps the matrix symmetric
    for pos in range(len(present_cats)):
        first = present_cats[pos]
        for second in present_cats[pos + 1:]:
            co_occurrence[first][second] += 1
            co_occurrence[second][first] += 1

# Convert the co-occurrence dictionary to a DataFrame with the same sorted
# labels on both axes; pairs never seen together become 0
individual_classes = sorted(co_occurrence)
co_occurrence_df = pd.DataFrame(co_occurrence, index=individual_classes, columns=individual_classes).fillna(0)

# Heatmap for individual classes
plt.figure(figsize=(15, 10))
sns.heatmap(co_occurrence_df, cmap='Blues', annot=False)
plt.title("Heatmap: Co-occurrence Matrix for Individual Classes")
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Count, per category group, the kept images in which the group appears.
# The groups present in an image are collected into a set first, so an image
# with several classes of one group adds 1 to that group. Summing the
# per-class image counts would count such an image once per class.
group_image_counts = defaultdict(int)
for img_id in filtered_img_ids:
    anns = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
    groups_in_image = set()
    for ann in anns:
        cls = categories.get(ann['category_id'])  # None for an unknown category ID
        if cls in group_mapping:  # classes without a group are skipped
            groups_in_image.add(group_mapping[cls])
    for group_name in groups_in_image:
        group_image_counts[group_name] += 1

# Series sorted from the most to the least frequent group, ready for plotting
group_image_counts_series = pd.Series(group_image_counts).sort_values(ascending=False)

# Plot bar plot for grouped categories (using image counts)
plt.figure(figsize=(12, 6))
plt.bar(group_image_counts_series.index, group_image_counts_series.values, color='coral')
# Vertical tick labels stay centred under their bars (no horizontal offset)
plt.xticks(rotation=90)
plt.xlabel("Category Groups")
plt.ylabel("Image Count")
plt.title("Bar Plot: Image Count per Category Group (Image Counts)")
plt.tight_layout()
plt.show()

######################## Annotations ########################

# Fold the per-class object totals into their category groups;
# classes that have no entry in group_mapping are left out
group_object_counts = defaultdict(int)
for cls in object_counts:
    if cls not in group_mapping:
        continue
    group_object_counts[group_mapping[cls]] += object_counts[cls]

# Series sorted from the most to the least frequent group, ready for plotting
group_object_counts_series = pd.Series(group_object_counts).sort_values(ascending=False)

# Bar plot for grouped categories
plt.figure(figsize=(12, 6))
plt.bar(group_object_counts_series.index, group_object_counts_series.values, color='coral')
plt.title("Bar Plot: Object Count per Category Group (Annotations Count)")
plt.xlabel("Category Groups")
plt.ylabel("Object Count")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

# Group the Co-occurrence Matrix by Category Groups for Heatmap.
# Rows are summed per group first (each class label is mapped to its group via
# group_mapping). The intermediate frame is then transposed so the original
# columns can be summed with the same row-wise groupby, and transposed back.
# This avoids groupby(..., axis=1), which pandas has deprecated.
row_grouped = co_occurrence_df.groupby(group_mapping).sum()
grouped_co_occurrence_df = row_grouped.T.groupby(group_mapping).sum().T

# Plot heatmap for grouped categories
plt.figure(figsize=(12, 6))
sns.heatmap(grouped_co_occurrence_df, cmap='Blues', annot=False)
plt.xlabel("Category Groups")
plt.ylabel("Category Groups")
plt.title("Heatmap: Grouped Co-occurrence Matrix for Category Groups")
plt.tight_layout()
plt.show()