# downloading the dataset

In [None]:
import fiftyone.zoo as foz
import fiftyone as fo
# Classes requested from the zoo when loading COCO-2017.
zoo_classes = ["person", "car", "truck", "bicycle", "motorcycle", "cat", "dog"]

# Load the COCO-2017 training split, asking only for detection labels.
dataset = foz.load_zoo_dataset(
    "coco-2017",
    split="train",
    label_types=["detections"],
    classes=zoo_classes,
)

# Folder that will receive the YOLO-formatted copy of the dataset.
export_dir = "C:\\Users\\jithi\\OneDrive\\Desktop\\VsCode\\coco"

# Write the dataset to disk in YOLOv4 layout. `label_field` should be the
# name of the detections field in the FiftyOne dataset.
dataset.export(
    export_dir=export_dir,
    dataset_type=fo.types.YOLOv4Dataset,
    label_field="ground_truth",
)


# loading dataset

In [None]:
import fiftyone as fo

# Name under which the imported dataset is registered in FiftyOne.
name = "coco_yolo"

# Where the YOLO export lives on disk, and the format it was written in.
dataset_dir = "C:\\Users\\jithi\\OneDrive\\Desktop\\VsCode\\coco"
dataset_type = fo.types.YOLOv4Dataset # for example

# A dataset of the same name may be left over from an earlier run;
# remove it so the import below can reuse the name.
already_registered = name in fo.list_datasets()
if already_registered:
    fo.delete_dataset(name)

# Build the dataset from the exported directory.
dataset = fo.Dataset.from_dir(
    dataset_dir=dataset_dir,
    dataset_type=dataset_type,
    name=name,
)


# removing annotations

In [None]:
import os
from tqdm import tqdm

# Path to the obj.names file and data directory
names_path = "C:\\Users\\jithi\\OneDrive\\Desktop\\VsCode\\coco\\obj.names"
data_dir = "C:\\Users\\jithi\\OneDrive\\Desktop\\VsCode\\coco\\data"

# Read obj.names (one class name per line) and map each name to its line index
with open(names_path, 'r') as f:
    names = f.read().splitlines()

name_to_idx = {name: idx for idx, name in enumerate(names)}

# Define the desired classes. Indices are kept as strings because they are
# compared against the first whitespace-separated token of each label line.
desired_classes = ["person", "car", "truck", "bicycle", "motorcycle", "cat", "dog"]
desired_indices = [str(name_to_idx[name]) for name in desired_classes if name in name_to_idx]
# Set copy for O(1) membership tests inside the per-line loop below.
desired_index_lookup = frozenset(desired_indices)

# Iterate over all .txt files in the data directory and rewrite each one in
# place, keeping only the annotation lines whose class index is desired.
txt_files = [f for f in os.listdir(data_dir) if f.endswith('.txt')]
for txt_file in tqdm(txt_files, desc="Processing annotations"):
    file_path = os.path.join(data_dir, txt_file)

    with open(file_path, 'r') as f:
        lines = f.readlines()

    filtered_lines = []
    for line in lines:
        fields = line.split()
        # Blank / whitespace-only lines have no class token; drop them
        # instead of crashing with IndexError on fields[0].
        if fields and fields[0] in desired_index_lookup:
            filtered_lines.append(line)

    with open(file_path, 'w') as f:
        f.writelines(filtered_lines)


# renaming annotations

In [None]:
# Step 1: class names in their original index order (position == class index).
original_classes = [
    "umbrella", "person", "dog", "horse", "potted plant",
    "elephant", "car", "truck", "stop sign", "clock",
    "train", "motorcycle", "bicycle", "skateboard", "handbag",
    "bench", "chair", "fork", "knife", "pizza",
    "dining table", "cup", "cake", "spoon", "book",
    "giraffe", "kite", "tie", "scissors", "baseball bat",
    "snowboard", "bottle", "couch", "remote", "airplane",
    "traffic light", "backpack", "bus", "suitcase", "microwave",
    "frisbee", "wine glass", "teddy bear", "cell phone", "refrigerator",
    "oven", "baseball glove", "sports ball", "broccoli", "skis",
    "boat", "tennis racket", "donut", "cat", "bird",
    "surfboard", "bed", "toothbrush", "vase", "tv",
    "laptop", "mouse", "bowl", "sandwich", "hot dog",
    "parking meter", "fire hydrant", "banana", "orange", "cow",
    "sink", "carrot", "sheep", "apple", "toilet",
    "keyboard", "zebra", "hair drier", "bear", "toaster",
]

# Lookup table: original class name -> original class index.
original_mapping = dict(zip(original_classes, range(len(original_classes))))

# Step 2: the subset of classes to keep, in the order of their new indices.
selected_classes = ["person", "car", "truck", "bicycle", "motorcycle", "cat", "dog"]

# Lookup table: selected class name -> new (compacted) class index.
new_mapping = dict(zip(selected_classes, range(len(selected_classes))))

In [None]:
import os

data_directory = "C:\\Users\\jithi\\OneDrive\\Desktop\\VsCode\\coco\\data"

# Rewrite every .txt label file in place: each line's leading class index is
# looked up in `original_classes`, lines whose class is not in `new_mapping`
# are dropped, and the remaining lines get the class's new index.
#
# NOTE: this is NOT idempotent. The files are overwritten with the new
# indices, so running this loop a second time would interpret those new
# indices as positions in `original_classes` and corrupt the labels.
# NOTE(review): `original_classes` is presumably in the same order as the
# exported obj.names file -- verify before running.
for filename in os.listdir(data_directory):
    if not filename.endswith(".txt"):
        continue

    filepath = os.path.join(data_directory, filename)
    with open(filepath, "r") as file:
        lines = file.readlines()

    updated_lines = []
    for line in lines:
        parts = line.split()
        # Blank / whitespace-only lines carry no annotation; skip them
        # instead of failing with IndexError on parts[0].
        if not parts:
            continue

        class_idx = int(parts[0])
        class_name = original_classes[class_idx]

        # If this class is one of our selected classes, update its index
        if class_name in new_mapping:
            new_idx = new_mapping[class_name]
            updated_lines.append(f"{new_idx} {' '.join(parts[1:])}\n")

    # Save the updated lines back to the file
    with open(filepath, "w") as file:
        file.writelines(updated_lines)


# splitting into train and validation sets

In [None]:
# Output folders for the split, laid out as <root>\images\<subset> and
# <root>\labels\<subset>.
split_root = "C:\\Users\\jithi\\OneDrive\\Desktop\\VsCode\\coco_split"
train_path_img = split_root + "\\images\\train"
train_path_label = split_root + "\\labels\\train"
val_path_img = split_root + "\\images\\val"
val_path_label = split_root + "\\labels\\val"


In [None]:
import os
import os
import shutil
import random
from tqdm import tqdm

In [None]:
def train_test_split(path,neg_path=None, split = 0.2):
    """Copy a flat folder of image/label pairs into train and val folders.

    Every distinct file stem in ``path`` is treated as one sample made of
    ``<stem>.jpg`` and ``<stem>.txt``. The stems are shuffled with a fixed
    seed, the last ``split`` fraction is copied to the validation folders and
    the rest to the training folders. Destinations are the module-level
    ``train_path_img``, ``train_path_label``, ``val_path_img`` and
    ``val_path_label`` paths, which are created if missing.

    Args:
        path: Directory holding the ``.jpg`` / ``.txt`` pairs. A trailing
            separator is optional.
        neg_path: Optional directory of extra ``.jpg`` images (no labels)
            that are all copied into the training image folder.
        split: Fraction of samples that goes to the validation set.

    Returns:
        None. The function only copies files and prints progress.
    """
    # splitext handles extensions of any length (name[:-4] assumed exactly
    # three characters). The 'classes' stem is not a sample, so it is removed
    # before the split sizes are computed.
    stems = {os.path.splitext(name)[0] for name in os.listdir(path)}
    stems.discard('classes')

    # Set iteration order for strings differs between interpreter sessions,
    # so sort first; otherwise the fixed seed below would not yield the same
    # split on every run.
    files = sorted(stems)

    print (f"no of images:{len(files)} ")
    random.seed(42)
    random.shuffle(files)

    test_size = int(len(files) * split)
    train_size = len(files) - test_size
    train_files = files[:train_size]
    val_files = files[train_size:]

    # creating required directories
    for directory in (train_path_img, train_path_label, val_path_img, val_path_label):
        os.makedirs(directory, exist_ok=True)

    # copying image/label pairs to the train folders
    for filex in tqdm(train_files):
        shutil.copy2(os.path.join(path, filex + '.jpg'), os.path.join(train_path_img, filex + '.jpg'))
        shutil.copy2(os.path.join(path, filex + '.txt'), os.path.join(train_path_label, filex + '.txt'))

    # Report the actual training fraction rather than a hard-coded 80%.
    print(f"Training data created with {1 - split:.0%} split {len(train_files)} images ")

    if neg_path:
        # Negative samples are images only; no label file is copied for them.
        neg_images = sorted({os.path.splitext(name)[0] for name in os.listdir(neg_path)})
        for filex in tqdm(neg_images):
            shutil.copy2(os.path.join(neg_path, filex + '.jpg'), os.path.join(train_path_img, filex + '.jpg'))

        print(f"Total  {len(neg_images)} negative images added to the training data")

        print(f"TOTAL Training data created with {len(train_files) + len(neg_images)} images ")

    # copying image/label pairs to the val folders
    for filex in tqdm(val_files):
        shutil.copy2(os.path.join(path, filex + '.jpg'), os.path.join(val_path_img, filex + '.jpg'))
        shutil.copy2(os.path.join(path, filex + '.txt'), os.path.join(val_path_label, filex + '.txt'))

    print(f"Testing data created with a total of {len(val_files)} images ")

# Split the exported data folder using the function's default split fraction.
train_test_split(
    path='C:\\Users\\jithi\\OneDrive\\Desktop\\VsCode\\coco\\data\\',
)