In [None]:
# Import libraries
import pandas as pd
import os
from pathlib import Path
import shutil
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import yaml
import matplotlib.pyplot as plt
import multiprocessing
import numpy as np
from PIL import Image

# Fraction of the unique images held out for validation
# (passed as `test_size` to train_test_split further down).
TEST_SIZE = 0.05

In [None]:
# Input directory: dataset.zip and the CSV files are read from here
INPUT_DATA_DIR = Path('dataset')

# List its contents to see which files are available
os.listdir(INPUT_DATA_DIR)

In [None]:
## Root directory of the dataset layout (images/ and labels/ live under it).
## Note: this is the same path as INPUT_DATA_DIR, and nothing is deleted in this cell.
DATASETS_DIR = Path('dataset')
DATASETS_DIR

In [None]:
# Dataset layout: <DATASETS_DIR>/<images|labels>/<split>
# (there is no labels directory for the test split)
TRAIN_IMAGES_DIR = DATASETS_DIR.joinpath('images', 'train')
TRAIN_LABELS_DIR = DATASETS_DIR.joinpath('labels', 'train')
TEST_IMAGES_DIR = DATASETS_DIR.joinpath('images', 'test')
VAL_IMAGES_DIR = DATASETS_DIR.joinpath('images', 'val')
VAL_LABELS_DIR = DATASETS_DIR.joinpath('labels', 'val')

In [None]:
# Reset the split directories listed below: remove any previous contents and
# recreate each one empty.
# DATASETS_DIR itself is not removed, and TRAIN_LABELS_DIR is not part of the reset.
for DIR in (TRAIN_IMAGES_DIR, VAL_IMAGES_DIR, TEST_IMAGES_DIR, VAL_LABELS_DIR):
    if DIR.exists():
        shutil.rmtree(DIR)
    os.makedirs(DIR, exist_ok=True)

# Make sure the dataset root exists as well
os.makedirs(DATASETS_DIR, exist_ok=True)

In [None]:
# Extract dataset.zip (located in the input directory) into DATASETS_DIR
shutil.unpack_archive(INPUT_DATA_DIR / 'dataset.zip', DATASETS_DIR)

In [None]:
def count_files(directory):
    """Return the number of files found under ``directory``, recursively.

    The tree is traversed with ``os.walk``; only file entries are counted,
    the directories themselves are not.
    """
    return sum(len(filenames) for _dirpath, _dirnames, filenames in os.walk(directory))

In [None]:
## Count the number of files in TRAIN_IMAGES_DIR (recursively)
num_train_images = count_files(TRAIN_IMAGES_DIR)
print(f"There are {num_train_images} in {TRAIN_IMAGES_DIR}")

In [None]:
## Count the number of files in TEST_IMAGES_DIR (recursively).
## NOTE(review): despite its name, `num_test_labels` holds the number of test
## *images*; TRAIN_LABELS_DIR is not counted here.
num_test_labels = count_files(TEST_IMAGES_DIR)
print(f"There are {num_test_labels} in {TEST_IMAGES_DIR}")

In [None]:
## Collect the file stems (names without the extension) of everything in TRAIN_IMAGES_DIR
train_images_stems = {Path(entry).stem for entry in os.listdir(TRAIN_IMAGES_DIR)}
len(train_images_stems)

In [None]:
## Collect the file stems (names without the extension) of everything in TRAIN_LABELS_DIR
train_labels_stems = {Path(entry).stem for entry in os.listdir(TRAIN_LABELS_DIR)}
len(train_labels_stems)

In [None]:
# Check that the two sets are identical, i.e. image stems and label stems match one-to-one
train_images_stems == train_labels_stems

In [None]:
# Show the absolute path of the training images directory
TRAIN_IMAGES_DIR.absolute()

In [None]:
# Load the train, test and sample-submission tables from the input directory
train, test, ss = (
    pd.read_csv(INPUT_DATA_DIR / csv_name)
    for csv_name in ('Train.csv', 'Test.csv', 'SampleSubmission.csv')
)

In [None]:
## Preview the first rows of the sample submission file
ss.head()

In [None]:
# Preview the first rows of the training table
train.head()

In [None]:
# Distinct class names as they appear in the CSV (before any whitespace clean-up)
train['class'].unique()

In [None]:
# Distinct class ids as provided in the CSV (this column is overwritten later in the notebook)
train['class_id'].unique()

In [None]:
# Row counts per (class, class_id) pair as provided in the CSV
train[['class', 'class_id']].value_counts()

In [None]:
# Give every class name a 0-based id, following the sorted order of the names
class_map = dict(
    (cls_name, cls_idx)
    for cls_idx, cls_name in enumerate(sorted(train['class'].unique().tolist()))
)
class_map

In [None]:
# Normalise the class names by stripping surrounding whitespace
train['class'] = train['class'].str.strip()

# Rebuild the mapping from the *stripped* names. A mapping built before the
# strip can contain keys with stray whitespace: such keys never match the
# cleaned names (leaving NaN ids after .map()) and, when both a padded and a
# clean variant exist, they shift the ids of the remaining classes.
class_map = {cls: i for i, cls in enumerate(sorted(train['class'].unique().tolist()))}

# Map class name -> id, e.g. {'anthracnose': 0, 'cssvd': 1, 'healthy': 2}
train['class_id'] = train['class'].map(class_map)

# Every row must have received an id
assert train['class_id'].notna().all(), "some class names are missing from class_map"

In [None]:
# Row counts per (class, class_id) pair after the ids were re-assigned from class_map
train[['class', 'class_id']].value_counts()

In [None]:
# Number of unique image paths in the training table
train['ImagePath'].nunique()

In [None]:
# Keep one row per Image_ID (the first occurrence) so the split below is made
# on images rather than on individual rows.
# NOTE(review): an image whose rows carry different classes is represented by
# the class of its first row only — confirm this is acceptable for stratification.
unique_train = train.drop_duplicates(subset=["Image_ID"])
len(unique_train)

In [None]:
# Split the labelled data into a train and a validation subset.
# The split is made on unique Image_IDs (stratified by class_id), then every
# row of `train` is assigned to the subset its image belongs to.

# Set to True to reload the split saved by a previous run instead of
# re-splitting. Off by default: the split is always recomputed.
REUSE_SAVED_SPLIT = False

train_split_csv = INPUT_DATA_DIR / "Train_df.csv"
val_split_csv = INPUT_DATA_DIR / "Val_df.csv"

# Both files are required for a reload; a lone Val_df.csv is not enough.
if REUSE_SAVED_SPLIT and train_split_csv.exists() and val_split_csv.exists():
    print("Validation data already exists, loading from CSV")
    val_df = pd.read_csv(val_split_csv)
    train_df = pd.read_csv(train_split_csv)
    train_names = train_df["Image_ID"].unique()
    val_names = val_df["Image_ID"].unique()
else:
    train_names, val_names = train_test_split(
        unique_train["Image_ID"].values,
        stratify=unique_train["class_id"],
        test_size=TEST_SIZE,
        random_state=42,  # fixed seed so the split is reproducible
    )
    train_df = train[train["Image_ID"].isin(train_names)]
    val_df = train[train["Image_ID"].isin(val_names)]

# No image may end up in both subsets
assert set(train_names).isdisjoint(val_names), "train and validation images overlap"

In [None]:
# Preview the first rows of the training subset
train_df.head()

In [None]:
# (rows, columns) of the train subset, the validation subset and the test table
train_df.shape, val_df.shape, test.shape

In [None]:
# Preview the sample submission again
ss.head()

In [None]:
# Class distribution over all labelled rows (train + val together).
# The classes look imbalanced, which may need to be handled.
train['class'].value_counts().plot.bar(title='Train-Val Class Distribution')
plt.show()

In [None]:
# Class distribution of the training subset
train_df['class'].value_counts().plot.bar(title='Train - Class Distribution')
plt.show()

In [None]:
# Class distribution of the validation subset
val_df['class'].value_counts().plot.bar(title='Val - Class Distribution')
plt.show()

In [None]:
# Write the data.yaml file required by YOLO
class_names = sorted(train['class'].unique().tolist())
num_classes = len(class_names)

# Dataset root, absolute paths of the three image splits, and the class list
data_yaml = dict(
    path=str(DATASETS_DIR.absolute()),
    train=str(TRAIN_IMAGES_DIR.absolute()),
    val=str(VAL_IMAGES_DIR.absolute()),
    test=str(TEST_IMAGES_DIR.absolute()),
    nc=num_classes,
    names=class_names,
)

# The file is written to the current working directory
yaml_path = 'data.yaml'
with open(yaml_path, 'w') as file:
    yaml.dump(data_yaml, file, default_flow_style=False)

# Show the content that was written
data_yaml

In [None]:
# Number of validation image IDs and number of distinct ones (equal when there are no duplicates)
len(val_names), len(set(val_names))

In [None]:
# File stems (Image_ID without the extension) of the validation images
val_image_names = [Path(image_id).stem for image_id in val_df['Image_ID'].unique()]

In [None]:
# Number of validation image stems
len(val_image_names)

In [None]:
# List the files currently in TRAIN_IMAGES_DIR and compare their count
# with the number of image IDs assigned to the training subset
images_in_train_dir = os.listdir(TRAIN_IMAGES_DIR)
len(images_in_train_dir), len(train_names)

In [None]:
# File stems taken from the ImagePath column.
# NOTE(review): this uses the full `train` table, so the validation images are
# included as well — confirm whether `train_df` was intended here.
train_image_names = [Path(image_path).stem for image_path in train['ImagePath'].unique()]

In [None]:
# Number of image stems collected from the ImagePath column
len(train_image_names)

In [None]:
# Make sure the validation image and label directories exist before files are moved into them
for DIR in (VAL_IMAGES_DIR, VAL_LABELS_DIR):
    if DIR.exists():
        continue
    DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Move (not copy) every validation image out of the train split into the val
# split, together with its label file.
unlabelled_val_images = []
for image_name in tqdm(val_names):
    # Swap only the final extension for '.txt'. str.replace would rewrite
    # every occurrence of the extension inside the file name.
    label_name = str(Path(image_name).with_suffix('.txt'))

    image_src = TRAIN_IMAGES_DIR / image_name
    image_dst = VAL_IMAGES_DIR / image_name
    label_src = TRAIN_LABELS_DIR / label_name
    label_dst = VAL_LABELS_DIR / label_name

    if image_src.exists():
        shutil.move(str(image_src), str(image_dst))
    if not image_dst.exists():
        # The image is in neither directory: nothing to pair a label with.
        continue

    # The label is handled independently of the image move so that re-running
    # the cell after an interrupted run still brings the label across.
    if label_src.exists():
        shutil.move(str(label_src), str(label_dst))
    elif not label_dst.exists():
        unlabelled_val_images.append(image_name)

# Report missing labels once, instead of failing half-way through the move
if unlabelled_val_images:
    print(
        f"Warning: no label file found for {len(unlabelled_val_images)} "
        f"validation image(s), e.g. {unlabelled_val_images[:5]}"
    )

In [None]:
# Show 5 random rows of the training table (no seed is set, so the rows differ between runs)
train.sample(5)

In [None]:
# Save the train/validation subsets as CSV files in the input directory
# (without the DataFrame index), so the split can be read back later.
train_df.to_csv(INPUT_DATA_DIR / "Train_df.csv", index=False)
val_df.to_csv(INPUT_DATA_DIR / "Val_df.csv", index=False)