### Downloading Dataset

First, we install the AIcrowd CLI (a Python library), which lets us download the dataset with nothing more than an API key.

In [None]:
%%capture
!pip install -qq aicrowd-cli
%load_ext aicrowd.magic

#!pip install -qU wandb

%aicrowd login --api-key ba03f6555248226a41217fd4e9246e65

!rm -rf data
!mkdir data
%aicrowd ds dl -c mask-prediction -o data

!unzip data/train.zip -d data/ > /dev/null
!unzip data/val.zip -d data/ > /dev/null
!unzip data/test.zip -d data/ > /dev/null

# Importing Libraries

In [None]:
import numpy as np, pandas as pd
from glob import glob
import shutil, os
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import seaborn as sns

In [None]:
# Image size setting. NOTE(review): `dim` is not referenced by name in the
# visible cells; the value 512 is repeated literally elsewhere — confirm they
# are meant to stay in sync.
dim = 512
# fold = 0 # doing with fold 0 switching to sklearn train test split

# Diving into the dataset 🕵️‍♂️

In [None]:
# Directories of the extracted images for each split, relative to the working
# directory. NOTE(review): these names are not used by the visible cells, which
# build image paths through the get_*_path helpers instead.
train_images = './data/train'
val_images = './data/val'
test_images = './data/test'

In [None]:
# Annotation tables for the train and validation splits.
train_df = pd.read_csv("./data/train.csv")
val_df = pd.read_csv("./data/val.csv")

# The sample submission is loaded as the test table; its ImageID column is
# used later to build the test image paths.
test_df = pd.read_csv("./data/sample_submission.csv")

In [None]:
# (rows, columns) of the train and validation annotation tables.
train_df.shape, val_df.shape

In [None]:
def get_train_path(image_id):
  """Return the relative path of the training image with the given id."""
  return 'data/train/{}.jpg'.format(image_id)

def get_valid_path(image_id):
  """Return the relative path of the validation image with the given id."""
  return 'data/val/{}.jpg'.format(image_id)

def get_test_path(image_id):
  """Return the relative path of the test image with the given id."""
  return 'data/test/{}.jpg'.format(image_id)

In [None]:
# Derive each table's image_path column from its ImageID column, using the
# path helper that matches the split.
train_df['image_path'] = train_df['ImageID'].map(get_train_path)
val_df['image_path'] = val_df['ImageID'].map(get_valid_path)
test_df['image_path'] = test_df['ImageID'].map(get_test_path)

In [None]:
# Stack the train and validation annotations into one frame with a fresh
# 0..n-1 index; folds are assigned on this combined frame further down.
train = pd.concat([train_df, val_df], ignore_index=True)

In [None]:
train.head(1)  # bbox column order: xmin, ymin, xmax, ymax

In [None]:
# Each bbox value is a string such as "[a, b, c, d]": drop the first and last
# character (the brackets) and split on commas once, then pick the pieces.
# The pieces are still strings here; they are cast to int in a later cell.
bbox_parts = train.bbox.apply(lambda box: box[1:-1].split(','))

xmin = bbox_parts.apply(lambda parts: parts[0])
ymin = bbox_parts.apply(lambda parts: parts[1])
xmax = bbox_parts.apply(lambda parts: parts[2])
ymax = bbox_parts.apply(lambda parts: parts[3])

In [None]:
# Store the parsed box corners as columns of the combined frame.
train['xmin'] = xmin
train['ymin'] = ymin
train['xmax'] = xmax
train['ymax'] = ymax

# Width and height are constant (512) for every row. A scalar assignment
# broadcasts to the frame's actual length, so this no longer assumes exactly
# 8000 rows (a list of the wrong length raises ValueError).
train['width'] = 512
train['height'] = 512

In [None]:
# The box corners were parsed as strings; convert each of them to integers.
for box_col in ('xmin', 'ymin', 'xmax', 'ymax'):
    train[box_col] = train[box_col].astype('int')

In [None]:
# Inspect the first rows after adding the box, width and height columns.
train.head(2)

In [None]:
# Number of rows per mask type.
train.masktype.value_counts()

# Preprocessing

In [None]:
# Convert the box corners to fractions of the image width/height.
# Column arithmetic replaces the row-by-row `apply(axis=1)`; it yields the same
# values without a Python-level loop over the rows.
# WARNING: this cell overwrites xmin/ymin/xmax/ymax in place, so running it a
# second time divides the already-normalised values again.
train['xmin'] = train['xmin'] / train['width']
train['ymin'] = train['ymin'] / train['height']

train['xmax'] = train['xmax'] / train['width']
train['ymax'] = train['ymax'] / train['height']

# Box centre.
train['xmid'] = (train['xmax'] + train['xmin']) / 2
train['ymid'] = (train['ymax'] + train['ymin']) / 2

# Box extent and area, all in normalised units.
train['w'] = train['xmax'] - train['xmin']
train['h'] = train['ymax'] - train['ymin']

train['area'] = train['w'] * train['h']
train.head()

In [None]:
# Distinct mask-type labels present in the data.
train.masktype.unique()

In [None]:
def label_mapper(row):
    """Translate a mask-type name into its integer class id.

    'N95' -> 0, 'surgical' -> 1, 'cloth' -> 2, 'KN95' -> 3.
    Any other value is returned unchanged.
    """
    class_ids = {'N95': 0, 'surgical': 1, 'cloth': 2, 'KN95': 3}
    for mask_name, class_id in class_ids.items():
        if row == mask_name:
            return class_id
    # Unknown label: hand the input back untouched.
    return row

# Integer class id for every row, derived from its mask-type name.
train['class_id'] = train['masktype'].map(label_mapper)

In [None]:
# Box-geometry columns (X) and the class id target (y).
# NOTE(review): X and y are not used by the later visible cells.
features = ['xmin', 'ymin', 'xmax', 'ymax', 'xmid', 'ymid', 'w', 'h', 'area']
X = train[features]
y = train['class_id']
X.shape, y.shape

In [None]:
# Assign every row of `train` to one of 5 folds, stratified on class_id so each
# fold keeps a similar class mix. The fixed random_state makes the split
# reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialise the column on `train` itself — the frame that is being split.
# (It was previously created on `train_df`, which this cell never reads.)
train['fold'] = -1
for fold, (train_idx, val_idx) in enumerate(skf.split(train, train.class_id)):
    # A row's fold is the split in which it lands on the validation side.
    train.loc[val_idx, 'fold'] = fold
train['fold'] = train['fold'].astype('int')
# NOTE: `fold` stays bound to the last fold index (4) after the loop.
train.head()

In [None]:
# Row counts per (fold, class_id) pair, to check the class mix of each fold.
print(train.groupby(['fold', train.class_id]).size())

In [None]:
# Hold out one fold for validation and train on the rest.
# The held-out fold is the highest fold index. That is the value the old code
# picked up implicitly from the fold-assignment loop's leftover `fold` variable;
# deriving it from the data removes the dependency on that hidden state.
val_fold = int(train['fold'].max())

is_val = train['fold'] == val_fold
val_files = list(train.loc[is_val, 'image_path'].unique())
train_files = list(train.loc[~is_val, 'image_path'].unique())
len(train_files), len(val_files)

In [None]:
# Peek at the first row of the combined frame.
train.head(1)

In [None]:
# Directory that receives one label .txt file per image (written in the next cell).
os.makedirs('/kaggle/working/blitz-yolo-labels-dataset/labels', exist_ok = True)

In [None]:
# Write one label file per image, one line per box:
#   "<class_id> <xmid> <ymid> <w> <h>"
# Grouping once by ImageID avoids re-filtering the whole frame for every image.
# The class id is written as an integer; the old `.values` call mixed the int
# and float columns into one float array, so it came out as e.g. "0.0".
label_columns = ['class_id', 'xmid', 'ymid', 'w', 'h']
for img, boxes in tqdm(train.groupby('ImageID', sort=False)[label_columns]):
    with open(f'/kaggle/working/blitz-yolo-labels-dataset/labels/{img}.txt', 'w') as f:
        for class_id, xmid, ymid, w, h in boxes.itertuples(index=False, name=None):
            f.write(f'{int(class_id)} {xmid} {ymid} {w} {h}\n')

In [None]:
# Sanity-check the generated labels. This uses the same absolute directory the
# label files were written to, so it does not depend on the current working
# directory.
labels_root = '/kaggle/working/blitz-yolo-labels-dataset/labels'

# The listing used to be computed and thrown away; report how many files exist.
print(len(os.listdir(labels_root)), 'label files')

# Show the contents of one sample label file.
with open(os.path.join(labels_root, 'k8o0f.txt')) as f:
    print(f.read())

In [None]:
# Peek at the first row of the combined frame.
train.head(1)

In [None]:
label_dir = '/kaggle/working/blitz-yolo-labels-dataset/labels'


def _copy_split(image_paths, split):
    """Copy images and their label files into the blitz dataset tree.

    image_paths: iterable of image file paths belonging to the split.
    split: sub-folder name, 'train' or 'val'.

    Images go to /kaggle/working/blitz/images/<split>, and the label file with
    the same stem (looked up in `label_dir`) goes to
    /kaggle/working/blitz/labels/<split>. Raises FileNotFoundError if an image
    or its label file is missing.
    """
    images_out = f'/kaggle/working/blitz/images/{split}'
    labels_out = f'/kaggle/working/blitz/labels/{split}'
    os.makedirs(images_out, exist_ok=True)
    os.makedirs(labels_out, exist_ok=True)

    for image_path in tqdm(image_paths):
        shutil.copy(image_path, images_out)
        # Strip only the final extension, so an id that itself contains a dot
        # still resolves to the right label file.
        stem = os.path.splitext(os.path.basename(image_path))[0]
        shutil.copy(os.path.join(label_dir, stem + '.txt'), labels_out)


_copy_split(train_files, 'train')
_copy_split(val_files, 'val')

# YOLOv5 setup

In [None]:
# Distinct mask-type labels, for reference when listing the class names below.
train.masktype.unique()

In [None]:
from os import listdir
from os.path import isfile, join
import yaml

cwd = '/kaggle/working/'

# train.txt / val.txt list one image path per line. The glob results are sorted
# so the files are identical from run to run (glob order is not guaranteed).
with open(join(cwd, 'train.txt'), 'w') as f:
    for path in sorted(glob('/kaggle/working/blitz/images/train/*')):
        f.write(path + '\n')

with open(join(cwd, 'val.txt'), 'w') as f:
    for path in sorted(glob('/kaggle/working/blitz/images/val/*')):
        f.write(path + '\n')

# Dataset description: the two image lists, the number of classes, and the
# class names in class-id order (0..3, matching label_mapper).
data = dict(
    train = join(cwd, 'train.txt'),
    val   = join(cwd, 'val.txt'),
    nc    = 4,
    names = ['N95', 'surgical', 'cloth', 'KN95']
    )

with open(join(cwd, 'blitz.yaml'), 'w') as outfile:
    yaml.dump(data, outfile, default_flow_style=False)

# Echo the file that was just written. The `with` block closes the handle,
# which the previous bare open() never did.
with open(join(cwd, 'blitz.yaml'), 'r') as f:
    print('\nyaml:')
    print(f.read())

In [None]:
# https://www.kaggle.com/ultralytics/yolov5
# !git clone https://github.com/ultralytics/yolov5  # clone repo
# %cd yolov5
# Copy the YOLOv5 sources from the input dataset into the working directory.
# shutil.copytree raises if the destination already exists, so the copy is
# skipped when the cell is re-run.
yolo_dir = '/kaggle/working/yolov5'
if not os.path.isdir(yolo_dir):
    shutil.copytree('/kaggle/input/yolov5-official-v31-dataset/yolov5', yolo_dir)
# Later commands (train.py) are run from inside the repository.
os.chdir(yolo_dir)
# %pip install -qr requirements.txt # install dependencies

import torch
from IPython.display import Image, clear_output  # to display images

clear_output()
# Report the torch version and the first CUDA device, or 'CPU' when no GPU is available.
print('Setup complete. Using torch %s %s' % (torch.__version__, torch.cuda.get_device_properties(0) if torch.cuda.is_available() else 'CPU'))

# TRAIN

In [None]:
# Run train.py from the current directory on the dataset described by
# /kaggle/working/blitz.yaml, starting from the yolov5x.pt weights:
# image size 512, batch size 16, 30 epochs, with --cache enabled.
# WANDB_MODE="dryrun" is set for this command only — presumably to keep
# Weights & Biases from uploading the run; confirm against the wandb docs.
!WANDB_MODE="dryrun" python train.py --img 512 --batch 16 --epochs 30 --data /kaggle/working/blitz.yaml --weights yolov5x.pt --cache