<a href="https://colab.research.google.com/github/MoizAhmed2517/Deep_Learning_Projects/blob/main/Polyp_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Polyp Detection - Using YOLOv5

In [7]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et
import zipfile 
from google.colab import drive
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder
from shutil import move

In [8]:
os.chdir("/content/drive/MyDrive/Polyp Detection")

In [9]:
!pwd

/content/drive/MyDrive/Polyp Detection


In [10]:
!ls

 dataset.zip  'Polyp Detection.ipynb'   yolov5


In [5]:
!git clone https://github.com/ultralytics/yolov5.git

Cloning into 'yolov5'...
remote: Enumerating objects: 14887, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 14887 (delta 0), reused 0 (delta 0), pack-reused 14882[K
Receiving objects: 100% (14887/14887), 13.86 MiB | 6.83 MiB/s, done.
Resolving deltas: 100% (10246/10246), done.


In [11]:
zip_ref = zipfile.ZipFile("/content/drive/MyDrive/Polyp Detection/dataset.zip")
zip_ref.extractall()
zip_ref.close()

## Data Preparation - Reading XML Files

In [12]:
xml_lst = glob("/content/drive/MyDrive/Polyp Detection/dataset/*.xml")

In [13]:
xml_lst[:5]

['/content/drive/MyDrive/Polyp Detection/dataset/1.xml',
 '/content/drive/MyDrive/Polyp Detection/dataset/10.xml',
 '/content/drive/MyDrive/Polyp Detection/dataset/100.xml',
 '/content/drive/MyDrive/Polyp Detection/dataset/1000.xml',
 '/content/drive/MyDrive/Polyp Detection/dataset/101.xml']

In [14]:
tree = et.parse(xml_lst[1])
root = tree.getroot()

In [15]:
for child in root:
  print(child.tag, child.attrib)

folder {}
filename {}
path {}
source {}
size {}
object {}
object {}


In [16]:
for object_xml in root.findall('object'):
  name = object_xml.find('name').text
  xmin = object_xml.find('bndbox/xmin').text
  ymin = object_xml.find('bndbox/ymin').text
  xmax = object_xml.find('bndbox/xmax').text
  ymax = object_xml.find('bndbox/ymax').text
  print(name, xmin, xmax, ymin, ymax)

polyp 270 362 247 391
polyp 45 359 209 446


In [17]:
filename = root.find('filename').text
filename

'10.jpg'

In [18]:
width = root.find('size/width').text
height = root.find('size/height').text

print(width, height)

611 530


### Making final helper function for parsing XML files & making their list and converting into dataframe for furthur use.

In [19]:
def xml_parser(filename):
    tree = et.parse(filename)
    root = tree.getroot()
    filename = root.find('filename').text
    width = root.find('size/width').text
    height = root.find('size/height').text
    parser = []

    for object_xml in root.findall('object'):
      name = object_xml.find('name').text
      xmin = object_xml.find('bndbox/xmin').text
      ymin = object_xml.find('bndbox/ymin').text
      xmax = object_xml.find('bndbox/xmax').text
      ymax = object_xml.find('bndbox/ymax').text
      parser.append([filename, width, height, name, xmin, xmax, ymin, ymax])

    return parser

In [20]:
xml_parser(xml_lst[1])

[['10.jpg', '611', '530', 'polyp', '270', '362', '247', '391'],
 ['10.jpg', '611', '530', 'polyp', '45', '359', '209', '446']]

In [21]:
xml_parser(xml_lst[100])

[['189.jpg', '543', '528', 'polyp', '215', '315', '224', '328']]

In [22]:
parsed_all_xml_files = list(map(xml_parser, xml_lst))

In [23]:
data = reduce(lambda x,y : x+y, parsed_all_xml_files)
df = pd.DataFrame(data, columns=['filename', 'width', 'height', 'name', 'xmin', 'xmax', 'ymin', 'ymax'])

In [24]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,1.jpg,619,529,polyp,243,427,19,142
1,1.jpg,619,529,polyp,187,256,224,347
2,1.jpg,619,529,polyp,345,412,159,267
3,1.jpg,619,529,polyp,229,321,303,416
4,1.jpg,619,529,polyp,340,382,407,459


In [25]:
df.shape

(1460, 8)

In [26]:
df['name'].value_counts()

polyp    1460
Name: name, dtype: int64

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  1460 non-null   object
 1   width     1460 non-null   object
 2   height    1460 non-null   object
 3   name      1460 non-null   object
 4   xmin      1460 non-null   object
 5   xmax      1460 non-null   object
 6   ymin      1460 non-null   object
 7   ymax      1460 non-null   object
dtypes: object(8)
memory usage: 91.4+ KB


In [28]:
df_updated = df.copy()
df_updated['width'] = df_updated['width'].astype("int64")
df_updated['height'] = df_updated['height'].astype("int64")
df_updated['xmin'] = df_updated['xmin'].astype("int64")
df_updated['xmax'] = df_updated['xmax'].astype("int64")
df_updated['ymin'] = df_updated['ymin'].astype("int64")
df_updated['ymax'] = df_updated['ymax'].astype("int64")
df_updated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  1460 non-null   object
 1   width     1460 non-null   int64 
 2   height    1460 non-null   int64 
 3   name      1460 non-null   object
 4   xmin      1460 non-null   int64 
 5   xmax      1460 non-null   int64 
 6   ymin      1460 non-null   int64 
 7   ymax      1460 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 91.4+ KB


In [29]:
df_updated['center_x'] = np.round((df_updated['xmin'] + df_updated['xmax'] / 2) / df_updated['width'], 3)
df_updated['center_y'] = np.round((df_updated['ymin'] + df_updated['ymax'] / 2) / df_updated['height'], 3)
df_updated['width_norm'] = np.round((df_updated['xmax'] - df_updated['xmin']) / df_updated['width'], 3)
df_updated['height_norm'] = np.round((df_updated['ymax'] - df_updated['ymin']) / df_updated['height'], 3)

In [30]:
#required labelling for YOLO -- Normalization of data as well
df_updated.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,width_norm,height_norm
0,1.jpg,619,529,polyp,243,427,19,142,0.737,0.17,0.297,0.233
1,1.jpg,619,529,polyp,187,256,224,347,0.509,0.751,0.111,0.233
2,1.jpg,619,529,polyp,345,412,159,267,0.89,0.553,0.108,0.204
3,1.jpg,619,529,polyp,229,321,303,416,0.629,0.966,0.149,0.214
4,1.jpg,619,529,polyp,340,382,407,459,0.858,1.203,0.068,0.098


In [31]:
images = df['filename'].unique()
len(images)

997

In [32]:
# Splitting in to 80%:20% percent ratios with random shuffle
img_df = pd.DataFrame(images, columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename'])
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename'])

In [33]:
train_df = df_updated.query(f"filename in {img_train}")
test_df = df_updated.query(f"filename in {img_test}")

In [34]:
len(train_df), len(test_df)

(1190, 270)

In [35]:
train_df = train_df[train_df['name'] == 'polyp'].assign(id=0) 
train_df

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,width_norm,height_norm,id
0,1.jpg,619,529,polyp,243,427,19,142,0.737,0.170,0.297,0.233,0
1,1.jpg,619,529,polyp,187,256,224,347,0.509,0.751,0.111,0.233,0
2,1.jpg,619,529,polyp,345,412,159,267,0.890,0.553,0.108,0.204,0
3,1.jpg,619,529,polyp,229,321,303,416,0.629,0.966,0.149,0.214,0
4,1.jpg,619,529,polyp,340,382,407,459,0.858,1.203,0.068,0.098,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1452,993.jpg,720,576,polyp,365,488,387,510,0.846,1.115,0.171,0.214,0
1454,995.jpg,720,576,polyp,321,401,295,363,0.724,0.827,0.111,0.118,0
1455,996.jpg,585,530,polyp,294,423,4,199,0.864,0.195,0.221,0.368,0
1456,996.jpg,585,530,polyp,430,500,380,469,1.162,1.159,0.120,0.168,0


In [36]:
test_df = test_df[test_df['name'] == 'polyp'].assign(id=0)
test_df

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,width_norm,height_norm,id
9,1000.jpg,1348,1070,polyp,217,914,369,1035,0.500,0.829,0.517,0.622,0
15,105.jpg,622,531,polyp,289,374,214,301,0.765,0.686,0.137,0.164,0
32,116.jpg,622,531,polyp,410,545,35,224,1.097,0.277,0.217,0.356,0
33,116.jpg,622,531,polyp,432,586,111,297,1.166,0.489,0.248,0.350,0
38,12.jpg,626,547,polyp,185,336,94,213,0.564,0.367,0.241,0.218,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1436,984.jpg,720,576,polyp,354,355,316,319,0.738,0.826,0.001,0.005,0
1444,987.jpg,1920,1072,polyp,652,999,401,768,0.600,0.732,0.181,0.342,0
1453,994.jpg,621,531,polyp,488,607,268,399,1.275,0.880,0.192,0.247,0
1457,997.jpg,600,528,polyp,287,391,304,427,0.804,0.980,0.173,0.233,0


In [37]:
train_folder = '/content/drive/MyDrive/Polyp Detection/dataset/train'
test_folder = '/content/drive/MyDrive/Polyp Detection/dataset/test'

os.mkdir(train_folder)
os.mkdir(test_folder)

In [38]:
cols = ['filename', 'id', 'center_x',	'center_y',	'width_norm',	'height_norm']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [39]:
def save_data(filename, folder_path, group_obj):
  src = os.path.join('/content/drive/MyDrive/Polyp Detection/dataset', filename)
  dst = os.path.join(folder_path, filename)
  move(src, dst)

  text_filename = os.path.join(folder_path, os.path.splitext(filename)[0] + '.txt')
  group_obj.get_group(filename).set_index('filename').to_csv(text_filename, sep=' ', index=False, header=False)

In [40]:
filename_series = pd.Series(groupby_obj_train.groups.keys())
filename_series.apply(save_data, args=(train_folder, groupby_obj_train))

0      None
1      None
2      None
3      None
4      None
       ... 
793    None
794    None
795    None
796    None
797    None
Length: 798, dtype: object

In [41]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data, args=(test_folder, groupby_obj_test))

0      None
1      None
2      None
3      None
4      None
       ... 
194    None
195    None
196    None
197    None
198    None
Length: 199, dtype: object

In [42]:
for i in os.listdir('/content/drive/MyDrive/Polyp Detection/dataset'):
  if os.path.splitext(i)[1] == '.xml':
    path = '/content/drive/MyDrive/Polyp Detection/dataset/' + i
    os.remove(path)

In [43]:
!pwd

/content/drive/MyDrive/Polyp Detection


In [44]:
os.chdir('/content/drive/MyDrive/Polyp Detection/yolov5') 

In [45]:
!ls

benchmarks.py	 data.yaml   models	       runs	       utils
CITATION.cff	 detect.py   __pycache__       segment	       val.py
classify	 export.py   README.md	       setup.cfg       yolov5s.pt
CONTRIBUTING.md  hubconf.py  README.zh-CN.md   train.py
data		 LICENSE     requirements.txt  tutorial.ipynb


In [46]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [47]:
!python train.py --data data.yaml --cfg yolov5s.yaml --batch-size 8 --name Model --img 640 --epochs 10

[34m[1mtrain: [0mweights=yolov5s.pt, cfg=yolov5s.yaml, data=data.yaml, hyp=data/hyps/hyp.scratch-low.yaml, epochs=10, batch_size=8, imgsz=640, rect=False, resume=False, nosave=False, noval=False, noautoanchor=False, noplots=False, evolve=None, bucket=, cache=None, image_weights=False, device=, multi_scale=False, single_cls=False, optimizer=SGD, sync_bn=False, workers=8, project=runs/train, name=Model, exist_ok=False, quad=False, cos_lr=False, label_smoothing=0.0, patience=100, freeze=[0], save_period=-1, seed=0, local_rank=-1, entity=None, upload_dataset=False, bbox_interval=-1, artifact_alias=latest
[34m[1mgithub: [0mup to date with https://github.com/ultralytics/yolov5 ✅
fatal: cannot change to '/content/drive/MyDrive/Polyp': No such file or directory
YOLOv5 🚀 2022-12-26 Python-3.8.16 torch-1.13.0+cu116 CUDA:0 (Tesla T4, 15110MiB)

[34m[1mhyperparameters: [0mlr0=0.01, lrf=0.01, momentum=0.937, weight_decay=0.0005, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1, b

In [48]:
!python train.py --data data.yaml --weights yolov5x.pt --batch 8 --name Model --img 640 --epochs 5

[34m[1mtrain: [0mweights=yolov5x.pt, cfg=, data=data.yaml, hyp=data/hyps/hyp.scratch-low.yaml, epochs=5, batch_size=8, imgsz=640, rect=False, resume=False, nosave=False, noval=False, noautoanchor=False, noplots=False, evolve=None, bucket=, cache=None, image_weights=False, device=, multi_scale=False, single_cls=False, optimizer=SGD, sync_bn=False, workers=8, project=runs/train, name=Model, exist_ok=False, quad=False, cos_lr=False, label_smoothing=0.0, patience=100, freeze=[0], save_period=-1, seed=0, local_rank=-1, entity=None, upload_dataset=False, bbox_interval=-1, artifact_alias=latest
[34m[1mgithub: [0mup to date with https://github.com/ultralytics/yolov5 ✅
fatal: cannot change to '/content/drive/MyDrive/Polyp': No such file or directory
YOLOv5 🚀 2022-12-26 Python-3.8.16 torch-1.13.0+cu116 CUDA:0 (Tesla T4, 15110MiB)

[34m[1mhyperparameters: [0mlr0=0.01, lrf=0.01, momentum=0.937, weight_decay=0.0005, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1, box=0.05, cls=