<a href="https://colab.research.google.com/github/MoizAhmed2517/Deep_Learning_Projects/blob/main/Polyp_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Polyp Detection - Using YOLOv5

In [1]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et
import zipfile 
from google.colab import drive
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder
from shutil import move

In [2]:
os.chdir("/content/drive/MyDrive/Polyp Detection")

In [3]:
!pwd

/content/drive/MyDrive/Polyp Detection


In [4]:
!ls

'Polyp Detection.ipynb'


In [5]:
!git clone https://github.com/ultralytics/yolov5.git

Cloning into 'yolov5'...
remote: Enumerating objects: 14887, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 14887 (delta 0), reused 0 (delta 0), pack-reused 14882[K
Receiving objects: 100% (14887/14887), 13.86 MiB | 6.83 MiB/s, done.
Resolving deltas: 100% (10246/10246), done.


In [7]:
zip_ref = zipfile.ZipFile("/content/drive/MyDrive/Polyp Detection/dataset.zip")
zip_ref.extractall()
zip_ref.close()

In [8]:
os.chdir('yolov5')

In [9]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gitpython
  Downloading GitPython-3.1.29-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 4.2 MB/s 
Collecting thop>=0.1.1
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Collecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.10-py3-none-any.whl (62 kB)
[K     |████████████████████████████████| 62 kB 1.5 MB/s 
[?25hCollecting smmap<6,>=3.0.1
  Downloading smmap-5.0.0-py3-none-any.whl (24 kB)
Collecting jedi>=0.10
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 55.5 MB/s 
Installing collected packages: smmap, jedi, gitdb, thop, gitpython
Successfully installed gitdb-4.0.10 gitpython-3.1.29 jedi-0.18.2 smmap-5.0.0 thop-0.1.1.post2209072238


## Data Preparation - Reading XML Files

In [10]:
xml_lst = glob("/content/drive/MyDrive/Polyp Detection/dataset/*.xml")

In [13]:
xml_lst[:5]

['/content/drive/MyDrive/Polyp Detection/dataset/1.xml',
 '/content/drive/MyDrive/Polyp Detection/dataset/10.xml',
 '/content/drive/MyDrive/Polyp Detection/dataset/100.xml',
 '/content/drive/MyDrive/Polyp Detection/dataset/1000.xml',
 '/content/drive/MyDrive/Polyp Detection/dataset/101.xml']

In [16]:
tree = et.parse(xml_lst[1])
root = tree.getroot()

In [17]:
for child in root:
  print(child.tag, child.attrib)

folder {}
filename {}
path {}
source {}
size {}
object {}
object {}


In [18]:
for object_xml in root.findall('object'):
  name = object_xml.find('name').text
  xmin = object_xml.find('bndbox/xmin').text
  ymin = object_xml.find('bndbox/ymin').text
  xmax = object_xml.find('bndbox/xmax').text
  ymax = object_xml.find('bndbox/ymax').text
  print(name, xmin, xmax, ymin, ymax)

polyp 270 362 247 391
polyp 45 359 209 446


In [19]:
filename = root.find('filename').text
filename

'10.jpg'

In [20]:
width = root.find('size/width').text
height = root.find('size/height').text

print(width, height)

611 530


### Making final helper function for parsing XML files & making their list and converting into dataframe for furthur use.

In [21]:
def xml_parser(filename):
    tree = et.parse(filename)
    root = tree.getroot()
    filename = root.find('filename').text
    width = root.find('size/width').text
    height = root.find('size/height').text
    parser = []

    for object_xml in root.findall('object'):
      name = object_xml.find('name').text
      xmin = object_xml.find('bndbox/xmin').text
      ymin = object_xml.find('bndbox/ymin').text
      xmax = object_xml.find('bndbox/xmax').text
      ymax = object_xml.find('bndbox/ymax').text
      parser.append([filename, width, height, name, xmin, xmax, ymin, ymax])

    return parser

In [22]:
xml_parser(xml_lst[1])

[['10.jpg', '611', '530', 'polyp', '270', '362', '247', '391'],
 ['10.jpg', '611', '530', 'polyp', '45', '359', '209', '446']]

In [23]:
xml_parser(xml_lst[100])

[['189.jpg', '543', '528', 'polyp', '215', '315', '224', '328']]

In [31]:
parsed_all_xml_files = list(map(xml_parser, xml_lst))

In [32]:
data = reduce(lambda x,y : x+y, parsed_all_xml_files)
df = pd.DataFrame(data, columns=['filename', 'width', 'height', 'name', 'xmin', 'xmax', 'ymin', 'ymax'])

In [33]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,1.jpg,619,529,polyp,243,427,19,142
1,1.jpg,619,529,polyp,187,256,224,347
2,1.jpg,619,529,polyp,345,412,159,267
3,1.jpg,619,529,polyp,229,321,303,416
4,1.jpg,619,529,polyp,340,382,407,459


In [34]:
df.shape

(1460, 8)

In [36]:
df['name'].value_counts()

polyp    1460
Name: name, dtype: int64

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  1460 non-null   object
 1   width     1460 non-null   object
 2   height    1460 non-null   object
 3   name      1460 non-null   object
 4   xmin      1460 non-null   object
 5   xmax      1460 non-null   object
 6   ymin      1460 non-null   object
 7   ymax      1460 non-null   object
dtypes: object(8)
memory usage: 91.4+ KB


In [38]:
df_updated = df.copy()
df_updated['width'] = df_updated['width'].astype("int64")
df_updated['height'] = df_updated['height'].astype("int64")
df_updated['xmin'] = df_updated['xmin'].astype("int64")
df_updated['xmax'] = df_updated['xmax'].astype("int64")
df_updated['ymin'] = df_updated['ymin'].astype("int64")
df_updated['ymax'] = df_updated['ymax'].astype("int64")
df_updated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  1460 non-null   object
 1   width     1460 non-null   int64 
 2   height    1460 non-null   int64 
 3   name      1460 non-null   object
 4   xmin      1460 non-null   int64 
 5   xmax      1460 non-null   int64 
 6   ymin      1460 non-null   int64 
 7   ymax      1460 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 91.4+ KB


In [39]:
df_updated['center_x'] = np.round((df_updated['xmin'] + df_updated['xmax'] / 2) / df_updated['width'], 3)
df_updated['center_y'] = np.round((df_updated['ymin'] + df_updated['ymax'] / 2) / df_updated['height'], 3)
df_updated['width_norm'] = np.round((df_updated['xmax'] - df_updated['xmin']) / df_updated['width'], 3)
df_updated['height_norm'] = np.round((df_updated['ymax'] - df_updated['ymin']) / df_updated['height'], 3)

In [64]:
#required labelling for YOLO -- Normalization of data as well
df_updated.head(30)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,width_norm,height_norm
0,1.jpg,619,529,polyp,243,427,19,142,0.737,0.17,0.297,0.233
1,1.jpg,619,529,polyp,187,256,224,347,0.509,0.751,0.111,0.233
2,1.jpg,619,529,polyp,345,412,159,267,0.89,0.553,0.108,0.204
3,1.jpg,619,529,polyp,229,321,303,416,0.629,0.966,0.149,0.214
4,1.jpg,619,529,polyp,340,382,407,459,0.858,1.203,0.068,0.098
5,10.jpg,611,530,polyp,270,362,247,391,0.738,0.835,0.151,0.272
6,10.jpg,611,530,polyp,45,359,209,446,0.367,0.815,0.514,0.447
7,100.jpg,622,528,polyp,401,513,183,340,1.057,0.669,0.18,0.297
8,100.jpg,622,528,polyp,352,430,106,186,0.912,0.377,0.125,0.152
9,1000.jpg,1348,1070,polyp,217,914,369,1035,0.5,0.829,0.517,0.622


In [41]:
images = df['filename'].unique()
len(images)

997

In [42]:
# Splitting in to 80%:20% percent ratios with random shuffle
img_df = pd.DataFrame(images, columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename'])
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename'])

In [43]:
train_df = df_updated.query(f"filename in {img_train}")
test_df = df_updated.query(f"filename in {img_test}")

In [46]:
len(train_df), len(test_df)

(1172, 288)

In [51]:
train_df = train_df[train_df['name'] == 'polyp'].assign(id=1) 
train_df

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,width_norm,height_norm,id
5,10.jpg,611,530,polyp,270,362,247,391,0.738,0.835,0.151,0.272,1
6,10.jpg,611,530,polyp,45,359,209,446,0.367,0.815,0.514,0.447,1
7,100.jpg,622,528,polyp,401,513,183,340,1.057,0.669,0.180,0.297,1
8,100.jpg,622,528,polyp,352,430,106,186,0.912,0.377,0.125,0.152,1
10,101.jpg,626,547,polyp,259,355,112,201,0.697,0.388,0.153,0.163,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,994.jpg,621,531,polyp,488,607,268,399,1.275,0.880,0.192,0.247,1
1454,995.jpg,720,576,polyp,321,401,295,363,0.724,0.827,0.111,0.118,1
1457,997.jpg,600,528,polyp,287,391,304,427,0.804,0.980,0.173,0.233,1
1458,998.jpg,720,576,polyp,347,483,208,328,0.817,0.646,0.189,0.208,1


In [53]:
test_df = test_df[test_df['name'] == 'polyp'].assign(id=1)
test_df

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,width_norm,height_norm,id
0,1.jpg,619,529,polyp,243,427,19,142,0.737,0.170,0.297,0.233,1
1,1.jpg,619,529,polyp,187,256,224,347,0.509,0.751,0.111,0.233,1
2,1.jpg,619,529,polyp,345,412,159,267,0.890,0.553,0.108,0.204,1
3,1.jpg,619,529,polyp,229,321,303,416,0.629,0.966,0.149,0.214,1
4,1.jpg,619,529,polyp,340,382,407,459,0.858,1.203,0.068,0.098,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1436,984.jpg,720,576,polyp,354,355,316,319,0.738,0.826,0.001,0.005,1
1445,988.jpg,720,576,polyp,327,546,147,486,0.833,0.677,0.304,0.589,1
1446,989.jpg,720,576,polyp,243,429,194,488,0.635,0.760,0.258,0.510,1
1455,996.jpg,585,530,polyp,294,423,4,199,0.864,0.195,0.221,0.368,1


In [54]:
train_folder = '/content/drive/MyDrive/Polyp Detection/dataset/train'
test_folder = '/content/drive/MyDrive/Polyp Detection/dataset/test'

os.mkdir(train_folder)
os.mkdir(test_folder)

In [56]:
cols = ['filename', 'id', 'center_x',	'center_y',	'width_norm',	'height_norm']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [60]:
def save_data(filename, folder_path, group_obj):
  src = os.path.join('/content/drive/MyDrive/Polyp Detection/dataset', filename)
  dst = os.path.join(folder_path, filename)
  move(src, dst)

  text_filename = os.path.join(folder_path, os.path.splitext(filename)[0] + '.txt')
  group_obj.get_group(filename).set_index('filename').to_csv(text_filename, sep=' ', index=False, header=False)

In [61]:
filename_series = pd.Series(groupby_obj_train.groups.keys())
filename_series.apply(save_data, args=(train_folder, groupby_obj_train))

0      None
1      None
2      None
3      None
4      None
       ... 
793    None
794    None
795    None
796    None
797    None
Length: 798, dtype: object

In [62]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data, args=(test_folder, groupby_obj_test))

0      None
1      None
2      None
3      None
4      None
       ... 
194    None
195    None
196    None
197    None
198    None
Length: 199, dtype: object

In [63]:
for i in os.listdir('/content/drive/MyDrive/Polyp Detection/dataset'):
  if os.path.splitext(i)[1] == '.xml':
    path = '/content/drive/MyDrive/Polyp Detection/dataset/' + i
    os.remove(path)