## Brief EDA
Create pandas-profiling report to see if the labels are balanced, check if image height and width all the same for all images, check if each image corresponds to only one label.

In [None]:
%load_ext autoreload
%autoreload 2

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Install pandas-profiling package used for EDA

In [None]:
! pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip 

Collecting https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
[?25l  Downloading https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
[K     \ 38.2MB 1.9MB/s
Collecting confuse>=1.0.0
  Downloading https://files.pythonhosted.org/packages/6d/55/b4726d81e5d6509fa3441f770f8a9524612627dc1b2a7d6209d1d20083fe/confuse-1.4.0-py2.py3-none-any.whl
Collecting visions[type_image_path]==0.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/98/30/b1e70bc55962239c4c3c9660e892be2d8247a882135a3035c10ff7f02cde/visions-0.6.0-py3-none-any.whl (75kB)
[K     |████████████████████████████████| 81kB 5.7MB/s 
Collecting htmlmin>=0.1.12
  Downloading https://files.pythonhosted.org/packages/b3/e7/fcd59e12169de19f0131ff2812077f964c6b960e7c09804d30a7bf2ab461/htmlmin-0.1.12.tar.gz
Collecting phik>=0.10.0
[?25l  Downloading https://files.pythonhosted.org/packages/a7/44/13c8ba410f8bf9ba54aa678e18f23f73a270235b6fd74f8be8b7d13e4de8/phik-0.11.1.tar.gz (596kB)
[K  

In [2]:
import sys
import os
sys.path.append('/content/drive/MyDrive/gcolab/bacteria_segmentation')
os.chdir('/content/drive/MyDrive/gcolab/bacteria_segmentation')

In [3]:
import json
import numpy as np
import pandas as pd

## Constants:
- abbreviated labels — these labels will be used in future
- labels with full names (used in .json files descriptioning masks)
- dictionary mapping full labels names to abbreviated ones
- paths to data and etc. (file extensions, ...)

In [None]:
LABELS = ['epidermidis', 'pneumoniae', 'aureus', 'moraxella', 'kefir', 'cloacae']
LABELS_FULL = ['staphylococcus_epidermidis', 'klebsiella_pneumoniae', 'staphylococcus_aureus',
               'moraxella_catarrhalis', 'c_kefir', 'ent_cloacae']

LABELS_DICT = dict(zip(LABELS_FULL, LABELS))

PATH_TRAIN_RAW = 'data/train_raw/'
MASKS_TRAIN_FOLDER_NAME = 'masks'

FILE_FILTER = '.json'

DataFrame for storing basic information about images: filename, label (abbreviated), image width and height

In [None]:
df_eda = pd.DataFrame(columns=['filename', 'labels', 'image_h', 'image_w'])

Append file to dataframe above

In [None]:
def add_file_to_df(filename):
  global df_eda

  with open(os.path.join(PATH_TRAIN_RAW, f'{filename}{FILE_FILTER}'), 'r') as f:
        layout = json.load(f)
  h, w = layout['imageHeight'], layout['imageWidth']
  labels = ''
  for shape in layout['shapes']:
    label = LABELS_DICT[shape['label']]
    
    if label not in labels:
      labels += f'_{label}'

  df_eda.loc[len(df_eda)] = [filename, labels, h, w]


Append all files to dataframe

In [None]:
def create_df_for_all_files():
  files = os.listdir(PATH_TRAIN_RAW)
  files_jsons = list(filter(lambda x: x[-len(FILE_FILTER):].lower() == FILE_FILTER, files))

  for i, file in enumerate(files_jsons):
    
    if i % 10 == 0:
      print(f'done {i} of {len(files_jsons)}')

    add_file_to_df(file[:-len(FILE_FILTER)])

In [None]:
create_df_for_all_files()

done 0 of 261
done 10 of 261
done 20 of 261
done 30 of 261
done 40 of 261
done 50 of 261
done 60 of 261
done 70 of 261
done 80 of 261
done 90 of 261
done 100 of 261
done 110 of 261
done 120 of 261
done 130 of 261
done 140 of 261
done 150 of 261
done 160 of 261
done 170 of 261
done 180 of 261
done 190 of 261
done 200 of 261
done 210 of 261
done 220 of 261
done 230 of 261
done 240 of 261
done 250 of 261
done 260 of 261


In [None]:
df_eda

Unnamed: 0,filename,labels,image_h,image_w
0,115,_cloacae,512,640
1,050,_kefir,512,640
2,007,_kefir,512,640
3,142,_cloacae,512,640
4,011,_kefir,512,640
...,...,...,...,...
256,213,_moraxella,512,640
257,090,_cloacae,512,640
258,182,_moraxella,512,640
259,244,_epidermidis,512,640


In [None]:
df_eda.to_csv('bacteria_segmentation_eda.csv')

In [None]:
import pandas_profiling

Create pandas-profiling report for simple EDA

In [None]:
report = pandas_profiling.ProfileReport(df_eda, title='bacteria_segmentation')

In [None]:
report.to_file('bacteria_segmentation.html')

Summarize dataset:   0%|          | 0/17 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

  plt.legend(wedges, data.index.values, **legend_kws)
  plt.legend(wedges, data.index.values, **legend_kws)
  plt.legend(wedges, data.index.values, **legend_kws)
  plt.legend(wedges, data.index.values, **legend_kws)
  plt.legend(wedges, data.index.values, **legend_kws)
  plt.legend(wedges, data.index.values, **legend_kws)


Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
LABELS_MASKS = ['_background', '_epidermidis', '_pneumoniae', '_aureus', '_moraxella', '_kefir', '_cloacae']

PATH_TRAIN = 'data//train'
MASKS_PATH_POSTFIX = '_masks'
MASKS_FORMAT = '.png'

In [10]:
import cv2

In [15]:
def count_freq_pixel_labels_dict(df_data):
  freq_pixel_dict = dict(zip(LABELS_MASKS, [0]*len(LABELS_MASKS)))

  for _, row in df_data.iterrows():
    label = row['labels']
    filename = format(row['filename'], '03d')
    area = row['image_w'] * row['image_h']

    mask_labeled = cv2.imread(os.path.join(PATH_TRAIN, f'{filename}{MASKS_PATH_POSTFIX}/', f'{filename}{label}{MASKS_FORMAT}'), cv2.IMREAD_UNCHANGED)
    pixel_labels_count = np.count_nonzero(mask_labeled)
    pixel_background_count = area - pixel_labels_count

    freq_pixel_dict[label] += pixel_labels_count
    freq_pixel_dict['_background'] += pixel_background_count

  return freq_pixel_dict


In [6]:
df_data = pd.read_csv('bacteria_segmentation_eda.csv')

In [12]:
df_data.head()

Unnamed: 0.1,Unnamed: 0,filename,labels,image_h,image_w
0,0,115,_cloacae,512,640
1,1,50,_kefir,512,640
2,2,7,_kefir,512,640
3,3,142,_cloacae,512,640
4,4,11,_kefir,512,640


In [17]:
dict_freq_pixel_count = count_freq_pixel_labels_dict(df_data)
dict_freq_pixel_count

{'_aureus': 31883,
 '_background': 83935852,
 '_cloacae': 268389,
 '_epidermidis': 86426,
 '_kefir': 879189,
 '_moraxella': 248657,
 '_pneumoniae': 74084}

In [18]:
import json

In [20]:
with open('pixel_freq_count.json', 'w') as fp:
  json.dump(dict_freq_pixel_count, fp, indent='\n')