In [134]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from skimage import io, color, exposure
import skimage

from pathlib import Path

## Import normal peripheral blood cell dataset

In [61]:
# Root directory of the dataset, given relative to this notebook.
path_name: str = '../data/PBC_dataset_normal_DIB'
# Path object wrapping the same location; used below to enumerate the files.
path: Path = Path(path_name)

In [62]:
# Build the image index: one row per entry matching <path>/*/*.
# The directory tree is globbed exactly once so that the three columns are
# guaranteed to describe the same files in the same order; sorting makes the
# row order reproducible, because glob order depends on the file system.
image_files = sorted(path.glob('*/*'))

# File stems are split on '_': the first piece is stored as the label and the
# second piece as the id, so every stem must contain at least one underscore.
df = pd.DataFrame({
    'img_paths': [str(image) for image in image_files],
    'id': [image.stem.split('_')[1] for image in image_files],
    'label': [image.stem.split('_')[0] for image in image_files],
})

# Observation: the ids look similar across files - probably one patient
# contributing different cells (not verified here).

### Sort labels

Immature neutrophils (IG), such as metamyelocytes, myelocytes and promyelocytes, are difficult to differentiate, and their separation is open to debate.
Band neutrophils are also immature.
Segmented neutrophils are fully mature and can be merged with the category 'Neutrophil'.

In [63]:
# Show the distinct raw labels parsed from the file names.
df.loc[:, 'label'].unique()

array(['MO', 'ERB', 'PLATELET', 'BA', 'BNE', 'SNE', 'NEUTROPHIL', 'LY',
       'EO', 'MMY', 'PMY', 'MY', 'IG'], dtype=object)

In [64]:
# Merge the neutrophil sub-groups: 'NEUTROPHIL' is folded into 'SNE', while
# 'BNE', 'MY', 'MMY' and 'PMY' are all folded into 'IG'.
df["label"] = df["label"].replace({
    "NEUTROPHIL": "SNE",
    "BNE": "IG",
    "MY": "IG",
    "MMY": "IG",
    "PMY": "IG",
})

The label `.DS` does not correspond to an image (presumably it comes from a macOS `.DS_Store` file) and should be removed (1 row).

In [65]:
# Drop the rows whose label is '.DS'. The explicit .copy() makes the result an
# independent frame rather than a slice of the original, so the column
# assignments made in later cells do not raise SettingWithCopyWarning.
df = df[~(df['label']=='.DS')].copy()

In [66]:
# Show the distinct labels that remain in the frame at this point.
df.loc[:, 'label'].unique()

array(['MO', 'ERB', 'PLATELET', 'BA', 'IG', 'SNE', 'LY', 'EO'],
      dtype=object)

In [67]:
# Keep the distinct labels (8 classes at this point) for later use.
classes = df.loc[:, 'label'].unique()

### Import images and store them as a list of arrays

In [161]:
%time
images = [io.imread(file) for file in df['img_paths']]

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 9.06 µs


In [69]:
# Number of loaded images (one entry per row of df['img_paths']).
len(images)

17092

### Add height, width and brightness to df

In [73]:
# Record each image's first two shape entries as new columns:
# shape[0] -> 'height', shape[1] -> 'width'.
df = df.assign(
    height=[picture.shape[0] for picture in images],
    width=[picture.shape[1] for picture in images],
)

In [166]:
# Mean over all values of each image array, stored as 'mean_brightness'.
df['mean_brightness'] = list(map(np.mean, images))

In [170]:
# Persist the image index next to the dataset, without the index column.
df.to_csv(path_or_buf='../data/PBC_dataset_normal_df.csv', index=False)