In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from skimage import io, color, exposure
import skimage

from pathlib import Path
import os, sys

from dask import bag, diagnostics


sys.path.insert(0, str(Path.cwd().parent))
import leukopy_lib as leuko
from importlib import reload
reload(leuko)

## Import normal peripheral blood cell dataset

In [2]:
# Root folder of the dataset, relative to the notebook's working directory.
# The Path is assembled from its components; the plain-string form is derived from it.
path = Path('..', '..', 'data', 'PBC_dataset_normal_DIB')
path_name = path.as_posix()

In [3]:
# Create the dataframe: one row per file found at <dataset root>/<folder>/<file>.
# The directory is scanned once and every column is derived from that single
# list, so the columns are guaranteed to describe the same file on each row
# (separate glob() calls each hit the filesystem and yield in OS-dependent order).
image_files = list(path.glob('*/*'))

df = pd.DataFrame({
    'img_paths': [str(image) for image in image_files],
    # File names look like <LABEL>_<ID>.<ext>: split the stem on '_'
    'id': [image.stem.split('_')[1] for image in image_files],
    'label': [image.stem.split('_')[0] for image in image_files],
    # Name of the folder that contains the file
    'cell_type': [image.parts[-2] for image in image_files],
})

df
# Observation: ids look similar across cell types -- probably one patient
# contributes several different cells (not verified)

Unnamed: 0,img_paths,id,label,cell_type
0,../../data/PBC_dataset_normal_DIB/monocyte/MO_...,225079,MO,monocyte
1,../../data/PBC_dataset_normal_DIB/monocyte/MO_...,582430,MO,monocyte
2,../../data/PBC_dataset_normal_DIB/monocyte/MO_...,436409,MO,monocyte
3,../../data/PBC_dataset_normal_DIB/monocyte/MO_...,648815,MO,monocyte
4,../../data/PBC_dataset_normal_DIB/monocyte/MO_...,668574,MO,monocyte
...,...,...,...,...
17087,../../data/PBC_dataset_normal_DIB/ig/PMY_73148...,73148,PMY,ig
17088,../../data/PBC_dataset_normal_DIB/ig/MY_364905...,364905,MY,ig
17089,../../data/PBC_dataset_normal_DIB/ig/MY_22578.jpg,22578,MY,ig
17090,../../data/PBC_dataset_normal_DIB/ig/PMY_75125...,751256,PMY,ig


### Sort labels

Most immature neutrophils (IG), such as metamyelocytes, myelocytes and promyelocytes, are difficult to differentiate, and their separation is open to debate.
Band neutrophils are also immature.
Segmented neutrophils are fully mature and can be merged with the category 'Neutrophil'.

In [4]:
# Distinct raw labels parsed from the file names
df.loc[:, 'label'].unique()

array(['MO', 'ERB', 'PLATELET', 'BA', 'BNE', 'SNE', 'NEUTROPHIL', 'LY',
       'EO', 'MMY', 'PMY', 'MY', 'IG'], dtype=object)

In [5]:
# One row per distinct (label, cell_type) combination, ordered by label,
# to see which file-name labels live in which folder
label_folder_pairs = df.loc[:, ['label', 'cell_type']].drop_duplicates()
label_folder_pairs.sort_values(by='label')

Unnamed: 0,label,cell_type
5319,BA,basophil
6537,BNE,neutrophil
11080,EO,eosinophil
1420,ERB,erythroblast
14214,IG,ig
9866,LY,lymphocyte
14197,MMY,ig
0,MO,monocyte
14199,MY,ig
6552,NEUTROPHIL,neutrophil


In [10]:
# Collapse the raw labels into the final classes:
#   - neutrophil sub-groups (SNE, BNE, NEUTROPHIL) -> NEU
#   - immature sub-groups (MY, MMY, PMY)            -> IG
#   - PLATELET is shortened to PLT
label_mapping = {
    "SNE": "NEU",
    "BNE": "NEU",
    "NEUTROPHIL": "NEU",
    "MY": "IG",
    "MMY": "IG",
    "PMY": "IG",
    "PLATELET": "PLT",
}
df["label"] = df["label"].replace(label_mapping)

The label `.DS` does not correspond to an image (presumably a macOS `.DS_Store` file), so the matching row is removed (1 row).

In [11]:
# Keep every row whose label is not '.DS' (that entry is not an image)
is_image_row = df['label'] != '.DS'
df = df[is_image_row]

In [12]:
# Number of rows per label, most frequent first
df.loc[:, 'label'].value_counts()

NEU    3329
EO     3117
IG     2895
PLT    2348
ERB    1551
MO     1420
BA     1218
LY     1214
Name: label, dtype: int64

In [13]:
# After the merge there are 8 classes: list each label with its folder
merged_pairs = df.loc[:, ['label', 'cell_type']].drop_duplicates()
merged_pairs.sort_values(by='label')

Unnamed: 0,label,cell_type
5319,BA,basophil
11080,EO,eosinophil
1420,ERB,erythroblast
14197,IG,ig
9866,LY,lymphocyte
0,MO,monocyte
6537,NEU,neutrophil
2971,PLT,platelet


### Import images with dask

### Add height, width, brightness, luminance to df

In [14]:
def add_columns(filename):
    """Read one image and return its size and mean intensities.

    Parameters
    ----------
    filename : str or path-like
        Location of the image; passed straight to ``skimage.io.imread``.

    Returns
    -------
    pandas.DataFrame
        A single-row frame (index ``0``) with the columns:

        - ``height``: first dimension of the image array
        - ``width``: second dimension of the image array
        - ``mean_brightness``: mean over all values of the raw image array
        - ``mean_luminance``: mean over the grayscale conversion of the image
    """
    im = io.imread(filename)

    # `rgb2grey` is a deprecated alias that newer scikit-image releases no
    # longer provide; `rgb2gray` is the supported name for the same conversion.
    im_gray = color.rgb2gray(im)

    # Build the row in one go instead of growing an empty frame column by column.
    return pd.DataFrame(
        {
            'height': im.shape[0],
            'width': im.shape[1],
            'mean_brightness': np.mean(im),
            'mean_luminance': np.mean(im_gray),
        },
        index=[0],
    )

In [None]:
# Apply add_columns to every image path through a dask bag; the work only
# runs when compute() is called, with a progress bar shown while it does.
image_paths = df['img_paths'].tolist()
addcol_bag = bag.from_sequence(image_paths).map(add_columns)
with diagnostics.ProgressBar():
    res = addcol_bag.compute()

[#                                       ] | 2% Completed | 11.6s

In [None]:
# Stack the per-image frames; `res` is in the same order as df.img_paths.
res_df = pd.concat(res, ignore_index=True)

# join() aligns on index labels, not on position. df keeps its original labels
# after rows are filtered out, so a fresh 0..n-1 index on res_df would pair
# images with the wrong measurements (and leave NaN at the end) as soon as a
# row has been dropped. Re-using df's index keeps the pairing positional.
# This assignment raises ValueError if the two lengths differ.
res_df.index = df.index

df_temp = df.join(res_df)
df_temp

In [None]:
# Write the enriched dataframe to CSV, leaving the index out of the file
df_temp.to_csv(path_or_buf='../../data/PBC_dataset_normal_df.csv', index=False)

In [57]:
# Sanity check: column dtypes and non-null counts of the enriched dataframe
df_temp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17092 entries, 0 to 17091
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   img_paths        17092 non-null  object 
 1   id               17092 non-null  object 
 2   label            17092 non-null  object 
 3   cell_type        17092 non-null  object 
 4   height           17092 non-null  int64  
 5   width            17092 non-null  int64  
 6   mean_brightness  17092 non-null  float64
 7   mean_luminance   17092 non-null  float64
dtypes: float64(2), int64(2), object(4)
memory usage: 1.7+ MB


In [59]:
# Also store the enriched dataframe as a pickle file
df_temp.to_pickle(path='../../data/PBC_dataset_normal_df.pkl')