# DataFrame cleaning, normalization and feature selection

In [24]:
import pandas as pd
import numpy as np
import pycytominer
import pickle

## Import the raw data table with the metadata

In [6]:
# Object tables to load; each name is part of the CSV file name and becomes the feature prefix.
compartments = ["Nuclei", "Cytoplasm", "Cells"]

In [7]:
# Folder holding the per-compartment CSV tables; the normalized plate is written here as well.
path = r"G:\My Drive\Analysis\DILI\workspace\backend"

In [8]:
# Plate to process. The leading backslash matters: the value is appended directly to `path`.
# Alternative plate identifiers (swap the assignment below to switch plates):
#   r'\220617_090443_Plate 1_DILI'
#   r'\220607_092050_Plate 1_DILI'
plate = r"\220608_152238_Plate 1_DILI"

In [9]:
# Substrings identifying bookkeeping columns that are removed from every compartment table.
list_cols = ["Metadata", "FileName", "PathName"]

In [10]:
# Plate map / compound annotations that are joined onto the profiles in the annotation step.
# Raw string on purpose: the Windows path contains backslash pairs (\M, \A, \D, \w, \m)
# that are invalid escape sequences in a regular string literal.
metadata = pd.read_csv(r'G:\My Drive\Analysis\DILI\workspace\metadata\metadata.csv')

## Join all compartments aggregated by well

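Here, we load each per-compartment CSV table (Nuclei, Cytoplasm, Cells) into a DataFrame, drop the bookkeeping columns, prefix every feature with its compartment name, and aggregate the objects to one median profile per well.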

In [11]:
# Build one aggregated (per-well median) table per compartment and collect them in df_all.
df_all = []
# Well identifiers that must survive the removal of bookkeeping columns below.
list_to_pop = ['Metadata_Plate', 'Metadata_Well']
for cmpt in compartments:
    df = pd.read_csv(path + plate + cmpt + ".csv", low_memory=False)
    # Set the well identifiers aside (raises KeyError if either column is missing).
    df_meta = df[list_to_pop]
    # Remove the identifiers plus every Metadata/FileName/PathName column in a single
    # pass. Dropping once avoids the KeyError the column-by-column loop hit when a
    # column name matched more than one substring (it was dropped twice).
    cols_to_drop = [
        col for col in df.columns
        if col in list_to_pop or any(name in col for name in list_cols)
    ]
    df = df.drop(columns=cols_to_drop)
    # Prefix every remaining column with the compartment name (Nuclei_, Cytoplasm_, Cells_).
    # NOTE(review): ImageNumber/ObjectNumber receive the prefix too and are therefore
    # aggregated like features (see Nuclei_ImageNumber in the output) - confirm intended.
    df = df.add_prefix(cmpt + "_")
    df_after = pd.concat([df, df_meta], axis='columns')
    features = pycytominer.cyto_utils.infer_cp_features(df_after, compartments=compartments)
    df_ag = pycytominer.aggregate(df_after, features=features, operation='median', float_format="%.3g")
    df_all.append(df_ag)

# Join dataset plate level

In [12]:
# Place the per-compartment aggregated tables from df_all side by side in a single frame.
df = pd.concat(df_all, axis="columns")

In [13]:
# Column names can repeat after the column-wise join (every per-compartment table is built
# with the Metadata_Plate / Metadata_Well columns); keep only the first occurrence of each.
keep_first = ~df.columns.duplicated()
df = df.loc[:, keep_first].copy()

In [14]:
# Sanity check: (n_rows, n_columns) of the joined, de-duplicated table.
print(df.shape)

(58, 1817)


In [25]:
# Preview the first rows of the joined table.
df.head()

Unnamed: 0,Metadata_Plate,Metadata_Well,Nuclei_ImageNumber,Nuclei_ObjectNumber,Nuclei_AreaShape_Area,Nuclei_AreaShape_BoundingBoxArea,Nuclei_AreaShape_BoundingBoxMaximum_X,Nuclei_AreaShape_BoundingBoxMaximum_Y,Nuclei_AreaShape_BoundingBoxMinimum_X,Nuclei_AreaShape_BoundingBoxMinimum_Y,...,Cells_Texture_Variance_CorrPI_10_02_256,Cells_Texture_Variance_CorrPI_10_03_256,Cells_Texture_Variance_CorrPI_20_00_256,Cells_Texture_Variance_CorrPI_20_01_256,Cells_Texture_Variance_CorrPI_20_02_256,Cells_Texture_Variance_CorrPI_20_03_256,Cells_Texture_Variance_CorrPI_5_00_256,Cells_Texture_Variance_CorrPI_5_01_256,Cells_Texture_Variance_CorrPI_5_02_256,Cells_Texture_Variance_CorrPI_5_03_256
0,220608_152238_Plate 1,B10,11.0,8.0,4196.5,5484.0,665.0,525.5,595.5,451.0,...,215.366229,210.070175,211.701579,215.698091,212.603268,211.439531,213.945414,212.25843,212.516874,213.238307
1,220608_152238_Plate 1,B11,37.0,15.0,2847.0,3780.0,665.0,484.0,600.0,424.5,...,148.839653,150.502579,150.047433,152.647744,153.241051,151.534075,147.293229,147.157296,147.330577,148.230023
2,220608_152238_Plate 1,B3,61.0,18.0,2649.0,3510.0,571.0,487.0,507.0,424.0,...,157.878923,158.643516,163.154688,168.27555,165.339165,169.09912,154.93988,156.587124,155.874534,154.328912
3,220608_152238_Plate 1,B4,87.0,19.5,2851.5,3776.0,620.5,478.0,558.0,421.5,...,125.388832,126.723466,128.192027,131.446936,129.236769,130.47709,122.708104,123.346758,124.450071,124.706196
4,220608_152238_Plate 1,B5,112.0,18.0,2652.0,3480.0,673.0,500.0,615.5,442.5,...,216.938099,219.904319,224.152057,225.925791,225.041882,231.726194,215.108946,215.09103,215.064808,215.946016


# Annotate

- Be careful with the names of the `join_on` columns: if both columns have the SAME NAME, pycytominer will drop BOTH.
- Pycytominer has an `add_metadata_id_to_platemap` parameter that adds the "Metadata_" prefix to your metadata columns. Set it to False if your columns already have the prefix.

In [16]:
# Attach the platemap annotations to the profiles. join_on is given as
# [platemap column, profile column]; the two names are deliberately different.
well_join_columns = ['Metadata_Wells', 'Metadata_Well']
df_an = pycytominer.annotate(
    df,
    metadata,
    join_on=well_join_columns,
    add_metadata_id_to_platemap=True,
)

In [17]:
# Preview the annotated table (annotation columns first, then the features).
df_an.head()

Unnamed: 0,Metadata_Concentration,Metadata_Cell,Metadata_Compound,Metadata_moa,Metadata_DILI-concern,Metadata_Severity Class,Metadata_Plate,Metadata_Well,Nuclei_ImageNumber,Nuclei_ObjectNumber,...,Cells_Texture_Variance_CorrPI_10_02_256,Cells_Texture_Variance_CorrPI_10_03_256,Cells_Texture_Variance_CorrPI_20_00_256,Cells_Texture_Variance_CorrPI_20_01_256,Cells_Texture_Variance_CorrPI_20_02_256,Cells_Texture_Variance_CorrPI_20_03_256,Cells_Texture_Variance_CorrPI_5_00_256,Cells_Texture_Variance_CorrPI_5_01_256,Cells_Texture_Variance_CorrPI_5_02_256,Cells_Texture_Variance_CorrPI_5_03_256
0,1.0,Huh7,Aspirin,COX inhibitor,Less-DILI-Concern,0,220608_152238_Plate 1,B3,61.0,18.0,...,157.878923,158.643516,163.154688,168.27555,165.339165,169.09912,154.93988,156.587124,155.874534,154.328912
1,10.0,Huh7,Aspirin,COX inhibitor,Less-DILI-Concern,0,220608_152238_Plate 1,B4,87.0,19.5,...,125.388832,126.723466,128.192027,131.446936,129.236769,130.47709,122.708104,123.346758,124.450071,124.706196
2,0.2,Huh7,Amiodarone,sodium/potassium-ATPase inhibitor,Most-DILI-Concern,8,220608_152238_Plate 1,B5,112.0,18.0,...,216.938099,219.904319,224.152057,225.925791,225.041882,231.726194,215.108946,215.09103,215.064808,215.946016
3,2.0,Huh7,Amiodarone,sodium/potassium-ATPase inhibitor,Most-DILI-Concern,8,220608_152238_Plate 1,B6,137.0,17.0,...,281.084116,284.057437,284.470988,287.877273,291.492034,286.11139,278.033318,279.199912,276.583569,279.335872
4,1.0,Huh7,Cyclophosphamide,DNA alkylation,\tLess-DILI-Concern,5,220608_152238_Plate 1,B7,163.0,17.0,...,159.982108,161.206634,159.627931,167.744992,163.380125,164.382678,159.701625,159.640378,159.636862,158.83195


## Normalize data using pycytominer

- Pycytominer has a caveat - it will normalize the Location columns, because they have the Compartments as a prefix (for example, 'Nuclei_Location_Center_X'.)
- For us, keeping these columns at their original values is important: they point to positions in the image, and we need them unchanged to plot single cells.
- The intended approach was to pass the metadata columns plus the Nuclei_Location columns as `meta_features` so they are not normalized. Because pycytominer then renames the location columns, we instead set the Nuclei_Location columns aside before normalizing and join them back afterwards.

In [18]:
# Attempted: passing cols_keep as the meta_features parameter. Abandoned because pycytominer then renames Nuclei_Location_Center_Y to Nuclei_Location_Center_Y_x.
# meta_cols = pycytominer.cyto_utils.features.infer_cp_features(df_an, metadata=True)
# cols_keep = meta_cols + ['Nuclei_Location_Center_X', 'Nuclei_Location_Center_Y']

In [19]:
# Keep the nuclei coordinates out of normalization: df_loc holds them untouched,
# df_an2 is the annotated table without them.
location_cols = ['Nuclei_Location_Center_X', 'Nuclei_Location_Center_Y']
df_loc = df_an[location_cols]
df_an2 = df_an.drop(columns=location_cols)

In [20]:
# Normalize the features with pycytominer's 'mad_robustize' method, using the wells selected
# by the `samples` query (Metadata_Compound == 'DMSO') as the reference samples.
# With the default feature inference only the Metadata_ columns are left untouched.
# NOTE(review): mad_robustize_epsilon=0 presumably lets features with zero MAD in the
# reference wells produce inf/NaN values - confirm this is intended.
dmso_query = "Metadata_Compound == 'DMSO'"
df_norm = pycytominer.normalize(
    df_an2,
    method='mad_robustize',
    mad_robustize_epsilon=0,
    samples=dmso_query,
)

  mad = np.median(np.abs(x - med))


In [21]:
# Re-attach the un-normalized nuclei coordinates as the last columns of the normalized table.
# Note: this rebinds df_all, which earlier held the list of per-compartment tables.
df_all = pd.concat([df_norm, df_loc], axis="columns")

In [26]:
# Preview the normalized table; the raw Nuclei_Location_Center_X/Y columns come last.
df_all.head()

Unnamed: 0,Metadata_Concentration,Metadata_Cell,Metadata_Compound,Metadata_moa,Metadata_DILI-concern,Metadata_Severity Class,Metadata_Plate,Metadata_Well,Nuclei_ImageNumber,Nuclei_ObjectNumber,...,Cells_Texture_Variance_CorrPI_20_00_256,Cells_Texture_Variance_CorrPI_20_01_256,Cells_Texture_Variance_CorrPI_20_02_256,Cells_Texture_Variance_CorrPI_20_03_256,Cells_Texture_Variance_CorrPI_5_00_256,Cells_Texture_Variance_CorrPI_5_01_256,Cells_Texture_Variance_CorrPI_5_02_256,Cells_Texture_Variance_CorrPI_5_03_256,Nuclei_Location_Center_X,Nuclei_Location_Center_Y
0,1.0,Huh7,Aspirin,COX inhibitor,Less-DILI-Concern,0,220608_152238_Plate 1,B3,-3.050107,-3.372454,...,3.470718,3.942599,3.487385,3.609819,3.169775,3.065453,3.501375,3.200141,538.008168,458.608063
1,10.0,Huh7,Aspirin,COX inhibitor,Less-DILI-Concern,0,220608_152238_Plate 1,B4,-2.979678,-2.360718,...,1.753401,2.013983,1.779847,1.828637,1.498379,1.438532,1.720114,1.629242,590.097303,450.25082
2,0.2,Huh7,Amiodarone,sodium/potassium-ATPase inhibitor,Most-DILI-Concern,8,220608_152238_Plate 1,B5,-2.911958,-3.372454,...,6.466825,6.961589,6.31115,6.498073,6.289875,5.928877,6.856511,6.467709,641.863063,470.249426
3,2.0,Huh7,Amiodarone,sodium/potassium-ATPase inhibitor,Most-DILI-Concern,8,220608_152238_Plate 1,B6,-2.844238,-4.046945,...,9.429607,10.205823,9.454049,9.006226,9.552852,9.066632,10.343635,9.829285,594.346481,428.523968
4,1.0,Huh7,Cyclophosphamide,DNA alkylation,\tLess-DILI-Concern,5,220608_152238_Plate 1,B7,-2.773809,-4.046945,...,3.297489,3.914815,3.394728,3.392305,3.416698,3.214892,3.714639,3.438938,618.873067,462.362828


# Save plate

In [23]:
# Write the normalized plate as a pickle named <path><plate>_Normalized.pkl.
normalized_file = path + plate + '_Normalized' + '.pkl'
df_all.to_pickle(normalized_file)