In [31]:
import os
import pandas as pd
import numpy as np

In [17]:
# Root directory that holds one `Plate_<number>/profiles` folder per plate
base_dir = r"C:\Users\USER\Desktop\profiles.dir"

# Start & End Plate number
# NOTE(review): the range below is half-open, so `end_plate` itself is NOT
# read. If plate 26795 should be included, pass `end_plate + 1` — confirm.
start_plate = 24277
end_plate = 26795


def load_plate_profiles(base_dir, start_plate, end_plate):
    """Read every per-plate profile CSV under ``base_dir`` into one DataFrame.

    Parameters
    ----------
    base_dir : str
        Directory containing ``Plate_<number>/profiles`` sub-directories.
    start_plate : int
        First plate number to look for (inclusive).
    end_plate : int
        Plate number at which to stop (exclusive).

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all ``*.csv`` files found, with a fresh
        0..n-1 index. An empty DataFrame when no file is found.
    """
    frames = []
    for plate_number in range(start_plate, end_plate):
        plate_dir = os.path.join(base_dir, f'Plate_{plate_number}', 'profiles')
        # Plate numbers are sparse; skip the ones that have no profiles folder.
        if not os.path.isdir(plate_dir):
            continue
        # Sort so the row order is reproducible (os.listdir order is arbitrary).
        for file_name in sorted(os.listdir(plate_dir)):
            if file_name.endswith('.csv'):
                frames.append(pd.read_csv(os.path.join(plate_dir, file_name)))

    # pd.concat raises on an empty list, so keep the "nothing found" case explicit.
    if not frames:
        return pd.DataFrame()

    # Concatenate once: concatenating inside the loop re-copies all rows
    # accumulated so far on every iteration.
    return pd.concat(frames, ignore_index = True)


combined_df = load_plate_profiles(base_dir, start_plate, end_plate)

# Save combined datasets
combined_df.to_csv('cpg0012_combined_df.csv', index = False)

In [2]:
import matplotlib.pyplot as plt
import warnings
import pycytominer
from pycytominer.consensus import modz

In [3]:
warnings.filterwarnings(action = 'ignore')

In [6]:
# Distinct values of the compound identifier column (a missing value, if any,
# counts as one entry); the cell displays how many there are.
compound = combined_df.loc[:, 'Metadata_broad_sample'].unique()
len(compound)

30617

In [7]:
# Read the chemical annotation table from the working directory.
chem_dat = pd.read_csv(filepath_or_buffer = 'chemical_annotations.csv')

In [8]:
# Validation dataset (same object as combined_df, not a copy)
x_val_dat = combined_df

# Follow the idr0080 normalization: 'mad_robustize', with the rows matching
# the `samples` query (solvent == DMSO) passed as the reference samples.
x_val_norm_dat = pycytominer.normalize(
    profiles = x_val_dat,
    features = 'infer',
    method = 'mad_robustize',
    samples = 'Metadata_solvent == "DMSO"'
)

# Follow the idr0080 feature selection
x_val_norm_dat = pycytominer.feature_select(
    profiles = x_val_norm_dat,
    features = 'infer',
    operation = [
        'drop_na_columns',
        'blocklist',
        'variance_threshold',
        'drop_outliers'
    ]
)

# Remove DMSO
#x_val_norm_dat = x_val_norm_dat.query('Metadata_broad_sample != "DMSO"')

# Number of leading columns treated as metadata.
# NOTE(review): assumes the first 17 columns are exactly the metadata columns
# and that every feature column comes after them — confirm against the data.
n_meta_cols = 17

# Divide metadata & cell painting data.
# Explicit copies: a later cell adds a column to x_val_df, and writing into a
# bare .iloc slice is a chained assignment (SettingWithCopyWarning) that may or
# may not propagate to x_val_norm_dat depending on the pandas version.
x_val_meta = x_val_norm_dat.iloc[:, :n_meta_cols].copy()
x_val_df = x_val_norm_dat.iloc[:, n_meta_cols:].copy()

In [19]:
# MODZ consensus for the validation dataset: one row per Metadata_broad_sample.

# Attach the compound identifier on a NEW frame instead of writing into
# x_val_df, so this cell gives the same result every time it is re-run.
x_val_with_id = x_val_df.assign(
    Metadata_broad_sample = x_val_meta['Metadata_broad_sample']
)

x_df_modz = modz(
    x_val_with_id,
    replicate_columns = ['Metadata_broad_sample'],
    precision = 5
)

# Keep the leading column before dropping it from the feature matrix;
# otherwise the consensus rows can no longer be mapped back to compounds.
# NOTE(review): presumably this first column is Metadata_broad_sample (the
# replicate column) — confirm against the modz output.
x_modz_meta = x_df_modz.iloc[:, :1]
x_df_modz = x_df_modz.iloc[:, 1:]
x_df_modz

Unnamed: 0,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,Cells_AreaShape_MajorAxisLength,Cells_AreaShape_MaxFeretDiameter,Cells_AreaShape_MaximumRadius,Cells_AreaShape_MeanRadius,Cells_AreaShape_MinFeretDiameter,Cells_AreaShape_MinorAxisLength,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_3_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_3_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_3_0,Nuclei_Texture_Variance_RNA_5_0
0,0.001517,0.277820,0.504330,-0.065387,0.062889,0.075443,0.054113,0.188722,-0.130858,-0.136382,...,0.150347,-0.582148,-0.476835,-0.599079,-0.378143,-0.289351,-0.312277,1.013788,1.052467,1.207051
1,0.034036,0.090993,0.306434,0.656347,0.169470,0.097680,0.273494,0.223999,0.274959,0.263713,...,0.263372,-0.718833,-0.380045,-0.425015,-0.320584,-0.054972,-0.143702,0.078715,0.138940,0.073614
2,-0.418352,-0.452332,0.874477,1.169239,-0.875260,-0.897311,-0.907200,-0.869241,-1.096682,-1.076988,...,1.200041,1.311100,1.775256,1.558439,0.854839,0.777536,0.807036,3.170317,2.884533,3.037351
3,-0.302987,-0.107649,0.926159,0.248480,0.374914,0.375433,0.827529,0.768058,0.554810,0.682450,...,0.527436,-0.664602,-0.484909,-0.514862,-0.452709,-0.296196,-0.325485,1.374789,1.315783,1.310991
4,1.015297,0.971762,-0.344257,0.225323,0.981853,1.036833,0.848087,0.872655,0.825637,0.816335,...,1.502258,-0.996452,-0.988205,-1.186983,-0.710340,-0.500895,-0.613318,0.537547,0.145562,0.217822
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30612,-0.265613,-0.433268,0.493264,0.043408,-0.420174,-0.448928,-0.318385,-0.310333,-0.296333,-0.272400,...,-0.130101,-0.145140,0.013093,0.052746,-0.828810,-0.877587,-0.837383,-0.535416,-0.393819,-0.292694
30613,-0.210157,-0.305976,0.503456,-0.080167,-0.609581,-0.617716,-0.591531,-0.604536,-0.566016,-0.557393,...,-0.343942,0.760601,0.402465,0.596649,-0.304898,-0.682755,-0.636920,-0.487289,-0.547894,-0.561567
30614,0.597644,0.486508,-0.388333,0.915099,0.624892,0.577874,0.631964,0.754461,0.850882,0.836880,...,0.132184,-1.306855,-1.365039,-1.293119,-0.470069,-0.236536,-0.253907,-0.178110,-0.096819,-0.159071
30615,0.806925,0.851465,-0.373886,0.429034,1.313307,1.253971,1.375236,1.400515,1.388219,1.416915,...,0.618390,-1.604463,-1.515319,-1.582861,-0.668558,-0.681813,-0.753128,0.523788,0.115707,0.128306


In [21]:
# Load the IDR0080 Cell Painting table (tab-separated) and split it:
# the first three columns go to x_meta, everything after them to x_df.
idr0080_modz = pd.read_csv('cell_painting_modz.tsv', sep = '\t')
x_meta, x_df = idr0080_modz.iloc[:, :3], idr0080_modz.iloc[:, 3:]

In [22]:
# Keep only the feature columns present in both idr0080 (x_df) and
# cpg0012 (x_df_modz), so the two frames share one column set.
x_cols, x_val_cols = x_df.columns, x_df_modz.columns
filt_cols = x_cols.intersection(x_val_cols)

x_df = x_df.loc[:, filt_cols]
x_val_df = x_df_modz.loc[:, filt_cols]

display(x_df, x_val_df)

Unnamed: 0,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_Orientation,Cells_AreaShape_Zernike_0_0,Cells_AreaShape_Zernike_1_1,Cells_AreaShape_Zernike_2_0,Cells_AreaShape_Zernike_2_2,Cells_AreaShape_Zernike_3_3,Cells_AreaShape_Zernike_4_0,...,Nuclei_Texture_SumEntropy_DNA_5_0,Nuclei_Texture_SumEntropy_ER_5_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_SumVariance_AGP_5_0,Nuclei_Texture_SumVariance_DNA_10_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_AGP_5_0,Nuclei_Texture_Variance_DNA_10_0,Nuclei_Texture_Variance_DNA_5_0
0,-0.155631,0.014646,0.188053,1.231056,0.031064,-0.585477,0.467976,0.254203,0.129446,0.027104,...,0.882859,0.841687,0.339831,0.562585,0.879950,0.904785,0.923143,0.944998,0.984938,0.961945
1,-0.247842,-0.030773,0.433778,0.062456,0.266860,0.838679,-0.467417,0.340793,-0.024450,0.418928,...,0.513820,0.344554,0.134318,0.018933,0.359496,0.557998,0.504751,0.407462,0.522251,0.519441
2,0.794740,0.743296,-0.438752,0.827288,-0.728499,-0.422394,-0.147624,0.083870,-0.160722,-0.363004,...,-0.456539,0.373750,-0.111184,-0.042733,-0.840771,-0.536215,-0.497296,-0.748232,-0.515240,-0.526734
3,0.480421,0.821096,-0.337898,0.054479,-0.662314,0.067979,0.025010,0.158138,-0.342062,-0.434783,...,0.115466,0.186411,0.249292,0.688435,-0.521766,0.124801,0.063444,-0.560178,-0.062851,0.026056
4,-0.074895,0.569377,0.571678,0.976504,-1.111947,1.348542,0.584906,-3.128483,-2.324773,1.945741,...,0.154802,2.975785,1.128414,-0.638839,0.753884,0.808944,0.594059,0.674015,0.140325,0.417465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352,0.247953,0.331462,-0.228694,0.648857,-0.394799,-0.477804,-0.203016,0.387554,0.485502,-0.302333,...,0.305814,0.952718,0.697652,0.087784,0.266722,0.254346,0.367875,0.191115,0.220894,0.257950
353,1.455427,1.153796,-1.395398,1.167778,-1.413069,-0.684198,-0.923160,-0.145706,-0.535382,0.224011,...,-0.207834,-0.280317,-0.422648,-0.763835,-0.254697,-0.380000,-0.404530,-0.266026,-0.341998,-0.287431
354,0.719510,0.691270,-0.999921,0.701489,-0.965601,-0.937154,-0.218680,0.145943,-0.149745,-0.637266,...,0.011171,-0.575574,-0.272102,-0.101586,-0.117305,-0.083261,-0.085342,-0.213054,-0.126822,-0.101668
355,-0.255658,0.075113,0.999995,0.251456,0.237111,0.342004,0.084765,0.957639,-0.562746,-0.409896,...,0.766721,0.518347,1.450779,1.421830,1.483743,0.639279,0.614153,1.624128,0.523767,0.603819


Unnamed: 0,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,Cells_AreaShape_Orientation,Cells_AreaShape_Zernike_0_0,Cells_AreaShape_Zernike_1_1,Cells_AreaShape_Zernike_2_0,Cells_AreaShape_Zernike_2_2,Cells_AreaShape_Zernike_3_3,Cells_AreaShape_Zernike_4_0,...,Nuclei_Texture_SumEntropy_DNA_5_0,Nuclei_Texture_SumEntropy_ER_5_0,Nuclei_Texture_SumEntropy_Mito_5_0,Nuclei_Texture_SumEntropy_RNA_5_0,Nuclei_Texture_SumVariance_AGP_5_0,Nuclei_Texture_SumVariance_DNA_10_0,Nuclei_Texture_SumVariance_DNA_5_0,Nuclei_Texture_Variance_AGP_5_0,Nuclei_Texture_Variance_DNA_10_0,Nuclei_Texture_Variance_DNA_5_0
0,0.001517,0.277820,0.504330,0.872770,-0.062229,-0.097463,-0.136418,1.363021,-0.121675,0.111071,...,0.012441,-0.515965,-0.125886,0.378361,-0.192612,0.263924,0.134799,-0.065563,0.159910,0.150347
1,0.034036,0.090993,0.306434,0.485507,0.052555,-0.031708,-0.116941,0.108810,-0.257286,0.182884,...,0.148373,-0.708293,-0.168461,0.159921,0.005949,0.278903,0.181974,0.119256,0.289176,0.263372
2,-0.418352,-0.452332,0.874477,-0.128444,0.677804,0.464805,0.139284,0.689670,0.092896,-0.396120,...,1.174464,1.873223,0.559107,1.438213,1.659821,1.049936,1.254289,0.822218,1.143912,1.200041
3,-0.302987,-0.107649,0.926159,0.397246,0.287252,-0.666721,0.485750,0.286509,-0.249060,0.851031,...,0.402387,-0.358765,-0.296739,0.925500,-0.294972,0.741252,0.544289,-0.375331,0.497514,0.527436
4,1.015297,0.971762,-0.344257,0.236280,-0.908705,0.246192,-1.322807,0.577879,-0.200651,0.544807,...,1.195175,-0.566205,-0.462147,0.560941,-0.435885,1.580252,1.477814,-0.779816,1.448586,1.502258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30612,-0.265613,-0.433268,0.493264,-0.545809,0.505944,-0.235360,0.530636,0.305867,-0.243942,-0.247084,...,-0.166256,-0.606686,-1.410230,-1.127026,-0.063554,-0.134207,-0.174389,0.904218,-0.093846,-0.130101
30613,-0.210157,-0.305976,0.503456,-0.028201,0.453887,0.195480,0.276551,0.262468,0.886899,-0.258701,...,-0.326757,-0.692443,-1.197271,-1.103457,0.064433,-0.189828,-0.262928,0.950440,-0.374726,-0.343942
30614,0.597644,0.486508,-0.388333,-0.393189,-0.434949,0.348865,-0.739404,-0.280500,0.231109,-0.160809,...,0.281298,-0.558976,-0.394671,-0.123102,-0.271379,0.429285,0.213798,-0.222311,0.061249,0.132184
30615,0.806925,0.851465,-0.373886,0.048235,-0.452466,-0.217211,-0.818209,0.555877,0.007761,0.535272,...,0.762975,-1.044746,-0.611885,0.340107,-0.832198,0.986590,0.869581,-1.017756,0.612146,0.618390


In [None]:
# Save each result frame to a CSV in the working directory (index included).
# NOTE(review): x_val_meta is sliced from the normalized profiles while
# x_val_df comes from the MODZ consensus, so their rows are presumably not
# aligned one-to-one — confirm before joining the two files.
for out_name, frame in (
    ('x_df.csv', x_df),
    ('x_val_df.csv', x_val_df),
    ('x_val_meta.csv', x_val_meta),
):
    frame.to_csv(out_name)