In [0]:
import numpy as np

# A python library for data observation and data analysis
import pandas as pd 

# A Python library for scientific computing
from scipy import stats

In [0]:
# Mount Colab with local drive to access data
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


##Definition of the pre-processing functions 

In [0]:
def load_data():
  # Read all raw data files and concatenate
  per_object_base = pd.read_csv("gdrive/My Drive/ML2_final/data/ML-course_project_BSF/SoftC_per_object.csv")
  per_object_more_data = pd.read_csv("gdrive/My Drive/ML2_final/data/ML-course_project_BSF/SoftC_perObject_Supplement1_more_data.csv") # data points around chosen time points

  per_object = pd.concat([per_object_base, per_object_more_data], axis=0, sort=False)

  return per_object

Rename given classes (phenotypes) by numbers. \
*   CC: 'E05' and 'F05' all plates = 0 
*   CG: 'D16', 'J14', 'K14', 'L15' plate 3 = 1
*   DM: 'D05' all plates = 2 (control)
*   DX: 'G12', 'H14', 'N13' plate 3 = 3
*   GA: 'K05' and 'L05' all plates = 4
*   ST: 'D14', 'H15', 'J12', 'J15' plate 3 = 5
*   TT: 'H14' plate 1, 'J09', 'M14' plate 2 = 6
*   TU: 'E16', 'M15'plate 1, 'L09' plate 2 = 7
*   TX: 'G05', 'H05' all plates = 8
*   VD: 'I05', 'J05' all plates = 9


In [0]:
def split_data(data):
  ''' 
    rename plates, add known labels, split train (labeled data) from test set (unlabeled data)
  '''

  #name plates
  plate1 = ['BSF01745' + str(1)]
  plate2 = ['BSF01745' + str(2)]
  plate3 = ['BSF01745' + str(3)]

  tmp = data.copy(deep=True) # Create deep copy of raw data
  #add a column 'Label' for the predicted and known phenotypes 
  tmp.insert(0, 'Label', -1)

  #class 0
  CC_wells = ['E05', 'F05']
  CC_idxs = data['Image_Metadata_Well'].isin(CC_wells)
  CC_set = data[CC_idxs] 
  tmp.loc[CC_idxs, 'Label'] = 0
  
  #class 1
  CG_wells = ['D16', 'J14', 'K14', 'L15']
  CG_plate = plate3
  CG_idxs = (data['Image_Metadata_Well']).isin(CG_wells) & (data["Image_Metadata_Plate"]).isin(CG_plate)
  CG_set = data[CG_idxs]
  tmp.loc[CG_idxs, 'Label'] = 1

  #class 2
  DM_wells = ['D05']
  DM_idxs = data['Image_Metadata_Well'].isin(DM_wells)
  DM_set = data[DM_idxs]
  tmp.loc[DM_idxs, 'Label'] = 2

  #class 3
  DX_wells = ['G12', 'H14', 'N13']
  DX_plate = plate3
  DX_idxs = (data["Image_Metadata_Well"]).isin(DX_wells) & (data["Image_Metadata_Plate"]).isin(DX_plate)
  DX_set =  data[DX_idxs]
  tmp.loc[DX_idxs, 'Label'] = 3

  #class 4
  GA_wells = ['K05', 'L05']
  GA_idxs = data['Image_Metadata_Well'].isin(GA_wells)
  GA_set = data[GA_idxs] 
  tmp.loc[GA_idxs, 'Label'] = 4

  #class 5
  ST_wells = ['D14', 'H15', 'J12', 'J15']
  ST_plate = plate3
  ST_idxs = (data["Image_Metadata_Well"]).isin(ST_wells) & (data["Image_Metadata_Plate"]).isin(ST_plate)
  ST_set =  data[ST_idxs]
  tmp.loc[ST_idxs, 'Label'] = 5

  #class 6
  TT1_wells, TT2_wells = ['H14'], ['J09', 'M14']
  TT1_idxs = (data["Image_Metadata_Well"]).isin(TT1_wells) & (data["Image_Metadata_Plate"]).isin(plate1)
  TT2_idxs = (data["Image_Metadata_Well"]).isin(TT2_wells) & (data["Image_Metadata_Plate"]).isin(plate2)
  tmp.loc[(TT1_idxs), 'Label'] = 6
  tmp.loc[(TT2_idxs), 'Label'] = 6
  TT_set = pd.concat([data[TT1_idxs], data[TT2_idxs]], axis=0)

  #class 7
  TU1_wells, TU2_wells = ['E16', 'M15'], ['L09']
  TU1_idxs = (data["Image_Metadata_Well"]).isin(TU1_wells) & (data["Image_Metadata_Plate"]).isin(plate1)
  TU2_idxs = (data["Image_Metadata_Well"]).isin(TU2_wells) & (data["Image_Metadata_Plate"]).isin(plate2)
  tmp.loc[(TU1_idxs), 'Label'] = 7
  tmp.loc[(TU2_idxs), 'Label'] = 7
  TU_set = pd.concat([data[TU1_idxs], data[TU2_idxs]], axis=0)
  
  #class 8
  TX_wells = ['G05', 'H05']
  TX_idxs = data['Image_Metadata_Well'].isin(TX_wells)
  TX_set = data[TX_idxs] 
  tmp.loc[TX_idxs, 'Label'] = 8

  #class 9
  VD_wells = ['I05', 'J05']
  VD_idxs = data['Image_Metadata_Well'].isin(VD_wells)
  VD_set = data[VD_idxs] 
  tmp.loc[VD_idxs, 'Label'] = 9

  #index of the known wells used for training  
  labeled = pd.concat([CC_set, CG_set, DM_set, DX_set, GA_set, ST_set, TT_set, TU_set, TX_set, VD_set], axis=0)
  #index of the unknown wells used for testing 
  unlabeled = pd.concat([data, labeled]).drop_duplicates(keep=False)

  # collect plate number and well as metadata for each set
  meta_labeled = labeled[['Image_Metadata_Well', 'Image_Metadata_Plate', 'Image_Metadata_TimePoint']]
  meta_unlabeled = unlabeled[['Image_Metadata_Well', 'Image_Metadata_Plate', 'Image_Metadata_TimePoint']]

  # remove the collect metadata on position (plate, well)
  labeled = labeled.drop(['Image_Metadata_Well', 'Image_Metadata_Plate'], axis=1)
  unlabeled = unlabeled.drop(['Image_Metadata_Well', 'Image_Metadata_Plate'], axis=1)

  #create df with phenotypes 
  labels = tmp.loc[tmp.Label >= 0][['Label', 'Image_Metadata_TimePoint']]

  return labeled, unlabeled, labels, meta_labeled, meta_unlabeled

Time points need to be grouped together: \
 3, 4, 5 et 6 together as 0 \
 47, 48, 49 et 50 together as 1 \
 95, 96, 97 et 98 together as 2 \
 191, 192, 193 et 194 together as 3


In [0]:
def clean(data): 
  '''
    remove features based on their interest for the experiment 
    rename time points 
  '''

  # useless columns: there is no tubulin in the nucleus, and no H2B in the cytoplasm
  nuclei_tub = data[(name.endswith('_TUB') and name.startswith('Nuclei_') for name in data.columns)]
  cytoplasm_h2b = data[(name.endswith('_H2B') and name.startswith('Cytoplasm_') for name in data.columns)]
  
  # position or orientation of the cell does not matter for predicting phenotypes
  x_dim = data[(name.count('_X')!=0 for name in data.columns)] 
  y_dim = data[(name.count('_Y')!=0 for name in data.columns)]
  z_dim = data[(name.count('_Z')!=0 for name in data.columns)] 
  orient = data[(name.count('Orientation')!=0 for name in data.columns)]

  # automatically generated features, unnecessary for our project
  children = data[(name.count('_Children')!=0 for name in data.columns)]
  
  # drop all selected unncessary columns 
  columns_drop = pd.concat([nuclei_tub, cytoplasm_h2b, x_dim, y_dim, z_dim, children], axis=1)
  data = data.drop(columns = columns_drop)
  
  # rename group time points together
  # 4 = 1h
  # 48: 12h
  # 96: 24h
  # 192: 48h
  # others: timepoints around 1h/12h/24h/48h
  replacement=[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
  data['Image_Metadata_TimePoint'].replace(to_replace=[3,4,5,6,47,48,49,50,95,96,97,98,191,192,193,194] , value=replacement, inplace=True)
  
  # drop metadata
  columns_to_drop = ["Image_Metadata_Fov", "Image_Metadata_Timems", "ImageNumber", "ObjectNumber", "Cells_Number_Object_Number",
                    "Cells_Parent_Nuclei","Iteration", "Cytoplasm_Number_Object_Number", "Nuclei_Number_Object_Number", "Cytoplasm_Parent_Cells",
                    "Cytoplasm_Parent_Nuclei", "Nuclei_Number_Object_Number"]  
  
  return data.drop(columns = columns_to_drop)

In [0]:
def group(data, m_data, labels=None):
  '''
    group data points by time points in a table 
    create separate dataframes for labels and metadatas
    data: df with features to analyse
    m_data: df with plate and well of each object/cell
    labels: labels associated to cell, if present
  '''

  #group data by timepoint 
  gb = data.groupby('Image_Metadata_TimePoint') 
  gbm = m_data.groupby('Image_Metadata_TimePoint')
  dfs = []
  mdatas=[]

  # group label (metadata on the position in the plate/well) by timepoint
  if not(labels is None):
    lb = labels.groupby('Image_Metadata_TimePoint')
    labels = []

    for tp in lb.groups:
      tp_per_object = lb.get_group(tp)

      df = tp_per_object.drop(['Image_Metadata_TimePoint'], axis=1)
      labels.append(df)

  for tp in gb.groups: 
    tp_per_object = gb.get_group(tp)
    df = tp_per_object.drop(['Image_Metadata_TimePoint'], axis=1)
    dfs.append(df)

  for tp in gbm.groups: 
    tp_per_object_m = gbm.get_group(tp)
    mdata = tp_per_object_m.drop(['Image_Metadata_TimePoint'], axis=1)
    mdatas.append(mdata)

  #return generated df 
  if not(labels is None):
    return dfs, mdatas, labels
  return dfs, mdatas

In [0]:
def remove_outliers(data):
  ''' 
  remove outliers of data based on a z-score statistics 
  ''' 
  # Outliers have an absolute zscore superior to 3 on at least one feature
  outlier_idxs = np.unique(np.where(np.abs(stats.zscore(data)) > 3)[0])

  # Build rows of non-outliers, as those who never have an absolute zscore above 3
  non_outlier_idxs = np.delete(range(len(data)), outlier_idxs)
  
  return non_outlier_idxs

In [0]:
# Remove outliers in each well

def remove_outliers_per_well(new_l_groups, label_groups, new_meta_lgroups):
  #all_non_outliers = []
  for i in range(len(new_l_groups)):
    # Find all possible wells in all plates
    '''
       Structure of all wells: ['D05_1', 'D05_2', 'D05_3'...]
       --> Equivalent to: well D05 in plate, well D05 in plate 2, well D05 in plate 3...
    '''

    all_wells = np.unique(new_meta_lgroups[i]) 

    # List that will contain all indices for objects that are not outliers at a given timepoint
    non_outliers = []

    for well in all_wells:
      well_idxs = np.where(new_meta_lgroups[i] == well)[0]

      well_data = new_l_groups[i][well_idxs,:]
      #well_metadata = new_meta_lgroups[i][well_idxs]

      non_outlier_idxs = well_idxs[remove_outliers(well_data)]
      non_outliers.extend(non_outlier_idxs)
    
    new_l_groups[i] = new_l_groups[i][non_outliers,:]
    
    label_groups[i] = label_groups[i].iloc[non_outliers, -1]

    new_meta_lgroups[i] = new_meta_lgroups[i][non_outliers]
  
  return new_l_groups, label_groups, new_meta_lgroups
  

In [0]:
def save_data(new_l_groups, label_groups, new_meta_lgroups, u_groups, meta_ugroups):
  '''
    Save all processed data as CSV files
  '''
  fname = 'gdrive/My Drive/ML2_final/data/'
  for i in range(len(new_l_groups)):
    labeled_fname = fname + 'labeled_' + repr(i) + '.csv'
    unlabeled_fname = fname + 'unlabeled_' + repr(i) + '.csv'
    labels_fname = fname + 'labels_' + repr(i) + '.csv'
    meta_labeled_fname = fname + 'meta_labeled_' + repr(i) + '.csv'
    meta_unlabeled_fname = fname + 'meta_unlabeled_' + repr(i) + '.csv'

    np.savetxt(labeled_fname, new_l_groups[i], delimiter=",")
    np.savetxt(unlabeled_fname, u_groups[i], delimiter=",")
    np.savetxt(labels_fname, label_groups[i], delimiter=",")
    # As these arrays only contain strings, they were passed as Pandas dataframes before storage as .csv
    pd.DataFrame(new_meta_lgroups[i]).to_csv(meta_labeled_fname)
    pd.DataFrame(meta_ugroups[i]).to_csv(meta_unlabeled_fname)

##Pre-processing 

In [0]:
# load data, and remove unnecessary features for raw data
per_object = clean(load_data())

In [0]:
# reset indices for later pre-processing
per_object = per_object.reset_index(drop=True)
per_object.shape

(1597594, 97)

In [0]:
# split data into labeled and unlabeled data, with corresponding metadata
labeled, unlabeled, labels, meta_labeled, meta_unlabeled = split_data(per_object)

# Check that dimensions are ok (same number of rows for all labeled sets, and same number of rows for all unlabeled sets)
assert(labeled.shape[0] == labels.shape[0] == meta_labeled.shape[0])
assert(unlabeled.shape[0] == meta_unlabeled.shape[0])

print('Dimensions are OK.')
print('Labeled data:', labeled.shape)
print('Unlabeled data:', unlabeled.shape)
print('Labels:', labels.shape)

Dimensions are OK.
Labeled data: (337974, 95)
Unlabeled data: (1259620, 95)
Labels: (337974, 2)


In [0]:
# group labeled and unlabeled data by time point
l_groups, meta_lgroups, label_groups = group(labeled, meta_labeled, labels)
u_groups, meta_ugroups = group(unlabeled, meta_unlabeled)

In [0]:
# Check that row-dimensions are equal for labeled and unlabeled data at each timepoint
for i in range(len(l_groups)):
  # Print number of examples for each timepoint, for either labeled or unlabeled data
  print('Labeled objects in timepoint ' + repr(i) + ': ' + repr(l_groups[i].shape[0]))
  print('Unlabeled objects in timepoint ' + repr(i) + ': ' + repr(u_groups[i].shape[0]))

Labeled objects in timepoint 0: 78117
Unlabeled objects in timepoint 0: 288157
Labeled objects in timepoint 1: 82097
Unlabeled objects in timepoint 1: 308991
Labeled objects in timepoint 2: 84765
Unlabeled objects in timepoint 2: 321566
Labeled objects in timepoint 3: 92995
Unlabeled objects in timepoint 3: 340906


In [0]:
# Create a deep copy of labeled input and its metadata as NumPy arrays for later processing
new_meta_lgroups = []
new_l_groups = []
for i in range(len(l_groups)):
  new_meta_lgroups.append(np.asarray(meta_lgroups[i].copy(deep=True)))
  new_l_groups.append(np.asarray(l_groups[i].copy(deep=True)))

In [0]:
# transform metadata: 2 columns into 1 column
# The new column contains the well (D05, D06) and the plate number (1, 2 or 3)
for i in range(len(new_meta_lgroups)):
  for j in range(new_meta_lgroups[i].shape[0]):
    new_meta_lgroups[i][j, 0] += '_' + new_meta_lgroups[i][j, -1][-1]

In [0]:
for i in range(len(new_meta_lgroups)):
  new_meta_lgroups[i] = new_meta_lgroups[i][:,0]

In [0]:
# Check that x-dimensions are still consistent
for (idx_grp, grp) in enumerate(new_l_groups):
  assert(grp.shape[0] == new_meta_lgroups[idx_grp].shape[0])
  print('Dimensions OK for timepoint: ' + repr(idx_grp) + '. Number of rows for labeled data: ' + repr(grp.shape[0]))

Dimensions OK for timepoint: 0. Number of rows for labeled data: 78117
Dimensions OK for timepoint: 1. Number of rows for labeled data: 82097
Dimensions OK for timepoint: 2. Number of rows for labeled data: 84765
Dimensions OK for timepoint: 3. Number of rows for labeled data: 92995


In [0]:
new_l_groups, label_groups, new_meta_lgroups = remove_outliers_per_well(new_l_groups, label_groups, new_meta_lgroups)

  return (a - mns) / sstd
  


In [0]:
# Check dimensions after outlier removal
for i in range(len(new_l_groups)):
  assert(new_l_groups[i].shape[0] == label_groups[i].shape[0] == new_meta_lgroups[i].shape[0])
  print('Row dimensions are OK. Number of examples left in timepoint ' + repr(i) + ': ' + repr(new_l_groups[i].shape[0]))

Row dimensions are OK. Number of examples left in timepoint 0: 41393
Row dimensions are OK. Number of examples left in timepoint 1: 45139
Row dimensions are OK. Number of examples left in timepoint 2: 47782
Row dimensions are OK. Number of examples left in timepoint 3: 51847


In [0]:
save_data(new_l_groups, label_groups, new_meta_lgroups, u_groups, meta_ugroups)