In [None]:
#@title Default title text
# Licensed under the Attribution-NonCommercial-ShareAlike 4.0 International.

In [1]:
### To run this notebook, set EMBEDDING_DATA_PATH below to the location of the embedding data on your machine, then run all the cells. To reproduce the published results, make sure the installed library versions match the ones listed below.
### Running time: ~ 15'

import pandas as pd
import numpy as np
import scipy
import sklearn
import sklearn.linear_model
import sklearn.ensemble

# Print the installed versions so they can be compared with the ones used in
# the paper: pandas 1.1.0, numpy 1.19.1, scipy 1.4.1, sklearn 0.23.1.
# NOTE(review): the recorded output of this cell shows numpy 1.18.5, not 1.19.1
# -- confirm which numpy version the paper actually used.
print(
    'pandas version ' + pd.__version__,
    'numpy version ' + np.__version__,
    'scipy version ' + scipy.__version__,
    'sklearn version ' + sklearn.__version__,
    sep='\n')



pandas version 1.1.0
numpy version 1.18.5
scipy version 1.4.1
sklearn version 0.23.1


In [2]:
# TODO: Change to final paths when available.
# Location of the HDF5 file holding the embeddings; it is read with
# pd.read_hdf in the next cell. Point this at your local copy of the data.
EMBEDDING_DATA_PATH = '~/cell_embeddings_normalized_well_mean.h5'


In [3]:
# Load the embeddings. In the displayed frame the row index has five levels
# (batch, plateset, plate, well, cell_line_id) and the columns 0..319 hold the
# embedding values.
embedding_df = pd.read_hdf(EMBEDDING_DATA_PATH)
embedding_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,0,1,2,3,4,5,6,7,8,9,...,310,311,312,313,314,315,316,317,318,319
batch,plateset,plate,well,cell_line_id,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1,0,01,A01,35,-0.011380,0.015596,-0.000713,0.022472,0.011061,0.008742,-0.001814,-0.006121,0.015930,-0.008897,...,0.004009,0.002362,0.005836,0.005494,0.004006,0.001027,-0.005221,0.013420,-0.007970,0.005291
2,0,01,A01,35,-0.020697,0.004085,0.000376,0.012405,0.003625,0.012532,-0.000819,-0.010274,0.010611,-0.008731,...,0.008230,0.008236,-0.003655,0.009392,0.002596,0.004486,-0.000056,0.002468,-0.001958,-0.007056
3,0,01,A01,35,-0.017980,0.026568,0.001911,0.003925,-0.002714,0.031943,-0.006049,-0.011123,0.004488,0.001495,...,0.011918,0.012485,0.005318,-0.005615,0.006830,-0.011707,-0.006400,0.016804,-0.005372,0.006547
4,0,01,A01,35,-0.011012,0.013042,0.001664,-0.004280,-0.009064,0.014998,-0.002295,-0.005095,0.005748,-0.006788,...,0.007417,0.008693,-0.001294,0.000942,0.004215,-0.002452,-0.009201,0.002686,-0.009696,0.004878
1,0,02,A01,35,-0.004354,0.002995,0.003153,0.009723,0.012581,0.009024,0.007631,-0.013057,0.011785,-0.010185,...,0.009379,0.004466,0.011103,0.000744,0.008223,-0.005647,0.007555,0.014263,-0.003439,0.009431
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,0,05,H12,58,-0.002883,0.021034,-0.000698,-0.008323,-0.030163,0.026297,-0.007503,0.004422,-0.010300,0.003006,...,0.005290,-0.001000,0.009243,0.007538,-0.011123,-0.012950,0.005893,0.008769,-0.000638,-0.005162
1,0,06,H12,58,-0.020445,0.035047,-0.002931,0.008438,-0.001349,0.021580,-0.015623,-0.013258,0.006832,-0.005546,...,-0.001105,-0.002010,0.005750,0.015476,-0.008056,0.010284,-0.017688,0.009404,0.002734,-0.015022
2,0,06,H12,58,-0.026524,0.018327,-0.000707,0.003034,-0.006621,0.020604,-0.007940,-0.007536,0.006090,-0.004343,...,0.001158,-0.001036,0.003468,0.005221,-0.001713,0.006459,0.009837,0.009122,0.008133,-0.005013
3,0,06,H12,58,0.007209,-0.038815,-0.018416,0.055958,0.079771,-0.040436,0.014855,-0.020958,0.041535,-0.020445,...,-0.012173,0.007175,-0.003238,0.045570,-0.008343,0.080386,0.020263,-0.004837,0.013236,-0.035910


In [4]:
def get_train_test_split(data,batch,layout):
  """Build one train/test split for a held-out batch and a plate layout.

  Training rows have `plateset == layout` and a batch different from `batch`.
  Test rows have the other plateset value (`abs(1 - layout)`) and a batch
  equal to `batch`. Both subsets are sorted by batch, plate and well.

  Args:
    data: DataFrame with columns `batch`, `plateset`, `plate`, `well` and
      `cell_line_id`; every column from position 5 onwards is used as a
      feature.
    batch: batch value to hold out for testing.
    layout: plateset value used for training.

  Returns:
    Tuple `(X_train, X_test, y_train, y_test)` where the X frames are the
    feature columns and the y series are the `cell_line_id` column.
  """
  sort_keys = ['batch', 'plate', 'well']
  held_out_layout = np.abs(1 - layout)

  # Train and test differ in both plateset and batch.
  train_mask = (data.plateset == layout) & (data.batch != batch)
  test_mask = (data.plateset == held_out_layout) & (data.batch == batch)

  train_rows = data.loc[train_mask].sort_values(by=sort_keys, axis=0)
  test_rows = data.loc[test_mask].sort_values(by=sort_keys, axis=0)

  # The first five columns are metadata; everything after them is a feature.
  return (train_rows.iloc[:, 5:], test_rows.iloc[:, 5:],
          train_rows.cell_line_id, test_rows.cell_line_id)

def get_predictions(model_name,X_train, y_train,X_test):
  """Fit the requested scikit-learn model and predict on the test features.

  Args:
    model_name: one of 'LogisticCV', 'Logistic' or 'RidgeCV'.
    X_train: training features.
    y_train: training labels.
    X_test: features to predict on.

  Returns:
    Tuple `(preds, model_predictions)`. `model_predictions` is the output of
    `model.predict(X_test)`. For the logistic models `preds` is the output of
    `model.predict_proba(X_test)`; for 'RidgeCV' `preds` is the same
    `predict` output as `model_predictions`.

  Raises:
    ValueError: if `model_name` is not one of the supported names.
  """
  if model_name == 'LogisticCV':
    model = sklearn.linear_model.LogisticRegressionCV(
      solver='lbfgs', max_iter=1000000)
  elif model_name == 'Logistic':
    model = sklearn.linear_model.LogisticRegression(solver='lbfgs')
  elif model_name == 'RidgeCV':
    model = sklearn.linear_model.RidgeCV()
  else:
    raise ValueError('Unknown model_name: %s' % model_name)

  model.fit(X_train, y_train)
  model_predictions = model.predict(X_test)
  if model_name == 'RidgeCV':
    # The ridge branch uses the plain predictions as scores; reuse them
    # instead of running predict() on the test set a second time.
    preds = model_predictions
  else:
    preds = model.predict_proba(X_test)

  return preds, model_predictions

def get_Cell_level_accuracy(preds,y_test):
  """Accuracy after averaging the per-row scores of each true cell line.

  The rows of `preds` are grouped by their true label, averaged, and the
  column with the highest mean score is taken as the prediction for that
  cell line.

  Args:
    preds: 2-D array of per-row scores, one column per class.
    y_test: Series of true cell line ids, aligned row-by-row with `preds`.

  Returns:
    Fraction of cell lines whose top-scoring column matches their id.
  """
  predictions = pd.DataFrame(preds, index=y_test.values)
  # Group on the index directly rather than on a helper column, so the column
  # labels stay purely positional integers.
  mean_scores = predictions.groupby(level=0).mean()
  # idxmax already returns the first column holding the row maximum, so no
  # intermediate ranking is needed.
  # NOTE(review): the +1 assumes column position i corresponds to cell line id
  # i + 1 (ids 1..N, all present in training) -- confirm against the data.
  top_prediction = (mean_scores.idxmax(axis=1) + 1).astype(int)
  true_ids = mean_scores.index.astype(int)
  return np.sum(true_ids == top_prediction) / top_prediction.shape[0]

def get_Well_level_accuracy(preds,y_test,model_predictions):
  """Fraction of rows whose predicted label equals the true label.

  Args:
    preds: unused; kept so the signature stays compatible with callers.
    y_test: true labels.
    model_predictions: predicted labels, aligned row-by-row with `y_test`.

  Returns:
    0-d numpy array holding the accuracy.
  """
  return np.asarray(np.count_nonzero(model_predictions == y_test)/len(y_test))

def run_prediction_model(data, model_name):
  """Train the model on every layout/batch split and collect the accuracies.

  For each plateset layout (0 and 1) and each batch present in `data`, a
  train/test split is built, the model is fitted, and the well-level and
  cell-line-level accuracies are stored in one `groupN` column.

  Args:
    data: DataFrame of embeddings; its index is reset so the index levels
      become ordinary columns.
    model_name: model identifier forwarded to `get_predictions`.

  Returns:
    DataFrame with rows 'WellLevelAccuracy' and 'CellLine_Accuracy', one
    `groupN` column per split, plus 'mean' and 'std' over those groups.
  """
  data = data.reset_index()

  # Derive the group columns from the splits actually present, so mean/std
  # stay correct when the number of batches is not four.
  folds = [(layout, batch)
           for layout in [0, 1]
           for batch in np.unique(data.batch)]
  group_columns = ['group' + str(i + 1) for i in range(len(folds))]
  Results = pd.DataFrame(columns=group_columns + ['mean', 'std'])

  for column, (layout, batch) in zip(group_columns, folds):
    X_train, X_test, y_train, y_test = get_train_test_split(data, batch, layout)
    preds, model_predictions = get_predictions(model_name, X_train, y_train, X_test)

    Results.loc['WellLevelAccuracy', column] = get_Well_level_accuracy(preds, y_test, model_predictions)
    Results.loc['CellLine_Accuracy', column] = get_Cell_level_accuracy(preds, y_test)

  # Select the group columns by name rather than by position.
  for metric in ['WellLevelAccuracy', 'CellLine_Accuracy']:
    scores = Results.loc[metric, group_columns].values
    Results.loc[metric, 'mean'] = np.mean(scores)
    Results.loc[metric, 'std'] = np.std(scores)

  return Results


In [5]:
# Evaluate the 'LogisticCV' model over every layout/batch split and show the
# per-group accuracies together with their mean and std.
logistic_cv_result = run_prediction_model(embedding_df,'LogisticCV')
logistic_cv_result

Unnamed: 0,group1,group2,group3,group4,group5,group6,group7,group8,mean,std
WellLevelAccuracy,0.6631944444444444,0.8498233215547704,0.7932862190812721,0.6421052631578947,0.640625,0.9128919860627178,0.8226086956521739,0.7491289198606271,0.759208,0.0962458
CellLine_Accuracy,0.854167,0.9375,0.9375,0.885417,0.802083,0.979167,0.958333,0.947917,0.91276,0.0563217


In [6]:
# Reference mean/std values for the two accuracy metrics; the final cell
# compares the computed results against this frame.
paper_result_df = pd.DataFrame(columns=['mean','std'])
paper_result_df.loc['WellLevelAccuracy','mean'] = 0.76
paper_result_df.loc['WellLevelAccuracy','std'] = 0.10
paper_result_df.loc['CellLine_Accuracy','mean'] = 0.91
paper_result_df.loc['CellLine_Accuracy','std'] = 0.06

paper_result_df

Unnamed: 0,mean,std
WellLevelAccuracy,0.76,0.1
CellLine_Accuracy,0.91,0.06


In [7]:
# Raises an AssertionError if the computed 'mean' and 'std' columns differ from
# the results stated in the paper beyond an absolute tolerance of 1e-2.
pd.testing.assert_frame_equal(logistic_cv_result[['mean', 'std']], paper_result_df, atol=1e-2)