# This notebook conducts the analysis in Fig. 5 using cell image deep embeddings

## The data files required by this notebook can be accessed at https://nyscf.org/nyscf-adpd/.
    
## Licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License (CC BY-NC-SA 4.0).

In [None]:
### To run this notebook, replace the embedding data path and the metadata path with the locations of those files on your computer, then run all the cells. To reproduce the published results, make sure the installed versions of the libraries listed below match the versions stated here.
### Running time: ~1 minute
 
import numpy as np
import pandas as pd
import scipy
import sklearn
import sklearn.ensemble
import sklearn.linear_model
import sklearn.metrics

# Report the installed library versions; the trailing comments give the
# versions that were used to produce the results in the paper.
print(f'pandas version {pd.__version__}')        # paper: 1.1.0
print(f'numpy version {np.__version__}')         # paper: 1.19.1
print(f'scipy version {scipy.__version__}')      # paper: 1.4.1
print(f'sklearn version {sklearn.__version__}')  # paper: 0.23.1



In [2]:
# TODO: Change to final paths when available.
# Both files are part of the data release at https://nyscf.org/nyscf-adpd/;
# point these constants at your local copies before running the notebook.

# HDF5 file holding the embedding dataframe (loaded with pd.read_hdf).
EMBEDDING_DATA_PATH = '~/tile_embeddings_normalized_well_mean.h5'
# CSV file holding the cell line metadata (loaded with pd.read_csv, header=1).
METADATA_PATH = '~/Schiff et al. Supplementary Tables.csv'

In [34]:
# Load the embeddings and list the disease states they contain. The index is
# reset first so that disease_state can be read as an ordinary column.
embedding_df = pd.read_hdf(EMBEDDING_DATA_PATH)
np.unique(embedding_df.reset_index()['disease_state'])

array(['GBA', 'HEALTHY', 'LRRK2', 'SPORADIC'], dtype=object)

In [36]:
# Load the metadata table. header=1 takes the column names from the second row
# of the file; the row above it is skipped.
all_metadata_df = pd.read_csv(METADATA_PATH, header=1)

# List the raw disease state labels; they are normalised before the merge with
# the embeddings. ('Disease state' is a regular column, so no reset_index is
# needed, and only the last expression of a cell is displayed.)
np.unique(all_metadata_df['Disease state'])

array(['GBA PD', 'GBA PD*', 'Healthy', 'Healthy*', 'Healthy**',
       'LRRK2 PD', 'Sporadic PD'], dtype=object)

In [51]:
def clean_up_relevant_metadata(all_metadata_df):
  """Make a clean version of the metadata columns needed to join with the embeddings.

  Args:
    all_metadata_df: Metadata dataframe containing at least the columns
      'Cell line ID', 'Cross-val fold' and 'Disease state'. It is not modified.

  Returns:
    A new dataframe with the columns 'cell_line_id', 'group' and
    'disease_state'. Disease states are upper-cased and renamed to the short
    labels (SPORADIC, GBA, LRRK2, HEALTHY), cell line ids are stripped of
    whitespace and of the trailing '*' on lines 48 and 57, rows without a
    cross-validation fold are dropped, and 'group' is an integer column. The
    original row index labels are preserved.
  """
  # Select the columns from the metadata that we want to merge in with the
  # embeddings, and give them the names used by the embedding dataframe.
  metadata_to_merge_df = all_metadata_df[
      ['Cell line ID', 'Cross-val fold', 'Disease state']].copy()
  metadata_to_merge_df.columns = ['cell_line_id', 'group', 'disease_state']

  # Exact-match renames applied after upper-casing. Labels that are not listed
  # (for example 'GBA PD*' or 'HEALTHY**') are intentionally left untouched.
  disease_state_renames = {
      'SPORADIC PD': 'SPORADIC',
      'GBA PD': 'GBA',
      'LRRK2 PD': 'LRRK2',
      # See Methods for description about unconfirmed cell line 57 from donor 50634
      'HEALTHY*': 'HEALTHY',
  }
  # Assign the result back instead of calling replace(..., inplace=True) on a
  # column selection: that chained in-place form does not update the dataframe
  # when pandas Copy-on-Write is active.
  metadata_to_merge_df['disease_state'] = (
      metadata_to_merge_df['disease_state'].str.upper().replace(
          disease_state_renames))

  # Strip whitespace from the cell line id, then remove the * from some of the
  # ids so we can join.
  metadata_to_merge_df['cell_line_id'] = (
      metadata_to_merge_df['cell_line_id'].str.strip().replace(
          {'48*': '48', '57*': '57'}))

  # Drop any rows that do not have a cross validation group set (per the
  # original note, this drops all the GBA lines), then make the group an int.
  return metadata_to_merge_df.dropna(axis=0, subset=['group']).astype(
      {'group': int})

# Build the cleaned metadata table from the raw metadata and display it.
metadata_to_merge_df = clean_up_relevant_metadata(all_metadata_df)
metadata_to_merge_df

Unnamed: 0,cell_line_id,group,disease_state
0,01,1,HEALTHY
1,02,1,LRRK2
2,03,2,HEALTHY
3,04,2,SPORADIC
4,05,4,HEALTHY
...,...,...,...
85,86,3,SPORADIC
86,87,3,HEALTHY
87,88,3,SPORADIC
88,89,2,HEALTHY


In [52]:
def merge_metadata_with_embedding_df(embedding_df, metadata_to_merge_df):
  """Attach the cleaned metadata to every embedding row.

  The embedding dataframe's index levels are first turned into ordinary
  columns; the two tables are then joined on the cell line id and the disease
  state using pandas' default merge, so only rows whose keys appear in both
  tables are kept. Metadata columns come first in the result.
  """
  join_keys = ['cell_line_id', 'disease_state']
  flat_embedding_df = embedding_df.reset_index()
  return metadata_to_merge_df.merge(flat_embedding_df, on=join_keys)

# Join the cleaned metadata onto the embeddings and display the merged table
# that is used for training and testing.
train_test_df = merge_metadata_with_embedding_df(embedding_df, metadata_to_merge_df)
train_test_df

Unnamed: 0,cell_line_id,group,disease_state,batch,plateset,plate,well,0,1,2,...,310,311,312,313,314,315,316,317,318,319
0,01,1,HEALTHY,1,1,07,D03,0.004184,0.040249,0.026530,...,-0.015100,-0.017746,0.045604,-0.016558,0.040380,0.000401,0.000483,-0.017317,-0.019437,-0.026159
1,01,1,HEALTHY,2,1,07,D03,0.004639,0.020272,0.018574,...,-0.009795,0.004587,0.049970,-0.020397,0.016456,-0.014867,-0.005192,-0.006122,-0.021637,-0.039839
2,01,1,HEALTHY,3,1,07,D03,0.012636,0.029946,0.016748,...,-0.021145,-0.009695,0.050091,-0.000340,0.016756,-0.011345,-0.000120,-0.011953,-0.023764,-0.029296
3,01,1,HEALTHY,4,1,07,D03,0.008200,0.013857,0.012598,...,-0.026685,0.004242,0.031001,0.001708,0.006971,0.004401,0.014787,-0.013059,-0.025400,-0.022717
4,01,1,HEALTHY,1,1,08,D03,0.002618,0.030171,0.027768,...,-0.004445,-0.015143,0.045930,-0.024526,0.041045,-0.015062,0.004498,-0.007361,-0.022869,-0.049547
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3529,90,2,SPORADIC,1,1,11,E03,0.015173,0.014493,-0.002184,...,-0.008282,-0.030635,0.015056,-0.021353,0.036720,0.002607,-0.011756,-0.035307,-0.001649,-0.016294
3530,90,2,SPORADIC,3,1,11,E03,0.006690,-0.010136,-0.000080,...,0.022990,-0.010417,-0.019210,-0.038663,0.021145,0.026774,-0.019745,-0.020217,0.007407,0.001976
3531,90,2,SPORADIC,4,1,11,E03,-0.001238,-0.085635,-0.029494,...,0.005093,0.048951,0.002258,0.052432,-0.049021,0.008997,-0.001029,0.031468,0.009586,0.057692
3532,90,2,SPORADIC,1,1,12,E03,-0.000022,-0.045230,-0.024410,...,0.001309,0.017465,-0.013068,0.012519,-0.025893,0.027894,-0.002240,-0.002403,0.009480,0.014004


In [53]:
# Leading non-feature columns of the merged dataframe, in order: cell_line_id,
# group, disease_state, batch, plateset, plate, well. Every column after them
# is used as a model feature.
_NUM_METADATA_COLUMNS = 7

# Binary target used for training: HEALTHY is 0, both PD groups are 1.
_PD_LABELS = {'HEALTHY': 0, 'SPORADIC': 1, 'LRRK2': 1}

# (result row, disease state left out of that AUC); None keeps every cell line.
_AUC_ROWS = (
    ('sporadic_AUC', 'LRRK2'),
    ('LRRK2_AUC', 'SPORADIC'),
    ('total_AUC', None),
)

# Factories rather than instances, so each fold is fitted on a fresh estimator.
_MODEL_FACTORIES = {
    'LogisticCV': lambda: sklearn.linear_model.LogisticRegressionCV(
        solver='lbfgs', max_iter=1000000),
    'Logistic': lambda: sklearn.linear_model.LogisticRegression(solver='lbfgs'),
    'RidgeCV': lambda: sklearn.linear_model.RidgeCV(),
}


def _sorted_rows(fold_df):
  """Return the rows ordered by batch, plate and well (deterministic row order)."""
  return fold_df.sort_values(axis=0, by=['batch', 'plate', 'well'])


def _binary_pd_labels(disease_state):
  """Map disease states to 0 (HEALTHY) / 1 (PD); raise on any other state."""
  labels = disease_state.map(_PD_LABELS)
  if labels.isna().any():
    unknown = sorted(set(disease_state[labels.isna()].astype(str)))
    raise ValueError('Unexpected disease_state values: %s' % unknown)
  return labels.astype(int)


def run_prediction_model(data, model_name):
  """Train the model on each cross-validation split and return the AUC table.

  For every fold in `data.group`, a model is fitted on the rows of all other
  folds and applied to the rows of that fold. Row-level predictions are
  averaged per (cell_line_id, disease_state) and three ROC AUCs are computed on
  those averages: HEALTHY vs SPORADIC, HEALTHY vs LRRK2, and HEALTHY vs all PD.

  Args:
    data: Merged dataframe whose first 7 columns are the metadata columns
      (including cell_line_id, group, disease_state, batch, plate and well)
      and whose remaining columns are the features.
    model_name: One of 'LogisticCV', 'Logistic' or 'RidgeCV'.

  Returns:
    A dataframe with the rows 'sporadic_AUC', 'LRRK2_AUC' and 'total_AUC', one
    'group<fold>' column per fold present in `data`, and 'mean' / 'std'
    (population standard deviation) computed over the fold columns.

  Raises:
    ValueError: If `model_name` is unknown, or the training rows contain a
      disease state other than HEALTHY, SPORADIC or LRRK2.
  """
  # Fail fast, before any data is touched.
  if model_name not in _MODEL_FACTORIES:
    raise ValueError('Unknown model_name: %s' % model_name)

  # Derive the fold columns from the data so that mean/std are always taken
  # over exactly the folds that were evaluated.
  folds = np.unique(data.group)
  fold_columns = ['group' + str(int(fold)) for fold in folds]
  # Keep the empty-frame + .loc construction: it determines the column dtypes
  # of the returned table.
  Results = pd.DataFrame(columns=fold_columns + ['mean', 'std'])

  for fold, fold_column in zip(folds, fold_columns):
    train_df = _sorted_rows(data.loc[data.group != fold])
    test_df = _sorted_rows(data.loc[data.group == fold])
    X_train = train_df.iloc[:, _NUM_METADATA_COLUMNS:]
    X_test = test_df.iloc[:, _NUM_METADATA_COLUMNS:]
    y_train = _binary_pd_labels(train_df.disease_state)

    model = _MODEL_FACTORIES[model_name]()
    model.fit(X_train, y_train)
    if model_name == 'RidgeCV':
      # RidgeCV is a regressor: its raw prediction is used as the score.
      preds = model.predict(X_test)
    else:
      # Probability of the positive (PD) class.
      preds = model.predict_proba(X_test)[:, 1]

    # Average the row-level scores per cell line.
    pred_df = pd.DataFrame(data=preds, index=X_test.index, columns=['pred'])
    pred_df['cell_line_id'] = test_df.cell_line_id
    pred_df['disease_state'] = test_df.disease_state
    pred_arr = pred_df.groupby(
        ['cell_line_id', 'disease_state']).mean().reset_index()
    pred_arr['true'] = (pred_arr.disease_state != 'HEALTHY').astype(int)

    for row_name, excluded_state in _AUC_ROWS:
      scored = pred_arr
      if excluded_state is not None:
        scored = pred_arr.loc[pred_arr.disease_state != excluded_state]
      Results.loc[row_name, fold_column] = sklearn.metrics.roc_auc_score(
          scored.true, scored.pred)

  for row_name, _ in _AUC_ROWS:
    fold_scores = Results.loc[row_name, fold_columns].values.astype(float)
    Results.loc[row_name, 'mean'] = np.mean(fold_scores)
    Results.loc[row_name, 'std'] = np.std(fold_scores)
  return Results


In [54]:
# Evaluate the 'LogisticCV' model over the cross-validation folds and display
# the per-fold AUCs with their mean and standard deviation.
logistic_cv_result = run_prediction_model(train_test_df,'LogisticCV')
logistic_cv_result

Unnamed: 0,group0,group1,group2,group3,group4,mean,std
sporadic_AUC,0.785714,0.942857,0.690476,0.666667,0.785714,0.774286,0.097246
LRRK2_AUC,0.75,0.785714,0.857143,1.0,1.0,0.878571,0.104978
total_AUC,0.78125,0.897959,0.714286,0.714286,0.8125,0.784056,0.068595


In [55]:
# Reference values for comparison (metric, mean, std). The table is filled cell
# by cell with .loc so that it is built the same way as the model result table.
paper_result_df = pd.DataFrame(columns=['mean', 'std'])
for metric_name, paper_mean, paper_std in (
    ('sporadic_AUC', 0.77, 0.10),
    ('LRRK2_AUC', 0.89, 0.10),
    ('total_AUC', 0.79, 0.08),
):
  paper_result_df.loc[metric_name, 'mean'] = paper_mean
  paper_result_df.loc[metric_name, 'std'] = paper_std
paper_result_df

Unnamed: 0,mean,std
sporadic_AUC,0.77,0.1
LRRK2_AUC,0.89,0.1
total_AUC,0.79,0.08


In [None]:
# Check that the reproduced mean/std AUCs agree with the reference values.
# The reference values are given to two decimals and the results recorded in
# this notebook differ from them by up to ~0.0114 (LRRK2_AUC mean 0.878571 vs
# 0.89, total_AUC std 0.068595 vs 0.08), so an absolute tolerance of 1e-2
# would reject the notebook's own output; 2e-2 is used instead.
pd.testing.assert_frame_equal(logistic_cv_result[['mean', 'std']], paper_result_df, atol=2e-2)