# Agent 1

Agent 1 focuses on downloading the data / feed information to train the LLM to be able to handle queries related to HTAN spatial transcriptomics data using promt engineering. The prompts include
* Example queries from HTAN community notebook
* Downloading HTAN spatial datasets from Synapse database. The downloaded files will be used by Agent 2.
  The information was obtained from
this table `HTAN.10xvisium_spatialtranscriptomics_scRNAseq_level4_metadata_current`
* Metadata information obtained from https://humantumoratlas.org/standard/spatial_transcriptomics

Once the prompts have been provided, we then run some generalized queries to validate our work


In [None]:
!pip install synapseclient




In [58]:
import re, io, sys

In [59]:
from IPython.display import HTML, Markdown

def set_css_in_cell_output(unused):
  display(HTML(""""""))

get_ipython().events.register('pre_run_cell', set_css_in_cell_output)


In [60]:
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part, SafetySetting

In [61]:
# Synapse client for data access
import synapseclient
from synapseclient import Synapse

# Google BigQuery and data handling
import pandas as pd
from google.cloud import bigquery
import pandas_gbq  # To read BigQuery data into pandas DataFrames


## Token for accessing Synapse API

In [88]:
token1="eyJ0eXAiOiJKV1QiLCJraWQiOiJXN05OOldMSlQ6SjVSSzpMN1RMOlQ3TDc6M1ZYNjpKRU9VOjY0NFI6VTNJWDo1S1oyOjdaQ0s6RlBUSCIsImFsZyI6IlJTMjU2In0.eyJhY2Nlc3MiOnsic2NvcGUiOlsidmlldyIsImRvd25sb2FkIiwibW9kaWZ5Il0sIm9pZGNfY2xhaW1zIjp7fX0sInRva2VuX3R5cGUiOiJQRVJTT05BTF9BQ0NFU1NfVE9LRU4iLCJpc3MiOiJodHRwczovL3JlcG8tcHJvZC5wcm9kLnNhZ2ViYXNlLm9yZy9hdXRoL3YxIiwiYXVkIjoiMCIsIm5iZiI6MTczMDkyNjQ5MSwiaWF0IjoxNzMwOTI2NDkxLCJqdGkiOiIxMzUxOSIsInN1YiI6IjM1MDUyODEifQ.EtYAIYe5ETCKZ_MwMgrnig1ygkeSMONvY9_a8kbfDdbhea5pn8dlRmGUKgV-llPzZqQQlijrITaoNclatU7EUpW_FUVPzH573pcGzOi2SfEwM7N-_dYddd5OgvUnLwcDmAFEAbYSEOtsm-h4CSfrZJaflhP08nSTzWEw30AYRvW28H-1KX_xEBDkOfZC1xyKDStcDdH3-02Tu-hP5POuUqcG2MKmRFco55xtnZ4vg98Izp-4QX-DbafqTO04DIsopYlm8SmCto_4ZoVdzJZNq3ZugcxN_6lb6CnEG6WWh81xIur8Fe9af8ghCTtFDrAENONx7bRlzeA_hsoLstv49w"
syn = synapseclient.login(authToken=token1)

Welcome, krithika.b!



INFO:synapseclient_default:Welcome, krithika.b!



## Mounting google drive to see if we can access any data from there

In [62]:
# Specify the path to the uploaded file
metadata_df = pd.read_csv('/content/HTAN_ST_metadata_ad.tsv', sep='\t')  # Use sep='\t' for TSV files

# Display the dataframe
metadata_df.head()



Unnamed: 0,Attribute:,Description
0,Filename:,Name of a file
1,Run ID:,A unique identifier for this individual run (t...
2,File Format:,"Format of a file (e.g. txt, csv, fastq, bam, e..."
3,HTAN Parent Biospecimen ID:,HTAN Biospecimen Identifier (eg HTANx_yyy_zzz)...
4,HTAN Data File ID:,Self-identifier for this data file - HTAN ID o...


In [63]:
PROJECT_ID = "isb-cgc-external-004"
LOCATION = "us-central1"

import vertexai
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [64]:
from vertexai.preview import reasoning_engines

In [76]:
generation_config = {
    "max_output_tokens": 8192,
    "temperature": 1,
    "top_p": 0.5,
}

safety_settings = [
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
]

def generate(question = """What can you do?"""):

    ## General prompt
    prompt = f"""
    You are a bioinformatics expert coder using the Human Tumor Network Atlas (HTAN)
    datasets via Google Cloud and Jupyter. You can write Python code to answer
    questions by writing BigQuery queries, pandas, scanpy, and squidpy code.
    When a user asks a question, identify whether you need to write any code to
    answer.
    Very important: !! For any general questions and conversations,
    you can answer with a conversation. For any questions that needs the code,
    just write and return the Python code enclosed in ```python``` tags so that
    the user can regex extract the code easily.
    You do not have to access any data. You are an expert coder, just write the
    code. When you're asked to load a dataset, use BigQuery to load it.

    Include all the necessary import packages to run the code. If using any
    Google Cloud services, you can use:
    project_id = "isb-cgc-external-004"

    Here are some examples:

    Question: Can you load the cells from the HTAN biospecimen HTA7_1_3?
    Answer:
    '''
    query = '''
    WITH cells AS (
      SELECT  CellID, X_centroid, Y_centroid,
      FROM `isb-cgc-bq.HTAN.imaging_level4_HMS_mel_mask_current`
      WHERE HTAN_Biospecimen_ID = 'HTA7_1_3'
    )
    SELECT CellID, X_centroid,  Y_centroid
    FROM cells
    '''
    df = pandas_gbq.read_gbq(query, project_id=project_id)
    '''


    """



    #### Synapse data download : Example 1

    prompt += f"""
    Question: Can you load the entityId of all data under HTAN.10xvisium_spatialtranscriptomics_scRNAseq_level4_metadata_current
    where the File_Format is hdf5?

    Answer:
    '''
    import pandas_gbq
    from google.cloud import bigquery

    project_id = "isb-cgc-external-004"

    query = '''
    SELECT entityId
    FROM `isb-cgc-bq.HTAN.10xvisium_spatialtranscriptomics_scRNAseq_level4_metadata_current`
    WHERE File_Format = 'hdf5'
    '''

    df = pandas_gbq.read_gbq(query, project_id=project_id)
    '''

    """

    ##  Synapse example 2

    prompt += f"""
    Question: Can you download the synapse data 'syn51133602' to /content/datasets/?
    Answer:
    '''
    import synapseclient
    syn = synapseclient.login()
    entity = syn.get('syn51133602', downloadLocation='/content/datasets')
    '''

    """

    ### Spatial example 1

    prompt += f"""
    Question: Can you categorize cells within a defined spatial region as either 'Tumor' or 'Other' based on threshold expression levels of specific markers (SOX10_cellRingMask, S100B_cellRingMask, and CD63_cellRingMask)?
    Answer:
    '''
    query = '''
      WITH cells AS (
      SELECT CellID, X_centroid, Y_centroid,
      IF (SOX10_cellRingMask > 3704.5 AND (S100B_cellRingMask > 7589.48 OR CD63_cellRingMask > 570.68),
      'Tumor', 'Other') AS celltype
      FROM `isb-cgc-bq.HTAN.imaging_level4_HMS_mel_mask_current`
      WHERE HTAN_Biospecimen_ID = 'HTA7_1_3')

      SELECT CellID, X_centroid, Y_centroid, celltype
      FROM cells
      WHERE X_centroid > 23076.9 AND X_centroid < 30384.6
      AND Y_centroid > 9615.3 AND Y_centroid < 15000
      '''

    """

    ### Spatial example 2
    prompt += f"""
    Question: Can you classify cells as 'Tumor' or 'Other', convert their pixel coordinates to geospatial points, and calculate distances between cell pairs that are within a 20-micrometer proximity threshold
    Answer:
    query = '''
      WITH geodat AS (
      SELECT CellID, X_centroid, Y_centroid,
      IF (SOX10_cellRingMask > 3704.5 AND (S100B_cellRingMask > 7589.48 OR CD63_cellRingMask > 570.68),
      'Tumor', 'Other') AS celltype,
      ST_GeogPoint(X_centroid / 368570, Y_centroid / 368570) AS p
      FROM `isb-cgc-bq.HTAN.imaging_level4_HMS_mel_mask_current`
      WHERE HTAN_Biospecimen_ID = 'HTA7_1_3'
      )
    SELECT t1.CellID, t1.X_centroid, t1.Y_centroid, t1.p, t1.celltype,
    t2.CellID AS CellID_1, t2.X_centroid AS X_centroid_1, t2.Y_centroid AS Y_centroid_1, t2.p AS p_1, t2.celltype AS celltype_1,
    ST_Distance(t1.p, t2.p) AS Distance
    FROM geodat AS t1
    JOIN geodat AS t2
    ON ST_DWithin(t1.p, t2.p, 9.29324770787722)
    '''


    """

    ### Spatial example 3
    prompt += f"""
    Question: For each tumor cell within the specified spatial region, calculate the number of neighboring tumor cells
    Answer:
    query = '''
      WITH cellp AS (
      SELECT CellID, celltype, CellID_1, celltype_1
      FROM `isb-cgc-bq.temp15432.Melanoma_CyCIF_HTA7_1_3_points_within_20um`
      WHERE X_centroid > 23076.9 AND X_centroid < 30384.6
      AND Y_centroid > 9615.3 AND Y_centroid < 15000)

      SELECT CellID, COUNTIF(celltype_1 = 'Tumor') - 1 AS N_Tumor_Cells
      FROM cellp
      WHERE celltype = 'Tumor'
      GROUP BY CellID
    '''

    """


    ## Metadata

    # Create a prompt with the file content included
    prompt += f"""
    The HTAN metadata contains a subset of these type of attributes. Also included
    is a description of these attributes. Internalize this information and use it
    to answer any queries related to metadata.

    Metadata Description:
    {metadata_df}

    Question: Which attributes describe the spatial location of cells?
    Answer:

    """



    ## ScRNAseq example 1
    prompt += f"""
    Question: What are the counts of unique cells, sex groupings, samples,
    cell types, and therapies by development stage in the MSK scRNAseq dataset?
    Answer:
    '''
    query = '''
    SELECT
      development_stage,
      count(distinct(iObs)) AS Number_Cells,
      count(distinct(sex)) AS Unique_Sex_Grouping,
      count(distinct(donor_id)) AS Number_Samples,
      count(distinct(cell_type)) AS Number_Cell_Types,
      count(distinct(treatment)) AS Number_Therapies
    FROM
      `isb-cgc-bq.HTAN.scRNAseq_MSK_SCLC_combined_samples_current`
    GROUP BY development_stage
    ORDER BY Number_Samples DESC
    '''
    df = pandas_gbq.read_gbq(query, project_id=project_id)
    '''

    """

    ## ScRNAseq example 2
    prompt += f"""
    Question: How many unique cell types, sex groupings, cells, and samples
    are present in the a specific human stage of development in the MSK
    scRNAseq dataset (e.g. 74-year-old)?
    Answer:
    '''
    query = '''
    SELECT
      cell_type,
      count(distinct(sex)) AS Unique_Sex_Grouping,
      count(distinct(iObs)) AS Number_Cells,
      count(distinct(donor_id)) AS Number_Samples
    FROM
      `isb-cgc-bq.HTAN.scRNAseq_MSK_SCLC_combined_samples_current`
    WHERE
      development_stage = '74-year-old human stage'
    GROUP BY cell_type
    '''
    df = pandas_gbq.read_gbq(query, project_id=project_id)
    '''

    """

    ## ScRNAseq example 3
    prompt += f"""
    Question: How many genes and cells are associated with each sex and
    cell type in the MSK scRNAseq dataset for an individual
    (e.g. a 74-year-old human stage)?
    Answer:
    '''
    query = '''
    SELECT
      sex,
      Cell_Type,
      count(distinct(feature_name)) AS Number_Genes,
      count(distinct(iObs)) AS Number_Cells
    FROM
      `isb-cgc-bq.HTAN.scRNAseq_MSK_SCLC_combined_samples_current`
    WHERE development_stage = '74-year-old human stage'
    GROUP BY sex, Cell_Type
    ORDER BY Cell_Type DESC
    '''
    df = pandas_gbq.read_gbq(query, project_id=project_id)
    '''

    """

    ## ScRNAseq example 4
    prompt += f"""
    Question: How many genes and cells are there in each Seurat Cluster
    for males and females of the 'epithelial cell' type in the specific
    human stage (here 74-year-old)?
    Answer:
    '''
    query = '''
    SELECT
      sex,
      clusters,
      Cell_Type,
      count(distinct(feature_name)) AS Number_Genes,
      count(distinct(iObs)) AS Number_Cells
    FROM
      `isb-cgc-bq.HTAN.scRNAseq_MSK_SCLC_combined_samples_current`
    WHERE development_stage = '74-year-old human stage' AND Cell_Type = 'epithelial cell'
    GROUP BY sex, clusters, Cell_Type
    ORDER BY clusters ASC
    '''
    df = pandas_gbq.read_gbq(query, project_id=project_id)
    '''

    """

   ## ScRNAseq example 5
    prompt += f"""
    Question: How do the average expression values for genes differ between
    male and female epithelial cells in a specific cluster, and which genes
    show the greatest differences (here cluster 41 of the 74-year-old
    human stage)?
    Answer:
    '''
    query = '''
    SELECT
      A.feature_name,
      A.avg_counts_clust10 AS female_avg_counts,
      B.avg_counts_clust10 AS male_avg_counts,
      A.avg_counts_clust10 - B.avg_counts_clust10 AS mean_diff
    FROM (
      SELECT
        feature_name,
        AVG(X_value) AS avg_counts_clust10
      FROM
        `isb-cgc-bq.HTAN.scRNAseq_MSK_SCLC_combined_samples_current`
      WHERE development_stage = '74-year-old human stage' AND Cell_Type = 'epithelial cell' AND clusters = '41' AND sex = 'female'
      GROUP BY feature_name
    ) AS A
    INNER JOIN (
      SELECT
        feature_name,
        AVG(X_value) AS avg_counts_clust10
      FROM
        `isb-cgc-bq.HTAN.scRNAseq_MSK_SCLC_combined_samples_current`
      WHERE development_stage = '74-year-old human stage' AND Cell_Type = 'epithelial cell' AND clusters = '41' AND sex = 'male'
      GROUP BY feature_name
    ) AS B
    ON A.feature_name = B.feature_name
    ORDER BY mean_diff DESC
    '''
    df = pandas_gbq.read_gbq(query, project_id=project_id)
    '''

    """


    # Add the dynamic user question and the placeholder for the answer
    prompt += f"""
    Question: {question}
    Answer:
    """


    # Model information
    vertexai.init(project="isb-cgc-external-004", location="us-central1")
    model = GenerativeModel(
        "gemini-1.5-flash-002",
    )

    # Responses
    responses = model.generate_content(
        [prompt],
        generation_config=generation_config,
        safety_settings=safety_settings,
        stream=False,
        # tools='code_execution'
    )

    # for response in responses:
    #     print(response.text, end="")

    pattern = r'```python\n(.*?)\n```'

    # Extract the code
    match = re.search(pattern, responses.text, re.DOTALL)

    if match:
        python_code = match.group(1).strip()
        print(python_code)
        return(python_code)

    else:
        print(responses)



In [None]:
# Test 1
code=generate('Who are you and what can you do?')
print(code)

In [67]:
# Test 2
code=generate("Can you load the cells from the HTAN biospecimen HTA7_1_3?")




import pandas_gbq

project_id = "isb-cgc-external-004"

query = """
WITH cells AS (
  SELECT  CellID, X_centroid, Y_centroid,
  FROM `isb-cgc-bq.HTAN.imaging_level4_HMS_mel_mask_current`
  WHERE HTAN_Biospecimen_ID = 'HTA7_1_3'
)
SELECT CellID, X_centroid,  Y_centroid
FROM cells
"""

df = pandas_gbq.read_gbq(query, project_id=project_id)


In [68]:
exec(code)

Downloading: 100%|[32m██████████[0m|


In [None]:
print(df)

          CellID    X_centroid    Y_centroid
0         414954  12234.156425  11496.055866
1         414965  17100.977778  11491.444444
2         414982  20372.921053  11492.671053
3         415004  30851.258170  11497.666667
4         415185  10789.783784  11496.283784
...          ...           ...           ...
1110580  1099321  29743.450000  23012.790000
1110581  1099373   8805.850649  23025.246753
1110582  1099466  24451.210526  23035.491228
1110583  1099530  29740.808511  23049.994681
1110584  1099567  10085.244186  23054.709302

[1110585 rows x 3 columns]


In [70]:
# Test 3
code=generate("Can you load the cells from the HTAN biospecimen HTA7_8_2?")
exec(code)

import pandas_gbq

project_id = "isb-cgc-external-004"

query = """
WITH cells AS (
  SELECT  CellID, X_centroid, Y_centroid,
  FROM `isb-cgc-bq.HTAN.imaging_level4_HMS_mel_mask_current`
  WHERE HTAN_Biospecimen_ID = 'HTA7_8_2'
)
SELECT CellID, X_centroid,  Y_centroid
FROM cells
"""

df = pandas_gbq.read_gbq(query, project_id=project_id)
Downloading: 100%|[32m██████████[0m|


In [71]:
print(df)

        CellID    X_centroid    Y_centroid
0       271194  28345.181303  22522.594901
1       271210   8021.079452  22523.334247
2       271276   6641.683544  22525.746835
3       271353   5904.687980  22531.846547
4       271388   4592.990741  22528.296296
...        ...           ...           ...
352915  270827   4276.356322  22506.913793
352916  270863  10172.643750  22508.637500
352917  270889  26049.331034  22509.006897
352918  270893   5114.777778  22508.931624
352919  271067  11497.290179  22517.410714

[352920 rows x 3 columns]


In [77]:
## Test 3
code=generate("What kind of information is captured in the HTAN spatial transcriptomics metadata file?")

candidates {
  content {
    role: "model"
    parts {
      text: "The HTAN spatial transcriptomics metadata file contains information about the spatial location of cells, including attributes such as `Spatial Barcode Length`, `UMI Barcode Offset`, `UMI Barcode Length`, and potentially coordinates if available in the specific dataset.  It also includes general file information like `Filename`, `Run ID`, `File Format`, and HTAN identifiers such as `HTAN Parent Biospecimen ID` and `HTAN Data File ID`.  The exact attributes will vary depending on the specific dataset.\n"
    }
  }
  finish_reason: STOP
  avg_logprobs: -0.10189473395254098
}
usage_metadata {
  prompt_token_count: 3155
  candidates_token_count: 102
  total_token_count: 3257
}
model_version: "gemini-1.5-flash-002"



In [78]:
print(code)

None


In [79]:
## Test 4
code=generate("What kind of information is captured in the Synapse database?")

candidates {
  content {
    role: "model"
    parts {
      text: "The Synapse database contains a wide variety of data types relevant to biomedical research.  This includes, but is not limited to, genomic data (e.g., gene expression, mutations), imaging data (e.g., microscopy images, histology slides), clinical data (e.g., patient demographics, treatment information), and metadata describing the experimental procedures and data processing steps.  The specific contents vary depending on the individual datasets stored within Synapse.  To get a precise answer about the contents of a specific Synapse ID, you\'ll need to provide that ID.\n"
    }
  }
  finish_reason: STOP
  avg_logprobs: -0.17099447908072635
}
usage_metadata {
  prompt_token_count: 3150
  candidates_token_count: 116
  total_token_count: 3266
}
model_version: "gemini-1.5-flash-002"



In [80]:
### Test 6

code=generate("Labels cells as 'Tumor' or 'Other' based on the expression of certain markers such as CD63_cellRingMask")


import pandas_gbq
from google.cloud import bigquery

project_id = "isb-cgc-external-004"

query = """
SELECT
    CellID,
    CASE
        WHEN CD63_cellRingMask > 570.68 THEN 'Tumor'
        ELSE 'Other'
    END AS cell_type
FROM
    `isb-cgc-bq.HTAN.imaging_level4_HMS_mel_mask_current`
"""

df = pandas_gbq.read_gbq(query, project_id=project_id)


In [81]:
exec(code)
print(df)

Downloading: 100%|[32m██████████[0m|
         CellID cell_type
0        271351     Other
1        271535     Other
2        271733     Other
3        272229     Other
4        272613     Other
...         ...       ...
6282743   83619     Other
6282744  201761     Other
6282745  450282     Other
6282746   71082     Other
6282747    5520     Other

[6282748 rows x 2 columns]


In [87]:
# Question 1
print("Question 1")
code = generate("Can you load the cells from the HTAN biospecimen HTA7_1_3?")
exec(code)

# Question 2
print("Question 2")
code = generate("Can you load the entityId of all data under HTAN.10xvisium_spatialtranscriptomics_scRNAseq_level4_metadata_current where the File_Format is hdf5?")
exec(code)

# Question 4
print("Question 4")
code = generate("Label cells as 'Tumor' or 'Other' based on the expression of certain markers (SOX10_cellRingMask, S100B_cellRingMask, and CD63_cellRingMask). Restricts the results to cells within a specific spatial region.")
exec(code)




Question 1
import pandas_gbq

project_id = "isb-cgc-external-004"

query = """
WITH cells AS (
  SELECT  CellID, X_centroid, Y_centroid,
  FROM `isb-cgc-bq.HTAN.imaging_level4_HMS_mel_mask_current`
  WHERE HTAN_Biospecimen_ID = 'HTA7_1_3'
)
SELECT CellID, X_centroid,  Y_centroid
FROM cells
"""

df = pandas_gbq.read_gbq(query, project_id=project_id)

Downloading:   0%|[32m          [0m|[A
Downloading:  11%|[32m█         [0m|[A
Downloading:  21%|[32m██        [0m|[A
Downloading:  32%|[32m███▏      [0m|[A
Downloading:  42%|[32m████▏     [0m|[A
Downloading:  53%|[32m█████▎    [0m|[A
Downloading:  63%|[32m██████▎   [0m|[A
Downloading:  74%|[32m███████▎  [0m|[A
Downloading:  84%|[32m████████▍ [0m|[A
Downloading:  95%|[32m█████████▍[0m|[A
Downloading: 100%|[32m██████████[0m|
Question 2
import pandas_gbq
from google.cloud import bigquery

project_id = "isb-cgc-external-004"

query = '''
SELECT entityId
FROM `isb-cgc-bq.HTAN.10xvisium_spatialtranscriptomics_scRNA

In [None]:
# Question 3
code = generate("Can you download the synapse data 'syn51133602' to /content/datasets/?")


In [None]:
# Question 5
print("Question 5")
code = generate("Labels cells as 'Tumor' or 'Other', and converts pixel coordinates to geospatial points. Calculate the distance between cell pairs that fall within a 20-micrometer threshold.")
exec(code)

# Question 6
print("Question 6")
code = generate("Calculates the number of neighboring tumor cells for each tumor cell within the specified spatial region.")
exec(code)

# Question 7
print("Question 7")
code = generate("What are the counts of unique cells, sex groupings, samples, cell types, and therapies by development stage in the MSK scRNAseq dataset?")
exec(code)



Question 5
import pandas_gbq
from google.cloud import bigquery

project_id = "isb-cgc-external-004"

query = """
WITH geodat AS (
  SELECT
    CellID,
    X_centroid,
    Y_centroid,
    IF(
      SOX10_cellRingMask > 3704.5
      AND (S100B_cellRingMask > 7589.48 OR CD63_cellRingMask > 570.68),
      'Tumor',
      'Other'
    ) AS celltype,
    ST_GeogPoint(X_centroid / 368570, Y_centroid / 368570) AS p
  FROM
    `isb-cgc-bq.HTAN.imaging_level4_HMS_mel_mask_current`
  WHERE HTAN_Biospecimen_ID = 'HTA7_1_3'
)
SELECT
  t1.CellID,
  t1.X_centroid,
  t1.Y_centroid,
  t1.p,
  t1.celltype,
  t2.CellID AS CellID_1,
  t2.X_centroid AS X_centroid_1,
  t2.Y_centroid AS Y_centroid_1,
  t2.p AS p_1,
  t2.celltype AS celltype_1,
  ST_Distance(t1.p, t2.p) AS Distance
FROM
  geodat AS t1
JOIN geodat AS t2 ON ST_DWithin(t1.p, t2.p, 9.29324770787722)

"""

df = pandas_gbq.read_gbq(query, project_id=project_id)


In [None]:
# Question 8
print("Question 8")
code = generate("How many unique cell types, sex groupings, cells, and samples are present in a specific human stage of development in the MSK scRNAseq dataset (e.g. 74-year-old)?")
exec(code)

# Question 9
print("Question 9")
code = generate("How many genes and cells are associated with each sex and cell type in the MSK scRNAseq dataset for an individual (e.g. a 74-year-old human stage)?")
exec(code)

# Question 10
print("Question 10")
code = generate("How many genes and cells are there in each Seurat Cluster for males and females of the 'epithelial cell' type in the specific human stage (here 74-year-old)?")
exec(code)