In [None]:
# Install the third-party packages this notebook needs into the kernel's environment.
# google-cloud-bigquery and synapseclient are imported in the imports cell of this notebook.
%pip install google-cloud-bigquery
%pip install synapseclient
# NOTE(review): the reason for pinning protobuf to 3.20.1 is not visible in this notebook — confirm it is still required.
%pip install protobuf==3.20.1 
# presumably needed by BigQuery's `.to_dataframe()` conversion used in the query cells — verify.
%pip install db-dtypes

In [None]:
import sevenbridges as sbg
import os
from google.cloud import bigquery
import synapseclient
import subprocess
import json
import pandas as pd
import numpy  as np 
import gzip
from scipy.io import mmread
import anndata as ad
from scipy.sparse import csr_matrix
import scanpy as sc

In [None]:
# When running in a local environment, first run this command in a terminal:
#   gcloud auth application-default login

In [None]:
# Instantiate the Synapse client and log in.
# The credential is read from the SYNAPSE_AUTH_TOKEN environment variable (a Synapse
# personal access token) so that no username/password is stored in the notebook.
# NOTE(review): when the variable is unset, authToken is None and login() is expected to
# fall back to the client's other configured credentials (e.g. ~/.synapseConfig) —
# confirm for the installed synapseclient version.
syn = synapseclient.Synapse()
syn.login(authToken=os.environ.get('SYNAPSE_AUTH_TOKEN'))

# Get the project ID and project number of each Google Cloud project visible to gcloud

def list_projects():
    """List the Google Cloud projects visible to the gcloud CLI.

    Runs ``gcloud projects list --format=json`` and parses its standard output.

    Returns:
        The parsed JSON document produced by gcloud, or ``None`` when the
        command cannot be started (e.g. gcloud is not installed), exits with
        a non-zero status, or prints output that is not valid JSON. The error
        is printed in every failure case.
    """
    command = ["gcloud", "projects", "list", "--format=json"]
    try:
        # check=True turns a non-zero exit status into CalledProcessError.
        result = subprocess.run(command, capture_output=True, text=True, check=True)
        return json.loads(result.stdout)
    except (subprocess.CalledProcessError, OSError, json.JSONDecodeError) as e:
        # OSError covers a missing or non-executable gcloud binary;
        # JSONDecodeError covers unexpected non-JSON output.
        print(f"Error: {e}")
        return None

# Fetch the projects once, then report each one (or the failure).
project_list = list_projects()

if not project_list:
    # None (command failed) and an empty list both end up here.
    print("Failed to list projects.")
else:
    print("Project List:")
    for project in project_list:
        print(f"Project ID: {project['projectId']}, Project Number: {project['projectNumber']}")


# Create the BigQuery client used by the query cells below.
# `bigquery` is already imported in the imports cell at the top of the notebook,
# so it is not imported again here.
# The Google project billed for this notebook's computations is taken from the
# GOOGLE_CLOUD_PROJECT environment variable; when that is unset, the project ID
# that was previously hard-coded here is used.
google_project = os.environ.get('GOOGLE_CLOUD_PROJECT', 'aesthetic-joy-397623')
client = bigquery.Client(project=google_project)

In [None]:
# Collect the metadata rows of all HTAPP MERFISH imaging files, ordered by file name.
# The SQL selects a.entityId twice; later cells read the second copy through the
# 'entityId_1' column (presumably BigQuery's de-duplicated name), so it must stay.
# Note: 'ImagingLevel2' is not part of the Component filter.
HTAPP_img = (
    client.query("""
SELECT DISTINCT
  a.HTAN_Center,
  a.Filename,
  a.entityId,
  a.Component,
  a.HTAN_Participant_ID,
  a.entityId,
  b.Tissue_or_Organ_of_Origin
FROM
  `isb-cgc-bq.HTAN.id_provenance_current` a
JOIN
  `isb-cgc-bq.HTAN.clinical_tier1_diagnosis_current` b
ON
  a.HTAN_Participant_ID = b.HTAN_Participant_ID
WHERE
  a.HTAN_Center = 'HTAN HTAPP' AND
  a.Component IN ('ImagingLevel3Segmentation', 'ImagingLevel4');

""")
    .result()
    .to_dataframe()
    .sort_values(by='Filename')
)

# Show the sorted table as the cell output.
HTAPP_img

In [None]:
# Metadata rows of all HTAPP files whose Component is 'ScRNA-seqLevel4'.
HTAPP_scrna_L4 = (
    client.query("""

SELECT DISTINCT
  a.HTAN_Center,
  a.Filename,
  a.entityId,
  a.Component,
  a.HTAN_Participant_ID,
  a.entityId,
  b.Tissue_or_Organ_of_Origin
FROM
  `isb-cgc-bq.HTAN.id_provenance_current` a
JOIN
  `isb-cgc-bq.HTAN.clinical_tier1_diagnosis_current` b
ON
  a.HTAN_Participant_ID = b.HTAN_Participant_ID
WHERE
  a.HTAN_Center = 'HTAN HTAPP' AND
  a.Component = 'ScRNA-seqLevel4';
""")
    .result()
    .to_dataframe()
)

# Participants that appear in both the imaging table and the level-4 scRNA-seq table.
patient_intersection = set(HTAPP_img['HTAN_Participant_ID']) & set(HTAPP_scrna_L4['HTAN_Participant_ID'])

# Keep only the rows of those shared participants, ordered by file name.
HTAPP_scrna_L4_intersection = (
    HTAPP_scrna_L4
    .loc[lambda rows: rows['HTAN_Participant_ID'].isin(patient_intersection)]
    .sort_values(by='Filename')
)
HTAPP_scrna_L4_intersection


# Download the two cell type tables from Synapse and stack them into one frame.
syn26127156 = syn.get(entity='syn26127156').path  # Cell type file
syn26127157 = syn.get(entity='syn26127157').path  # Cell type file
cell_type1 = pd.read_table(syn26127156)
cell_type2 = pd.read_table(syn26127157)
cell_type = pd.concat([cell_type1, cell_type2])

# Split 'Biospecimen' at its last underscore: the left part is used as the participant ID,
# the right part ('a1') is discarded when the per-participant tables are written.
# This does not depend on the participant, so it is done once instead of once per loop pass.
# `n` is passed by keyword because recent pandas releases no longer accept it positionally.
cell_type[['HTAN_Participant_ID', 'a1']] = cell_type['Biospecimen'].str.rsplit('_', n=1, expand=True)

# Write one CSV per participant, indexed by 'NAME'.
# unique() avoids rewriting the same file for every row of a participant.
for i in HTAPP_scrna_L4_intersection['HTAN_Participant_ID'].unique():
    # Build a new frame instead of editing a .loc slice in place (avoids SettingWithCopyWarning).
    a = (
        cell_type.loc[cell_type['HTAN_Participant_ID'] == i]
        .set_index('NAME')
        .drop(columns=['a1'])
    )
    save_path = '/Users/zhaox/OHSU/HTAN_12042023/DATA/scRNA-seq/CellType/' + i + '_cell_type.csv'
    a.to_csv(save_path)

In [None]:
# Metadata rows of all HTAPP files whose Component is 'ScRNA-seqLevel3'.
HTAPP_scrna_L3 = (
    client.query("""

SELECT DISTINCT
  a.HTAN_Center,
  a.Filename,
  a.entityId,
  a.Component,
  a.HTAN_Participant_ID,
  a.entityId,
  b.Tissue_or_Organ_of_Origin
FROM
  `isb-cgc-bq.HTAN.id_provenance_current` a
JOIN
  `isb-cgc-bq.HTAN.clinical_tier1_diagnosis_current` b
ON
  a.HTAN_Participant_ID = b.HTAN_Participant_ID
WHERE
  a.HTAN_Center = 'HTAN HTAPP' AND
  a.Component = 'ScRNA-seqLevel3'
""")
    .result()
    .to_dataframe()
)

# Participants that appear in both the imaging table and the level-3 scRNA-seq table.
patient_intersection = set(HTAPP_img['HTAN_Participant_ID']) & set(HTAPP_scrna_L3['HTAN_Participant_ID'])

# Keep only the rows of those shared participants, ordered by file name.
HTAPP_scrna_L3_intersection = (
    HTAPP_scrna_L3
    .loc[lambda rows: rows['HTAN_Participant_ID'].isin(patient_intersection)]
    .sort_values(by='Filename')
)

# Show the filtered table as the cell output.
HTAPP_scrna_L3_intersection

In [None]:
# Distinct values of the 'entityId_1' column, in order of first appearance,
# for the imaging table and for the filtered level-3 scRNA-seq table.
HTAPP_img_syn = HTAPP_img.loc[:, 'entityId_1'].unique()
HTAPP_scrna_L3_intersection_syn = HTAPP_scrna_L3_intersection.loc[:, 'entityId_1'].unique()

# Show the imaging IDs as the cell output.
HTAPP_img_syn

In [None]:
# Extract HTAPP single cell data
# The Synapse IDs are consumed in consecutive triplets: [3*i] is read as the barcode
# table, [3*i+1] as the feature table and [3*i+2] as the Matrix Market file.
# NOTE(review): this relies on the order of HTAPP_scrna_L3_intersection_syn — confirm
# that it really groups barcodes/features/matrix per sample.
N_SCRNA_SAMPLES = 16  # number of triplets to convert (value kept from the original loop)

for i in range(N_SCRNA_SAMPLES):
    # Each reader is given the file path directly. The previous `with gzip.open(...)`
    # wrappers opened a handle that was never read from, so they have been removed.
    barcode_path = syn.get(entity=HTAPP_scrna_L3_intersection_syn[3*i]).path
    barcode = pd.read_csv(barcode_path, header=None)
    feature_path = syn.get(entity=HTAPP_scrna_L3_intersection_syn[3*i+1]).path
    feature = pd.read_table(feature_path, header=None)[1]  # second column of the feature table
    matrix_path = syn.get(entity=HTAPP_scrna_L3_intersection_syn[3*i+2]).path
    matrix = mmread(matrix_path)

    # The matrix is transposed so that rows are named by barcodes and columns by features.
    adata = sc.AnnData(matrix.T)
    adata.obs_names = np.array(barcode).reshape(barcode.shape[0])
    adata.var_names = np.array(feature)
    adata.X = csr_matrix(adata.X)

    # Output name = barcode file name up to its last underscore. Taking the base name
    # first keeps underscores or separators in the directory part from affecting it.
    sample_name = os.path.basename(barcode_path).rsplit('_', 1)[0]
    save_path = '/Users/zhaox/OHSU/HTAN_12042023/DATA/scRNA-seq/' + sample_name + '.h5ad'
    adata.write_h5ad(save_path)

In [None]:
# Extract HTAPP merfish data
# The file names of HTAPP_img are cut into three equally sized consecutive groups; after
# the transpose, row i holds the i-th name of each group. Column 0 is read as the
# segmentation CSV, column 1 as the spot CSV and column 2 as the count .h5ad file.
# NOTE(review): this relies on the Filename sort order of HTAPP_img — confirm the grouping.
# reshape(3, -1) yields the same (3, 12) layout for 36 file names without hard-coding 12.
HTAPP_img_filename = HTAPP_img['Filename'].to_numpy().reshape(3, -1).T


def _merfish_local_path(filename):
    """Fetch the Synapse entity recorded for ``filename`` and return its local path.

    The first 'entityId_1' value among the HTAPP_img rows with that file name is used.
    """
    entity_id = HTAPP_img.loc[HTAPP_img['Filename'] == filename, 'entityId_1'].iloc[0]
    return syn.get(entity=entity_id).path


for i in range(HTAPP_img_filename.shape[0]):
    segmentation_path = _merfish_local_path(HTAPP_img_filename[i, 0])
    segmentation = pd.read_csv(segmentation_path, index_col=0, header=0)

    # The spot table is not used to build the output file; it is still loaded so that
    # `spot` stays available for inspection, as before.
    spot_path = _merfish_local_path(HTAPP_img_filename[i, 1])
    spot = pd.read_csv(spot_path, index_col=0, header=0)

    count_path = _merfish_local_path(HTAPP_img_filename[i, 2])
    count = ad.read_h5ad(count_path)

    # Replace the per-observation annotations with the segmentation table.
    count.obs = segmentation

    # Output name = part of the segmentation file name between its first underscore and
    # the following dot. Taking the base name first keeps underscores in the directory
    # part of the path from being mistaken for that separator.
    sample_name = os.path.basename(segmentation_path).split('_', 1)[1].split('.', 1)[0]
    save_path = '/Users/zhaox/OHSU/HTAN_12042023/DATA/Merfish/' + sample_name + '.h5ad'
    count.write_h5ad(save_path)