## Load Data


In [7]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns

# Set plotting style: figure.dpi controls on-screen figures, savefig.dpi
# controls files written with savefig, so exported figures stay high-res.
plt.rcParams.update({
    'figure.dpi': 80,
    'figure.facecolor': 'white',
    'savefig.dpi': 300,
})

# Location of the AnnData (.h5ad) file to analyse. The default is the path
# this notebook was originally run with; set the SCIPLEX3_H5AD environment
# variable to point at a copy elsewhere instead of editing this cell.
SCIPLEX3_H5AD = os.environ.get(
    'SCIPLEX3_H5AD',
    '/home/kirandeav/projects/drug-response-predictor/data/raw/SrivatsanTrapnell2020_sciplex3.h5ad',
)

# Load data
# NOTE(review): the whole file is read into memory here. If that is too
# heavy for the target machine, read_h5ad's `backed` option may help --
# confirm that downstream cells work with a backed AnnData before switching.
adata = sc.read_h5ad(SCIPLEX3_H5AD)

# adata.shape is (n_obs, n_vars), i.e. (cells, genes).
print(f"Dataset shape: {adata.shape}")
print(f"Number of cells: {adata.n_obs}")
print(f"Number of genes: {adata.n_vars}")


Dataset shape: (799317, 110984)
Number of cells: 799317
Number of genes: 110984


## Explore Metadata

In [10]:
# Explore metadata: list the per-cell annotation columns, preview the first
# rows, then summarise the treatment and cell-line annotations.
print(" Available metadata columns:")
print(adata.obs.columns.tolist())

print("\n" + "="*50)
print("First few rows of metadata:")
print(adata.obs.head(10))

print("\n" + "="*50)
# Check what columns exist for treatments.
# The name of the treatment column is not assumed: each candidate is tried in
# priority order and only the first one present is summarised. Each entry is
# (column name, plural noun used in the printed summary).
TREATMENT_COLUMNS = [
    ('product_name', 'drugs'),
    ('condition', 'conditions'),
    ('perturbation', 'perturbations'),
]
for treatment_column, noun in TREATMENT_COLUMNS:
    if treatment_column in adata.obs.columns:
        treatments = adata.obs[treatment_column]
        print(f"Number of unique {noun}: {treatments.nunique()}")
        print(f"\nTop 20 {noun} by cell count:")
        print(treatments.value_counts().head(20))
        break
else:
    # No candidate matched: list every column so the right one can be
    # picked by eye.
    print("Column names available:")
    for col in adata.obs.columns:
        print(f"  - {col}")

print("\n" + "="*50)
# Check for cell line info
if 'cell_line' in adata.obs.columns:
    print(f"Cell lines: {adata.obs['cell_line'].unique()}")
    print("\nCells per cell line:")
    print(adata.obs['cell_line'].value_counts())
else:
    # Fall back to a cell-type column. Both spellings are accepted because
    # the metadata loaded here names it 'celltype', not 'cell_type'.
    cell_type_column = next(
        (name for name in ('cell_type', 'celltype') if name in adata.obs.columns),
        None,
    )
    if cell_type_column is not None:
        print(f"Cell types: {adata.obs[cell_type_column].value_counts()}")
    else:
        # Say so explicitly rather than leaving the section silently empty.
        print("No cell line or cell type column found")




 Available metadata columns:
['ncounts', 'well', 'plate', 'cell_line', 'replicate', 'time', 'dose_value', 'pathway_level_1', 'pathway_level_2', 'perturbation', 'target', 'pathway', 'dose_unit', 'celltype', 'disease', 'cancer', 'tissue_type', 'organism', 'perturbation_type', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts', 'chembl-ID']

First few rows of metadata:
                              ncounts        well    plate cell_line  \
cell_barcode                                                           
A01_E09_RT_BC_100_Lig_BC_147     2957   plate6_A9  plate44      MCF7   
A01_E09_RT_BC_100_Lig_BC_186     1528   plate8_H3  plate46      MCF7   
A01_E09_RT_BC_100_Lig_BC_196     1881   plate3_C2  plate41      MCF7   
A01_E09_RT_BC_100_Lig_BC_213     1700   plate9_E3  plate51      A549   
A01_E09_RT_BC_100_Lig_BC_220     1430  plate8_H10  plate30      K562   
A01_E09_RT_BC_100_Lig_BC_227      613   plate9_G9  plate47      MCF7   
A01_E09_RT_BC_100_Lig_BC_245     3094   plate9_C3  plat