# AlphaGenome

Experimenting with AlphaGenome

In [1]:
from alphagenome.models.variant_scorers import GeneMaskLFCScorer
from alphagenome.models.dna_client import OutputType
from alphagenome.data import genome
from alphagenome.models import variant_scorers, dna_client

  from .autonotebook import tqdm as notebook_tqdm
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
alphagenome_api_key = "AIzaSyCir34AonB_Q0y7aHJx1od6Z5SJrf5Z1mg"
model = dna_client.create(alphagenome_api_key)

In [4]:
# Define the DNMT3A R882C variant
variant = genome.Variant(
    chromosome="chr2", position=25234373, reference_bases="C", alternate_bases="T"
)

# Create interval around the variant
sequence_length = dna_client.SUPPORTED_SEQUENCE_LENGTHS["SEQUENCE_LENGTH_100KB"]
interval = variant.reference_interval.resize(sequence_length)

# Create the RNA-seq LFC scorer
rna_lfc_scorer = GeneMaskLFCScorer(requested_output=OutputType.RNA_SEQ)

# Score the variant for RNA-seq
rna_scores = model.score_variant(
    interval=interval, variant=variant, variant_scorers=[rna_lfc_scorer]
)

# Convert to dataframe
rna_df = variant_scorers.tidy_scores(rna_scores)
rna_df

Unnamed: 0,variant_id,scored_interval,gene_id,gene_name,gene_type,gene_strand,junction_Start,junction_End,output_type,variant_scorer,track_name,track_strand,Assay title,ontology_curie,biosample_name,biosample_type,gtex_tissue,raw_score,quantile_score
0,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000115138,POMC,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),CL:0000047 polyA plus RNA-seq,-,polyA plus RNA-seq,CL:0000047,neuronal stem cell,in_vitro_differentiated_cells,,0.000042,0.160114
1,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000115138,POMC,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),CL:0000062 total RNA-seq,-,total RNA-seq,CL:0000062,osteoblast,primary_cell,,0.000001,0.034594
2,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000115138,POMC,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),CL:0000084 polyA plus RNA-seq,-,polyA plus RNA-seq,CL:0000084,T-cell,primary_cell,,-0.000015,-0.092027
3,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000115138,POMC,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),CL:0000084 total RNA-seq,-,total RNA-seq,CL:0000084,T-cell,primary_cell,,-0.000005,-0.046111
4,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000115138,POMC,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),CL:0000115 total RNA-seq,-,total RNA-seq,CL:0000115,endothelial cell,in_vitro_differentiated_cells,,0.000056,0.160114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1183,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000230452,LINC01381,lncRNA,+,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),UBERON:0018115 polyA plus RNA-seq,.,polyA plus RNA-seq,UBERON:0018115,left renal pelvis,tissue,,-0.000320,-0.322598
1184,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000230452,LINC01381,lncRNA,+,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),UBERON:0018116 polyA plus RNA-seq,.,polyA plus RNA-seq,UBERON:0018116,right renal pelvis,tissue,,0.000154,0.215740
1185,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000230452,LINC01381,lncRNA,+,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),UBERON:0018117 polyA plus RNA-seq,.,polyA plus RNA-seq,UBERON:0018117,left renal cortex interstitium,tissue,,0.000219,0.248480
1186,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000230452,LINC01381,lncRNA,+,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),UBERON:0018118 polyA plus RNA-seq,.,polyA plus RNA-seq,UBERON:0018118,right renal cortex interstitium,tissue,,-0.000208,-0.248480


In [5]:
print(f"Number of predictions: {len(rna_df)}")

# Summary statistics
print(f"Mean LFC: {rna_df['raw_score'].mean():.6f}")
print(f"Median LFC: {rna_df['raw_score'].median():.6f}")
print(f"Min LFC: {rna_df['raw_score'].min():.6f}")
print(f"Max LFC: {rna_df['raw_score'].max():.6f}")

# Count effects
upregulated = (rna_df["raw_score"] > 0).sum()
downregulated = (rna_df["raw_score"] < 0).sum()
print(f"Upregulated: {upregulated}")
print(f"Downregulated: {downregulated}")

# Top effects
top_up = rna_df.nlargest(5, "raw_score")[["biosample_name", "raw_score"]]
top_down = rna_df.nsmallest(5, "raw_score")[["biosample_name", "raw_score"]]

print("Top upregulated:")
display(top_up)
print("Top downregulated:")
display(top_down)

rna_df.to_csv("DNMT3A_R882C_RNA_seq_LFC_scores.csv", index=False)

Number of predictions: 1188
Mean LFC: -0.001372
Median LFC: -0.000035
Min LFC: -0.024788
Max LFC: 0.011298
Upregulated: 409
Downregulated: 751
Top upregulated:


Unnamed: 0,biosample_name,raw_score
827,mononuclear cell,0.011298
826,hematopoietic multipotent progenitor cell,0.010371
530,HeLa-S3,0.009999
1075,immature natural killer cell,0.007974
809,B cell,0.007563


Top downregulated:


Unnamed: 0,biosample_name,raw_score
668,Purkinje cell,-0.024788
708,GM23248,-0.016414
781,dorsolateral prefrontal cortex,-0.014448
500,hair follicular keratinocyte,-0.014296
510,MCF 10A,-0.013721
