# AlphaGenome

Experimenting with AlphaGenome

In [2]:
from alphagenome.models.variant_scorers import GeneMaskLFCScorer
from alphagenome.models.dna_client import OutputType
from alphagenome.data import genome
from alphagenome.models import variant_scorers, dna_client
import numpy as np
import pandas as pd
import requests
import plotly.express as px
import matplotlib.pyplot as plt



In [3]:
alphagenome_api_key = "AIzaSyCir34AonB_Q0y7aHJx1od6Z5SJrf5Z1mg"
model = dna_client.create(alphagenome_api_key)

In [4]:
# Define the DNMT3A R882C variant
variant = genome.Variant(
    chromosome="chr2", position=25_234_373, reference_bases="C", alternate_bases="T"
)

# MAPT P301L (rs63751273) variant
# variant = genome.Variant(
#     chromosome="chr17", position=44061046, reference_bases="C", alternate_bases="T"
# )

# Create interval around the variant
sequence_length = dna_client.SUPPORTED_SEQUENCE_LENGTHS["SEQUENCE_LENGTH_100KB"]
interval = variant.reference_interval.resize(sequence_length)

# Create the RNA-seq LFC scorer
rna_lfc_scorer = GeneMaskLFCScorer(requested_output=OutputType.RNA_SEQ)

# Score the variant for RNA-seq
rna_scores = model.score_variant(
    interval=interval, variant=variant, variant_scorers=[rna_lfc_scorer]
)

# Convert to dataframe
rna_df = variant_scorers.tidy_scores(rna_scores)

In [5]:
rna_df = rna_df.loc[rna_df["Assay title"] == "total RNA-seq"]

In [6]:
print(f"Number of predictions: {len(rna_df)}")

# Summary statistics
print(f"Mean LFC: {rna_df['raw_score'].mean():.6f}")
print(f"Median LFC: {rna_df['raw_score'].median():.6f}")
print(f"Min LFC: {rna_df['raw_score'].min():.6f}")
print(f"Max LFC: {rna_df['raw_score'].max():.6f}")

# Count effects
upregulated = (rna_df["raw_score"] > 0).sum()
downregulated = (rna_df["raw_score"] < 0).sum()
print(f"Upregulated: {upregulated}")
print(f"Downregulated: {downregulated}")

# Top effects
top_up = rna_df.nlargest(5, "raw_score")[["biosample_name", "raw_score"]]
top_down = rna_df.nsmallest(5, "raw_score")[["biosample_name", "raw_score"]]

print("Top upregulated:")
display(top_up)
print("Top downregulated:")
display(top_down)

Number of predictions: 594
Mean LFC: -0.001701
Median LFC: -0.000123
Min LFC: -0.014296
Max LFC: 0.011298
Upregulated: 118
Downregulated: 454
Top upregulated:


Unnamed: 0,biosample_name,raw_score
827,mononuclear cell,0.011298
826,hematopoietic multipotent progenitor cell,0.010371
819,natural killer cell,0.007397
948,OCI-LY7,0.00631
828,"naive thymus-derived CD4-positive, alpha-beta ...",0.005348


Top downregulated:


Unnamed: 0,biosample_name,raw_score
500,hair follicular keratinocyte,-0.014296
510,MCF 10A,-0.013721
645,esophagus mucosa,-0.013185
653,esophagus squamous epithelium,-0.01308
661,dorsolateral prefrontal cortex,-0.012895


In [7]:
rna_df["quantile_score_pos"] = (rna_df["quantile_score"] + 1) / 2
rna_df["raw_score_pos"] = 1 / (np.exp(-rna_df["raw_score"]) + 1)

In [8]:
cell_types = rna_df.loc[
    rna_df["gene_name"] == "DNMT3A", "biosample_name"
].drop_duplicates()

t_cells = cell_types[
    cell_types.str.contains("T cell") | cell_types.str.contains("T-cell")
]
t_cells

399                                               T-cell
424                      CD4-positive, alpha-beta T cell
425                      CD8-positive, alpha-beta T cell
429    CD4-positive, CD25-positive, alpha-beta regula...
432    naive thymus-derived CD4-positive, alpha-beta ...
433               CD4-positive, alpha-beta memory T cell
435    naive thymus-derived CD8-positive, alpha-beta ...
436               CD8-positive, alpha-beta memory T cell
Name: biosample_name, dtype: object

In [9]:
t_cell_scores = rna_df.loc[
    (rna_df["gene_name"] == "DNMT3A") & (rna_df["biosample_name"].isin(t_cells))
]
t_cell_scores

Unnamed: 0,variant_id,scored_interval,gene_id,gene_name,gene_type,gene_strand,junction_Start,junction_End,output_type,variant_scorer,...,track_strand,Assay title,ontology_curie,biosample_name,biosample_type,gtex_tissue,raw_score,quantile_score,quantile_score_pos,raw_score_pos
399,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000119772,DNMT3A,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,-,total RNA-seq,CL:0000084,T-cell,primary_cell,,-0.004932,-0.970426,0.014787,0.498767
424,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000119772,DNMT3A,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,-,total RNA-seq,CL:0000624,"CD4-positive, alpha-beta T cell",primary_cell,,-0.005808,-0.982893,0.008553,0.498548
425,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000119772,DNMT3A,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,-,total RNA-seq,CL:0000625,"CD8-positive, alpha-beta T cell",primary_cell,,-0.006186,-0.986078,0.006961,0.498454
429,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000119772,DNMT3A,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,-,total RNA-seq,CL:0000792,"CD4-positive, CD25-positive, alpha-beta regula...",primary_cell,,-0.005623,-0.981678,0.009161,0.498594
432,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000119772,DNMT3A,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,-,total RNA-seq,CL:0000895,"naive thymus-derived CD4-positive, alpha-beta ...",primary_cell,,-0.004492,-0.963724,0.018138,0.498877
433,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000119772,DNMT3A,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,-,total RNA-seq,CL:0000897,"CD4-positive, alpha-beta memory T cell",primary_cell,,-0.004776,-0.965332,0.017334,0.498806
435,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000119772,DNMT3A,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,-,total RNA-seq,CL:0000900,"naive thymus-derived CD8-positive, alpha-beta ...",primary_cell,,-0.004428,-0.955539,0.022231,0.498893
436,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000119772,DNMT3A,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,-,total RNA-seq,CL:0000909,"CD8-positive, alpha-beta memory T cell",primary_cell,,-0.005681,-0.981678,0.009161,0.49858


In [10]:
categories = {
    "stem": ["stem cell", "progenitor", "mesendoderm"],
    "immune": [
        "T-cell",
        "B cell",
        "natural killer",
        "monocyte",
        "lymphoblast",
        "jurkat",
        "immune",
        "OCI-LY7",
        "GM128",
    ],
    "epithelial_endothelial": [
        "epithelial",
        "endothelial",
        "keratinocyte",
        "melanocyte",
        "myoepithelial",
        "luminal",
    ],
    "muscle_connective": [
        "muscle",
        "myocyte",
        "osteoblast",
        "osteocyte",
        "chondrocyte",
        "fibroblast",
        "myotube",
        "myoblast",
    ],
    "neural": ["neuron", "neural", "astrocyte", "purkinje", "glutamatergic"],
    "organ_specific": [
        "hepatocyte",
        "pancreatic",
        "trophoblast",
        "myometrial",
        "mesangial",
    ],
    "cell_line": [
        "A549",
        "Caco-2",
        "HepG2",
        "HT-29",
        "IMR-90",
        "MCF",
        "HT1080",
        "K562",
        "PC-",
        "HeLa",
        "Calu3",
        "HCT116",
        "SK-",
        "Panc1",
        "BJ",
        "SJSA1",
        "H1",
        "H9",
        "Daoy",
        "RPMI",
        "U-87",
        "BE2C",
        "WTC11",
    ],
}


# Function to map cell type to broad category
def map_category(cell):
    cell_lower = cell.lower()
    for category, keywords in categories.items():
        for keyword in keywords:
            if keyword.lower() in cell_lower:
                return category
    return "other"


# Apply mapping
rna_df["biosample_category"] = rna_df["biosample_name"].apply(map_category)
rna_df

Unnamed: 0,variant_id,scored_interval,gene_id,gene_name,gene_type,gene_strand,junction_Start,junction_End,output_type,variant_scorer,...,Assay title,ontology_curie,biosample_name,biosample_type,gtex_tissue,raw_score,quantile_score,quantile_score_pos,raw_score_pos,biosample_category
1,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000115138,POMC,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,total RNA-seq,CL:0000062,osteoblast,primary_cell,,1.430511e-06,0.034594,0.517297,0.500000,muscle_connective
3,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000115138,POMC,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,total RNA-seq,CL:0000084,T-cell,primary_cell,,-4.768372e-06,-0.046111,0.476944,0.499999,immune
4,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000115138,POMC,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,total RNA-seq,CL:0000115,endothelial cell,in_vitro_differentiated_cells,,5.626678e-05,0.160114,0.580057,0.500014,epithelial_endothelial
5,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000115138,POMC,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,total RNA-seq,CL:0000127,astrocyte,primary_cell,,-4.768372e-07,-0.046111,0.476944,0.500000,neural
7,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000115138,POMC,protein_coding,-,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,total RNA-seq,CL:0000137,osteocyte,in_vitro_differentiated_cells,,4.768372e-07,0.023068,0.511534,0.500000,muscle_connective
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1059,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000230452,LINC01381,lncRNA,+,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,total RNA-seq,UBERON:0011907,gastrocnemius medialis,tissue,,1.616478e-04,0.373276,0.686638,0.500040,other
1060,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000230452,LINC01381,lncRNA,+,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,total RNA-seq,UBERON:0015143,mesenteric fat pad,tissue,,1.311302e-04,0.343112,0.671556,0.500033,other
1061,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000230452,LINC01381,lncRNA,+,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,total RNA-seq,UBERON:0036149,suprapubic skin,tissue,,-1.845360e-04,-0.322598,0.338701,0.499954,other
1063,chr2:25234373:C>T,chr2:25168837-25299909:.,ENSG00000230452,LINC01381,lncRNA,+,,,RNA_SEQ,GeneMaskLFCScorer(requested_output=RNA_SEQ),...,total RNA-seq,CL:0000100,motor neuron,in_vitro_differentiated_cells,,1.435280e-04,0.114851,0.557425,0.500036,neural


In [11]:
rna_df["biosample_category"].value_counts()

biosample_category
other                     306
muscle_connective          87
epithelial_endothelial     81
cell_line                  63
stem                       18
immune                     15
neural                     12
organ_specific             12
Name: count, dtype: int64

In [12]:
dnmt3a_df = rna_df.loc[rna_df["gene_name"] == "DNMT3A"]
dnmt3a_immune_neural_df = dnmt3a_df.loc[
    dnmt3a_df["biosample_category"].isin(["immune", "neural"])
]

In [13]:
px.box(
    dnmt3a_immune_neural_df,
    x="biosample_category",
    y="quantile_score",
    title="DNMT3A R882C Variant Effects on RNA-seq",
    width=500,
    points="all",
)

In [None]:
# Bar plot for DNMT3A scores: immune vs neural cell types, with cell type names as labels
dnmt3a_immune_neural = dnmt3a_df.loc[
    dnmt3a_df["biosample_category"].isin(["immune", "neural"])
]
fig = px.bar(
    dnmt3a_immune_neural,
    x="biosample_name",
    y="quantile_score",
    color="biosample_category",
    text="quantile_score",
    title="DNMT3A R882C Scores: Immune vs Neural Cell Types (RNA-seq)",
    labels={
        "quantile_score": "Quantile Score",
        "biosample_name": "Cell Type",
        "biosample_category": "Compartment",
    },
    width=1100,
    height=500,
    template="plotly_white",
)
fig.update_traces(textposition="outside")
fig.update_layout(xaxis_tickangle=0, showlegend=True)
fig.show()