# AlphaGenome

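Experimenting with AlphaGenome: scoring the predicted RNA-seq effect of a DNMT3A variant across biosamples and comparing immune and neural cell types.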

In [None]:
from alphagenome.models.variant_scorers import GeneMaskLFCScorer
from alphagenome.models.dna_client import OutputType
from alphagenome.data import genome
from alphagenome.models import variant_scorers, dna_client
import numpy as np
import pandas as pd
import requests
import plotly.express as px
import matplotlib.pyplot as plt

In [None]:
import os

# Read the AlphaGenome API key from the environment instead of embedding it in
# the notebook, so the secret is never stored alongside the code.
# NOTE: a key was previously hard-coded in this cell; treat it as exposed and
# rotate it.
alphagenome_api_key = os.environ.get("ALPHAGENOME_API_KEY")
if not alphagenome_api_key:
    raise RuntimeError(
        "Set the ALPHAGENOME_API_KEY environment variable before running this cell."
    )

# Client object used by the following cells to score variants.
model = dna_client.create(alphagenome_api_key)

In [None]:
# Variant under study, labelled DNMT3A R882C: chr2:25,234,373 C>T.
# NOTE(review): confirm that this coordinate/allele pair really encodes R882C
# and not the neighbouring R882H substitution.
variant = genome.Variant(
    chromosome="chr2",
    position=25_234_373,
    reference_bases="C",
    alternate_bases="T",
)

# Alternative variant kept for quick swapping: MAPT P301L (rs63751273)
# variant = genome.Variant(
#     chromosome="chr17", position=44061046, reference_bases="C", alternate_bases="T"
# )

# Interval of the supported 100KB sequence length around the variant.
sequence_length = dna_client.SUPPORTED_SEQUENCE_LENGTHS["SEQUENCE_LENGTH_100KB"]
interval = variant.reference_interval.resize(sequence_length)

# LFC scorer that requests the RNA-seq output.
rna_lfc_scorer = variant_scorers.GeneMaskLFCScorer(
    requested_output=dna_client.OutputType.RNA_SEQ
)

# Score the variant with that single scorer, then flatten the result into a
# dataframe.
rna_scores = model.score_variant(
    interval=interval,
    variant=variant,
    variant_scorers=[rna_lfc_scorer],
)
rna_df = variant_scorers.tidy_scores(rna_scores)

In [None]:
# Keep only the rows whose assay is total RNA-seq. The explicit copy makes the
# filtered frame independent of the unfiltered one, so the columns that later
# cells add to it do not trigger pandas' SettingWithCopyWarning.
rna_df = rna_df.loc[rna_df["Assay title"] == "total RNA-seq"].copy()

In [None]:
# Raw scores, referenced repeatedly below.
lfc = rna_df["raw_score"]

print(f"Number of predictions: {len(rna_df)}")

# Location and range of the raw scores.
print(f"Mean LFC: {lfc.mean():.6f}")
print(f"Median LFC: {lfc.median():.6f}")
print(f"Min LFC: {lfc.min():.6f}")
print(f"Max LFC: {lfc.max():.6f}")

# How many predictions have a positive vs. a negative raw score.
upregulated = lfc.gt(0).sum()
downregulated = lfc.lt(0).sum()
print(f"Upregulated: {upregulated}")
print(f"Downregulated: {downregulated}")

# The five largest and five smallest raw scores, with their biosample.
ranking_columns = ["biosample_name", "raw_score"]
top_up = rna_df.nlargest(5, "raw_score")[ranking_columns]
top_down = rna_df.nsmallest(5, "raw_score")[ranking_columns]

print("Top upregulated:")
display(top_up)
print("Top downregulated:")
display(top_down)

In [None]:
# Derived score columns:
#   quantile_score_pos = (quantile_score + 1) / 2
#     (presumably rescales a [-1, 1] score to [0, 1]; confirm the input range)
#   raw_score_pos      = logistic sigmoid of raw_score, i.e. 1 / (1 + e^-x)
rna_df["quantile_score_pos"] = 0.5 * (rna_df["quantile_score"] + 1)
rna_df["raw_score_pos"] = 1 / (1 + np.exp(-rna_df["raw_score"]))

In [None]:
# Distinct biosample names that have a DNMT3A row.
cell_types = rna_df.loc[
    rna_df["gene_name"].eq("DNMT3A"), "biosample_name"
].drop_duplicates()

# Keep the names containing either spelling: "T cell" or "T-cell".
t_cells = cell_types[cell_types.str.contains(r"T[ -]cell", na=False)]
t_cells

In [None]:
# DNMT3A rows whose biosample is one of the T-cell names in `t_cells`.
t_cell_scores = rna_df[
    rna_df["gene_name"].eq("DNMT3A") & rna_df["biosample_name"].isin(t_cells)
]
t_cell_scores

In [None]:
# Broad category -> keywords looked for inside a biosample name.
# Categories are tested in the order written here and the first match wins, so
# the order of the entries matters.
categories = {
    "stem": ["stem cell", "progenitor", "mesendoderm"],
    # NOTE(review): only the hyphenated "T-cell" spelling is listed, whereas the
    # T-cell filter elsewhere in this notebook also accepts "T cell". Adding
    # "T cell" here as-is would also match any name ending in "...t cell",
    # because matching is a case-insensitive substring test.
    "immune": [
        "T-cell",
        "B cell",
        "natural killer",
        "monocyte",
        "lymphoblast",
        "jurkat",
        "immune",
        "OCI-LY7",
        "GM128",
    ],
    "epithelial_endothelial": [
        "epithelial",
        "endothelial",
        "keratinocyte",
        "melanocyte",
        "myoepithelial",
        "luminal",
    ],
    "muscle_connective": [
        "muscle",
        "myocyte",
        "osteoblast",
        "osteocyte",
        "chondrocyte",
        "fibroblast",
        "myotube",
        "myoblast",
    ],
    "neural": ["neuron", "neural", "astrocyte", "purkinje", "glutamatergic"],
    "organ_specific": [
        "hepatocyte",
        "pancreatic",
        "trophoblast",
        "myometrial",
        "mesangial",
    ],
    "cell_line": [
        "A549",
        "Caco-2",
        "HepG2",
        "HT-29",
        "IMR-90",
        "MCF",
        "HT1080",
        "K562",
        "PC-",
        "HeLa",
        "Calu3",
        "HCT116",
        "SK-",
        "Panc1",
        "BJ",
        "SJSA1",
        "H1",
        "H9",
        "Daoy",
        "RPMI",
        "U-87",
        "BE2C",
        "WTC11",
    ],
}


def map_category(cell):
    """Return the broad category for a biosample name.

    Matching is a case-insensitive substring test against the keyword lists in
    ``categories``; categories are checked in dictionary order and the first
    one with a matching keyword is returned.

    Args:
        cell: Biosample name. Values that are not strings (for example ``None``
            or a NaN from a missing name) are accepted and treated as
            unclassifiable.

    Returns:
        The matching key of ``categories``, or ``"other"`` when no keyword
        matches or ``cell`` is not a string.
    """
    # Missing names reach this function as None/NaN when it is applied to a
    # column; classify them as "other" instead of failing on .lower().
    if not isinstance(cell, str):
        return "other"

    cell_lower = cell.lower()
    for category, keywords in categories.items():
        if any(keyword.lower() in cell_lower for keyword in keywords):
            return category
    return "other"


# Apply mapping
# Label every row with the broad category of its biosample name.
rna_df["biosample_category"] = rna_df["biosample_name"].map(map_category)
rna_df

In [None]:
# Number of rows assigned to each broad biosample category.
rna_df["biosample_category"].value_counts()

In [None]:
# All DNMT3A rows, then the subset categorised as immune or neural.
dnmt3a_df = rna_df[rna_df["gene_name"].eq("DNMT3A")]
dnmt3a_immune_neural_df = dnmt3a_df[
    dnmt3a_df["biosample_category"].isin(["immune", "neural"])
]

In [None]:
# Box plot of the DNMT3A quantile scores, one box per biosample category, with
# every individual point drawn alongside the box.
px.box(
    data_frame=dnmt3a_immune_neural_df,
    x="biosample_category",
    y="quantile_score",
    points="all",
    width=500,
    title="DNMT3A R882C Variant Effects on RNA-seq",
)

In [None]:
# Bar chart of DNMT3A quantile scores for immune and neural biosamples: one bar
# per row, labelled on the x axis with the biosample name and coloured by
# category.
dnmt3a_immune_neural = dnmt3a_df[
    dnmt3a_df["biosample_category"].isin(["immune", "neural"])
]

# Display names for the axes and the legend.
axis_labels = {
    "quantile_score": "Quantile Score",
    "biosample_name": "Cell Type",
    "biosample_category": "Compartment",
}

fig = px.bar(
    data_frame=dnmt3a_immune_neural,
    x="biosample_name",
    y="quantile_score",
    color="biosample_category",
    text="quantile_score",
    labels=axis_labels,
    title="DNMT3A R882C Scores: Immune vs Neural Cell Types (RNA-seq)",
    template="plotly_white",
    width=1100,
    height=500,
)

# Horizontal tick labels, visible legend, and the score text above each bar.
fig.update_layout(showlegend=True, xaxis_tickangle=0)
fig.update_traces(textposition="outside")
fig.show()