# AlphaGenome

Experimenting with AlphaGenome

In [31]:
from alphagenome.models.variant_scorers import GeneMaskLFCScorer
from alphagenome.models.dna_client import OutputType
from alphagenome.data import genome
from alphagenome.models import variant_scorers, dna_client
import numpy as np
import pandas as pd
import requests
import plotly.express as px
import matplotlib.pyplot as plt

In [32]:
alphagenome_api_key = "AIzaSyCir34AonB_Q0y7aHJx1od6Z5SJrf5Z1mg"
model = dna_client.create(alphagenome_api_key)

In [33]:
# Define the DNMT3A R882C variant
# gene_variant_name = "DNMT3A R882C"
# gene = "DNMT3A"
# variant = genome.Variant(
#     chromosome="chr2", position=25_234_373, reference_bases="C", alternate_bases="T"
# )


gene_variant_name = "MAPT P301L (rs63751273) variant"
gene = "MAPT"
variant = genome.Variant(
    chromosome="chr17", position=46010389, reference_bases="C", alternate_bases="T"
)

# Create interval around the variant
sequence_length = dna_client.SUPPORTED_SEQUENCE_LENGTHS["SEQUENCE_LENGTH_100KB"]
interval = variant.reference_interval.resize(sequence_length)

# Create the RNA-seq LFC scorer
rna_lfc_scorer = GeneMaskLFCScorer(requested_output=OutputType.RNA_SEQ)

# Score the variant for RNA-seq
rna_scores = model.score_variant(
    interval=interval, variant=variant, variant_scorers=[rna_lfc_scorer]
)

# Convert to dataframe
rna_df = variant_scorers.tidy_scores(rna_scores)

In [34]:
rna_df = rna_df.loc[rna_df["Assay title"] == "total RNA-seq"]

In [35]:
rna_df = rna_df.loc[rna_df["gene_name"] == gene]

In [36]:
rna_df = rna_df.drop(711)

In [37]:
categories = {
    "cell_line": [
        "Caco-2",
        "HepG2",
        "HT-29",
        "IMR-90",
        "MCF 10A",
        "MCF-7",
        "HT1080",
        "K562",
        "PC-3",
        "A172",
        "A375",
        "A673",
        "Caki2",
        "G401",
        "H4",
        "MG63",
        "SJSA1",
        "Panc1",
        "GM12878",
        "Calu3",
        "HCT116",
        "PC-9",
        "H1",
        "NCI-H460",
        "H9",
        "M059J",
        "Daoy",
        "RPMI7951",
        "LHCN-M2",
        "Karpas-422",
        "SK-MEL-5",
        "SJCRH30",
        "GM23248",
        "H7",
        "OCI-LY7",
        "GM23338",
        "HFFc6",
        "WTC11",
    ],
    "epithelial/endothelial": [
        "esophagus muscularis mucosa",
        "gastroesophageal sphincter",
        "endothelial cell",
        "endodermal cell",
        "tracheal epithelial cell",
        "glomerular endothelial cell",
        "epithelial cell of proximal tubule",
        "mammary epithelial cell",
        "bronchial epithelial cell",
        "airway epithelial cell",
        "kidney epithelial cell",
        "vein endothelial cell",
        "thoracic aorta endothelial cell",
        "placental epithelial cell",
        "renal cortical epithelial cell",
        "endothelial cell of umbilical vein",
        "epithelial cell of umbilical artery",
        "epithelial cell of alveolus of lung",
        "mesothelial cell of epicardium",
        "pulmonary artery endothelial cell",
        "foreskin keratinocyte",
        "dermis blood vessel endothelial cell",
        "dermis lymphatic vessel endothelial cell",
        "lung microvascular endothelial cell",
        "endothelial cell of coronary artery",
        "bladder microvascular endothelial cell",
        "dermis microvascular lymphatic vessel endothelial cell",
        "mammary microvascular endothelial cell",
        "hair follicular keratinocyte",
        "nasal cavity respiratory epithelium epithelial cell of viscerocranial mucosa",
        "colonic mucosa",
        "esophagus mucosa",
        "mucosa of descending colon",
        "mucosa of gallbladder",
        "esophagus squamous epithelium",
        "breast epithelium",
        "melanocyte of skin",
    ],
    "immune": [
        "T-cell",
        "B cell",
        "natural killer cell",
        "CD4-positive, alpha-beta T cell",
        "CD8-positive, alpha-beta T cell",
        "CD4-positive, CD25-positive, alpha-beta regulatory T cell",
        "mononuclear cell",
        "naive thymus-derived CD4-positive, alpha-beta T cell",
        "CD4-positive, alpha-beta memory T cell",
        "T-helper 17 cell",
        "naive thymus-derived CD8-positive, alpha-beta T cell",
        "CD8-positive, alpha-beta memory T cell",
    ],
    "muscle/connective": [
        "suprapubic skin",
        "osteoblast",
        "osteocyte",
        "chondrocyte",
        "myocyte",
        "smooth muscle cell",
        "hair follicle dermal papilla cell",
        "skeletal muscle myoblast",
        "skeletal muscle satellite cell",
        "mesangial cell",
        "cardiac muscle cell",
        "regular cardiac myocyte",
        "myometrial cell",
        "myotube",
        "aortic smooth muscle cell",
        "fibroblast of the aortic adventitia",
        "fibroblast of dermis",
        "fibroblast of lung",
        "fibroblast of villous mesenchyme",
        "smooth muscle cell of the pulmonary artery",
        "smooth muscle cell of the coronary artery",
        "smooth muscle cell of the umbilical artery",
        "smooth muscle cell of bladder",
        "bronchial smooth muscle cell",
        "smooth muscle cell of trachea",
        "uterine smooth muscle cell",
        "articular chondrocyte of knee joint",
        "cardiac ventricle fibroblast",
        "cardiac atrium fibroblast",
        "pericardium fibroblast",
        "placental pericyte",
        "bronchus fibroblast of lung",
        "Right ventricle myocardium inferior",
        "Right ventricle myocardium superior",
        "left ventricle myocardium inferior",
        "left ventricle myocardium superior",
        "skeletal muscle tissue",
        "psoas muscle",
        "gastrocnemius medialis",
        "skin of body",
        "lower leg skin",
        "subcutaneous adipose tissue",
        "subcutaneous preadipocyte",
        "mesenteric fat pad",
        "omental fat pad",
    ],
    "neural": [
        "astrocyte",
        "glutamatergic neuron",
        "neural crest cell",
        "sciatic nerve",
        "tibial nerve",
        "frontal cortex",
        "temporal lobe",
        "parietal lobe",
        "diencephalon",
        "occipital lobe",
        "cerebellum",
        "spinal cord",
        "dorsolateral prefrontal cortex",
        "motor neuron",
    ],
    "organ_specific": [
        "type B pancreatic cell",
        "hepatocyte",
        "camera-type eye",
        "ureter",
        "metanephros",
        "testis",
        "stomach",
        "aorta",
        "heart",
        "ovary",
        "uterus",
        "vagina",
        "posterior vena cava",
        "right lobe of liver",
        "left lobe of liver",
        "body of pancreas",
        "transverse colon",
        "sigmoid colon",
        "Peyer's patch",
        "urinary bladder",
        "pancreas",
        "ascending aorta",
        "thoracic aorta",
        "tongue",
        "placenta",
        "thyroid gland",
        "lung",
        "right cardiac atrium",
        "left cardiac atrium",
        "heart right ventricle",
        "heart left ventricle",
        "cardiac septum",
        "spleen",
        "liver",
        "kidney",
        "left lung",
        "upper lobe of right lung",
        "lower lobe of right lung",
        "prostate gland",
        "adrenal gland",
        "left colon",
        "lower lobe of left lung",
        "right atrium auricular region",
        "upper lobe of left lung",
        "left colon",
        "lower lobe of left lung",
        "right atrium auricular region",
        "upper lobe of left lung",
        "umbilical cord",
    ],
    "stem": [
        "progenitor cell of endocrine pancreas",
        "hematopoietic multipotent progenitor cell",
        "neural progenitor cell",
        "mesenchymal stem cell of Wharton's jelly",
        "mesenchymal stem cell of adipose",
        "mesenchymal stem cell of the bone marrow",
    ],
}

In [38]:
if gene == "DNMT3A":
    rna_df.loc[rna_df["biosample_name"] == "WTC11"]

In [39]:
# Visualize quantile scores for all biosample names, creating separate plots for each category
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import os


# Function to map cell type to broad category
def map_category(cell):
    cell_lower = cell.lower()
    for category, keywords in categories.items():
        for keyword in keywords:
            if keyword.lower() in cell_lower:
                return category
    return "other"


# Apply mapping
rna_df["biosample_category"] = rna_df["biosample_name"].apply(map_category)

# Filter out "other" category and sort by category and quantile_score for better visualization
rna_df_filtered = rna_df[rna_df["biosample_category"] != "other"]

# Get unique categories
unique_categories = rna_df_filtered["biosample_category"].unique()

# Create output directory if it doesn't exist
output_dir = f"alphagenome_{gene}"
os.makedirs(output_dir, exist_ok=True)

# Create separate plots for each category
for category in unique_categories:
    category_data = rna_df_filtered[rna_df_filtered["biosample_category"] == category]
    category_data_sorted = category_data.sort_values("quantile_score", ascending=False)

    # Create bar plot for this category
    fig = px.bar(
        category_data_sorted,
        x="biosample_name",
        y="quantile_score",
        title=f"{gene_variant_name} - {category.replace('_', ' ').title()} Biosample Categories",
        labels={
            "quantile_score": "Quantile Score",
            "biosample_name": "Biosample Name",
        },
        width=max(
            800, len(category_data_sorted) * 30
        ),  # Adjust width based on number of items
        height=500,
        template="plotly_white",
    )

    # Update layout for better readability and consistent y-axis
    fig.update_layout(
        xaxis_tickangle=45,
        xaxis_title="Biosample Name",
        yaxis_title="Quantile Score",
        yaxis=dict(range=[-1, 1]),  # Set consistent y-axis range
    )

    # Export to files
    # Clean both gene_variant and category for safe filename
    clean_gene_variant = gene_variant_name.replace("/", "").replace(" ", "_")
    clean_category = category.replace("/", "_").replace(" ", "_")
    filename_base = f"{clean_gene_variant}_{clean_category}"
    jpg_filename = os.path.join(output_dir, f"{filename_base}.jpg")

    # Save as JPG (static image)
    fig.write_image(jpg_filename, width=fig.layout.width, height=fig.layout.height)

    # Show the plot
    fig.show()

    print(
        f"\n{category.replace('_', ' ').title()} category: {len(category_data_sorted)} biosample categories"
    )
    print("-" * 50)

print(f"\nAll plots exported to '{output_dir}' directory")


Muscle/Connective category: 45 biosample categories
--------------------------------------------------



Immune category: 12 biosample categories
--------------------------------------------------



Epithelial/Endothelial category: 37 biosample categories
--------------------------------------------------



Neural category: 14 biosample categories
--------------------------------------------------



Organ Specific category: 46 biosample categories
--------------------------------------------------



Stem category: 5 biosample categories
--------------------------------------------------



Cell Line category: 38 biosample categories
--------------------------------------------------

All plots exported to 'alphagenome_MAPT' directory
