In [None]:
import os
import numpy
import pandas

from aavomics import database
from aavomics import aavomics
import anndata
import scanpy

from plotly import offline as plotly
from plotly import graph_objects
from plotly.subplots import make_subplots

In [None]:
# File name of the AnnData (.h5ad) dataset; it is joined onto database.DATA_PATH when loaded
ANNDATA_FILE_NAME = "aavomics_mouse_cortex_2021.h5ad"

In [None]:
# Read the AnnData object stored under the aavomics data directory
adata = anndata.read_h5ad(
    os.path.join(database.DATA_PATH, ANNDATA_FILE_NAME)
)

In [None]:
# Keep only labeled cells, i.e. those with a non-missing "X_CCN202105070" value
labeled_cell_mask = pandas.notna(adata.obs["X_CCN202105070"])
adata = adata[labeled_cell_mask]

# Keep only genes whose maximum count over the remaining cells is positive;
# the final copy() turns the filtered selection into an independent object
gene_mask = (adata.X.max(axis=0) > 0).toarray().ravel()
adata = adata[:, gene_mask].copy()

In [None]:
# Boolean mask over cells: True where the annotated cell type is "Endothelial Cells"
endothelial_cell_mask = adata.obs["Cell Type"] == "Endothelial Cells"

In [None]:
# Collect the per-cell t-SNE x/y columns into a two-column array (one row per cell)
coordinates = adata.obs[["X_CCN202105070", "Y_CCN202105070"]].to_numpy()

In [None]:
# t-SNE scatter traces: one for the non-endothelial cells, one for the endothelial cells.
# Column 0 of `coordinates` is plotted on x, column 1 on y.
other_cells_trace = graph_objects.Scatter(
    x=coordinates[~endothelial_cell_mask, 0],
    y=coordinates[~endothelial_cell_mask, 1],
    mode="markers",
    marker=dict(size=2),
    name="Other Cells",
)

endothelial_trace = graph_objects.Scatter(
    x=coordinates[endothelial_cell_mask, 0],
    y=coordinates[endothelial_cell_mask, 1],
    mode="markers",
    marker=dict(size=2),
    name="Endothelial Cells",
)

In [None]:
# Styling for the t-SNE figure: fixed size, fully transparent backgrounds,
# bare axes (no grid, zero line or tick labels) and no legend
layout = graph_objects.Layout({
    "height": 800,
    "width": 1100,
    "plot_bgcolor": "rgba(255, 255, 255, 0)",
    "paper_bgcolor": "rgba(255, 255, 255, 0)",
    "yaxis": {"showgrid": False, "zeroline": False, "showticklabels": False},
    "xaxis": {"showgrid": False, "zeroline": False, "showticklabels": False},
    "showlegend": False,
})

# Other cells are listed first, endothelial cells second
figure = graph_objects.Figure(
    data=[other_cells_trace, endothelial_trace],
    layout=layout,
)

In [None]:
# Create the output directory if needed so the export also works on a fresh checkout
os.makedirs("out", exist_ok=True)

# Export the t-SNE figure as a PNG at 2x scale
figure.write_image(os.path.join("out", "endothelial_cell_tsne.png"), scale=2)

In [None]:
# Work on a copy so the raw counts in `adata` stay untouched
target_sum = 10_000
adata_norm = adata.copy()

# Scale every cell to `target_sum` total counts, then apply log1p in place
adata_norm.X = scanpy.pp.normalize_total(
    adata_norm,
    target_sum=target_sum,
    inplace=False,
)["X"]
scanpy.pp.log1p(adata_norm)

In [None]:
# Rank genes for each "Cell Type" group with scanpy's Wilcoxon method on the log-normalized data.
# The results are read back from adata_norm later via scanpy.get.rank_genes_groups_df.
scanpy.tl.rank_genes_groups(adata_norm, "Cell Type", method="wilcoxon")

In [None]:
# Differential-expression table for the "Endothelial Cells" group
de_results = scanpy.get.rank_genes_groups_df(adata_norm, group="Endothelial Cells")

# Attach readable gene names, looked up in adata.var by the identifiers scanpy returned
de_results["Gene Name"] = adata.var.loc[de_results["names"].values, "Gene Name"].values.astype(str)

# Total raw transcript count of every cell, and of the endothelial cells only
is_endothelial_cell = adata.obs["Cell Type"] == "Endothelial Cells"
total_transcript_counts = numpy.asarray(adata.X.sum(axis=1)).ravel()
adata_endothelial = adata[is_endothelial_cell]
total_transcript_counts_endothelial = total_transcript_counts[is_endothelial_cell]

# Mean transcript abundance per gene, in the row order of de_results: each
# endothelial cell's raw counts are divided by that cell's own total, and the
# resulting fractions are averaged over cells (a mean of per-cell fractions,
# not pooled counts divided by pooled totals)
endothelial_counts = adata_endothelial[:, de_results["names"].values.astype(str)].X.toarray()
per_cell_fractions = endothelial_counts / total_transcript_counts_endothelial[:, numpy.newaxis]
mean_expressions = per_cell_fractions.mean(axis=0)

de_results["Mean Expression"] = mean_expressions

In [None]:
import requests

def get_names_of_genes_with_specific_features(feature):
    """Retrieve names of genes that have a certain feature, such as "membrane" localized.

    Sends a UniProt query for reviewed entries matching ``keyword:<feature>``
    and ``taxonomy:mus``, requesting only the "genes" column in tab format, and
    collects every whitespace-separated token found in the data rows.

    Parameters:
        feature (string): The keyword of interest (e.g. "membrane"). It is
            concatenated into the query as-is, so a multi-word keyword must
            already be wrapped in double quotes (e.g. '"Cell membrane"').

    Output:
        genes_of_interest (set): All gene-name tokens found in the response.
            Empty if the request failed or returned no data rows.

    """
    # Define constants
    # NOTE(review): this looks like the legacy UniProt URL and query syntax —
    # confirm the endpoint is still served before relying on the result.
    BASE = 'http://www.uniprot.org'
    KB_ENDPOINT = '/uniprot/'

    # Define query
    payload = {'query': 'keyword:' + feature + ' AND taxonomy:mus AND reviewed:yes',
               'format': 'tab',
               'columns': 'genes'}

    # Pull requests
    result = requests.get(BASE + KB_ENDPOINT, params=payload)

    if not result.ok:
        # Do not parse an error body as if it were a list of gene names
        print('Something went wrong ', result.status_code)
        return set()

    # The first line is the column header ("Gene names"); every following line
    # is one returned entry. Dropping the header by line (rather than by a
    # fixed token count) keeps the first real gene name.
    entry_rows = result.text.splitlines()[1:]

    print('Number of proteins with the keyword %s: ' % feature + str(len(entry_rows)))

    # Collect every whitespace-separated gene name; blank rows contribute nothing
    genes_of_interest = set()

    for row in entry_rows:
        genes_of_interest.update(row.split())

    return genes_of_interest

# Gene names for the keyword "Cell membrane"; the double quotes are part of the query text
membrane_genes = get_names_of_genes_with_specific_features('"Cell membrane"')

In [None]:
# Named groups of gene names that receive group-specific marker styling in the
# abundance-vs-score scatter plot. A gene may be listed in more than one group
# (e.g. "Ly6c1", "Car4", "Igf1r"); the per-gene style dicts are filled in the
# iteration order of this dict, so the last group listing a gene wins.
GENE_GROUPS = {
    "Screen 1": [
        "Ly6c1",
        "Cldn5",
        "Flt1",
        "Ly6a",
        "Slco1a4",
        "Abcb1a",
        "Bsg",
        "Slc2a1",
        "Esam",
        "Adgrl4",
        "Clec2d",
        "Ifitm3",
        "Adgrf5",
        "Abcg2",
        "Eng",
        "Cdh5",
        "Acvrl1",
        "Pecam1",
        "Slco1c1",
        "Slc6a6",
        "Fcgrt",
        "Ocln",
        "Kdr",
        "Tek",
        "Gpm6b",
        "Gpm6a",
        "Slc22a8",
        "Podxl",
        "Igf1r",
        "Slc7a1",
        "Prom1",
        "Clic4",
        "Car4",
        "Clic1",
        "Kitl",
        "Cd34",
        "Ly6e",
        "Serinc3",
        "App",
        "Itm2b",
        "Paqr5",
        "Ifitm2",
      
    ],
    "Screen 2": [
        "Ly6h",
        "Lypd5",
#         "Cd59a",
        "Car9",
        "Car12",
        "Car14",
        "Car15",
        "Ly6c1",
        "Car4",
        "Igf1r"

    ],
    # Every gene in "Hits" also appears in a screen above, so the "Hits" styling overrides it
    "Hits": [
        "Ly6a",
        "Ly6c1",
        "Car4",
        "Igf1r"
    ]
}


In [None]:
# Marker styling per gene group. The "Default" entry is the fallback for genes
# that are not listed in any group.
GROUP_COLORS = {"Screen 1": "red", "Screen 2": "red", "Hits": "red", "Default": "black"}

GROUP_SIZES = {"Screen 1": 10, "Screen 2": 10, "Hits": 10, "Default": 5}

GROUP_SHAPES = {"Screen 1": "circle", "Screen 2": "circle", "Hits": "cross", "Default": "circle"}

GROUP_OPACITIES = {"Screen 1": 0.25, "Screen 2": 1, "Hits": 1, "Default": 0.25}


def _style_by_gene(style_by_group):
    """Map every gene listed in GENE_GROUPS to the style value of its group.

    Groups are visited in GENE_GROUPS order, so a gene listed in several groups
    ends up with the value of the last group that lists it.
    """
    return {
        gene: style_by_group[group]
        for group, genes in GENE_GROUPS.items()
        for gene in genes
    }


# Per-gene lookups derived from the group tables above
gene_colors = _style_by_gene(GROUP_COLORS)
gene_sizes = _style_by_gene(GROUP_SIZES)
gene_shapes = _style_by_gene(GROUP_SHAPES)
gene_opacities = _style_by_gene(GROUP_OPACITIES)

In [None]:
# Genes counted as membrane genes in addition to the UniProt-derived set
CUSTOM_MEMBRANE_GENES = ["Cd34"]

# Genes removed from the membrane selection even when the membrane criterion matches.
# NOTE(review): presumably proteins on the cytoplasmic side of the membrane — confirm.
CYTOPLASMIC_MEMBRANE_GENES = [
    "Ctnnb1",
    "Ndrg1",
    "Arl4a",
    "Gng11",
    "Vim",
    "Sptbn1",
    "Nostrin",
    "Gng5",
    "Gnai2",
    "Rras",
    "Rhoa",
    "Rhoc",
    "Lims2",
    "Gnas",
    "Eef1a1",
    "Hsp90ab1",
    "Slc9a3r2",
    "Rpsa",
    "Glul",
    "Rhob"
]

In [None]:
# Select rows of de_results that are (a) in the membrane set or the custom
# additions, (b) not in the exclusion list, and (c) have a positive score
gene_mask = (
    de_results["Gene Name"].isin(membrane_genes.union(CUSTOM_MEMBRANE_GENES))
    & ~de_results["Gene Name"].isin(CYTOPLASMIC_MEMBRANE_GENES)
    & (de_results["scores"] > 0)
)

In [None]:
de_results_filtered = de_results[gene_mask].copy()

# Per-point marker styling: a gene listed in GENE_GROUPS gets its group's
# style, every other gene gets the "Default" style. Each attribute is looked up
# in its own dict, so the opacity no longer depends on the keys of gene_shapes.
filtered_gene_names = de_results_filtered["Gene Name"]
color = [gene_colors.get(gene, GROUP_COLORS["Default"]) for gene in filtered_gene_names]
sizes = [gene_sizes.get(gene, GROUP_SIZES["Default"]) for gene in filtered_gene_names]
shapes = [gene_shapes.get(gene, GROUP_SHAPES["Default"]) for gene in filtered_gene_names]
opacities = [gene_opacities.get(gene, GROUP_OPACITIES["Default"]) for gene in filtered_gene_names]

# One point per gene: score on x, mean transcript abundance on y, gene name as hover text
scatter_trace = graph_objects.Scatter(
    y=de_results_filtered["Mean Expression"],
    x=de_results_filtered["scores"],
    mode="markers",
    text=de_results_filtered["Gene Name"],
    marker={
        "size": sizes,
        "color": color,
        "symbol": shapes,
        "opacity": opacities,
        # No visible outline around the markers
        "line": {
            "color": "rgba(255, 255, 255, 0)",
            "width": 0
        }
    }
)

traces = [scatter_trace]

# Transparent backgrounds; abundance is shown on a log axis
layout = graph_objects.Layout({
    "height": 500,
    "width": 800,
    "plot_bgcolor": "rgba(255, 255, 255, 0)",
    "paper_bgcolor": "rgba(255, 255, 255, 0)",
    "yaxis": {
        "title": "Mean Transcript Abundance",
        "type": "log"
    },
    "xaxis": {
        "title": "Score"
    }
})

figure = graph_objects.Figure(
    data=traces,
    layout=layout
)

plotly.iplot(figure)

# Create the output directory if needed so the exports also work on a fresh checkout
os.makedirs("out", exist_ok=True)

figure.write_image(os.path.join("out", "endothelial_cell_mean_abundance_vs_scanpy_score_upregulated_only_ly6a_marked.svg"))
figure.write_html(os.path.join("out", "endothelial_cell_mean_abundance_vs_scanpy_score_upregulated_only_ly6a_marked.html"))

In [None]:
# Gene names shown in the violin plot, one subplot row per gene in this order
MARKER_GENE_LIST = ["Cldn5", "Slc2a1", "Pecam1", "Cdh5", "Tek", "Ly6a"]

In [None]:
# Label every cell as "Endothelial Cells" or "Not Endothelial Cells" for the
# violin plot. The string column is built in one step instead of first creating
# a boolean column and then overwriting it with strings through .loc, which
# relies on pandas silently changing the column dtype.
adata_norm.obs["Is Endothelial"] = numpy.where(
    adata_norm.obs["Cell Type"] == "Endothelial Cells",
    "Endothelial Cells",
    "Not Endothelial Cells",
)

In [None]:
# Grid of violins: one row per marker gene, one column per "Is Endothelial" label
figure = make_subplots(
    rows=len(MARKER_GENE_LIST),
    cols=2,
    shared_xaxes=True,
    shared_yaxes=True,
    vertical_spacing=0,
    horizontal_spacing=0.0
)

# Sorted label order fixes which label goes in which subplot column
cell_type_labels = sorted(adata_norm.obs["Is Endothelial"].unique())

for gene_index, gene in enumerate(MARKER_GENE_LIST):

    # Row label in adata_norm.var of the first entry whose "Gene Name" matches
    gene_id = adata_norm.var[adata_norm.var["Gene Name"] == gene].index.values[0]

    # Largest plotted value for this gene across both columns
    # (numpy.inf rather than the numpy.infty alias, which NumPy 2.0 removed)
    gene_max = -numpy.inf

    for cell_type_index, cell_type in enumerate(cell_type_labels):

        # Log-normalized values of this gene for the cells with this label,
        # keeping every 25th cell only
        gene_counts = numpy.array(adata_norm[adata_norm.obs["Is Endothelial"] == cell_type, gene_id].X.todense()).flatten()[::25]

        x_values = [cell_type] * len(gene_counts)
        y_values = gene_counts

        gene_max = max(gene_max, numpy.max(gene_counts))

        trace_color = "red" if cell_type == "Endothelial Cells" else "blue"

        violin_trace = graph_objects.Violin(
            x=x_values,
            y=y_values,
            line={
                "width": 0.75,
                "color": trace_color
            },
            marker={
                "size": 1,
                "color": trace_color
            },
            points=False,
            hoverinfo="none"
        )

        figure.add_trace(violin_trace, row=gene_index + 1, col=cell_type_index + 1)

        figure.update_xaxes(
            {
                "tickangle": -90
            },
            row=gene_index + 1,
            col=cell_type_index + 1
        )

    # Axis maximum: 10% above the largest value (4.4 = 4 * 1.1), rounded up to a multiple of 0.25
    gene_max = numpy.ceil(gene_max * 4.4) / 4
    gene_max = numpy.round(gene_max, 2)

    # Same y-range for every column of this row, with room below zero
    for cell_type_index in range(len(cell_type_labels)):

        figure.update_yaxes(
            {
                "visible": True,
                "range": [-0.2*gene_max, gene_max]
            },
            row=gene_index + 1,
            col=cell_type_index + 1
        )

    # Column 2 additionally gets its axis on the right with two tick values
    figure.update_yaxes(
        {
            "visible": True,
            "side": "right",
            "tickvals": [0, gene_max - 0.25],
            "range": [-0.2*gene_max, gene_max]
        },
        row=gene_index + 1,
        col=2
    )

layout = {
    "plot_bgcolor": "rgba(0, 0, 0, 0)",
    "showlegend": False,
    "violingroupgap": 0,
    # One gene-name annotation per row, positioned in paper coordinates on y
    "annotations": [
        graph_objects.layout.Annotation(
            x=-0.4,
            y=1 - (gene_index/len(MARKER_GENE_LIST) + 0.75/len(MARKER_GENE_LIST)),
            xref="x",
            yref="paper",
            xanchor="right",
            text=gene,
            showarrow=False,
            align="right"
        )
        for gene_index, gene in enumerate(MARKER_GENE_LIST)
    ],
    "height": 1500,
    "width": 600,
    "margin": graph_objects.layout.Margin(
        l=100,
        r=25
    )
}

figure.update_layout(layout)

plotly.iplot(figure)

In [None]:
# Create the output directory if needed so the export also works on a fresh checkout
os.makedirs("out", exist_ok=True)

# Export the marker-gene violin figure as an SVG
figure.write_image(os.path.join("out", "endothelial_cell_marker_violin_plot.svg"))