# Perturb-seq RPE1 Essential 

In [None]:
import pandas as pd
import datetime
import goenrich
import matplotlib.pyplot as plt
import numpy as np
import scipy.spatial.distance as dist
import seaborn as sns
import sys
from tqdm import tqdm
import json
import scanpy as sc

from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
output_notebook()

from IPython.display import display, HTML, Markdown
sys.setrecursionlimit(100000)

## Load and Pre-Process Data

In [None]:
# Read the normalized bulk AnnData file and transpose its matrix so that the
# AnnData variables become rows and the observations become columns.
adata = anndata.read_h5ad('rpe1_normalized_bulk_01.h5ad')
rpe1essential_df = adata.to_df().transpose().astype('float32')

# Relabel the rows with the `gene_name` annotation stored in adata.var.
gene_names_by_var = adata.var.gene_name.to_dict()
rpe1essential_df.index = rpe1essential_df.index.map(gene_names_by_var).rename('gene_name')
rpe1essential_df

In [None]:
# Load the NCBI gene_info table and keep only rows with tax id 9606 that are
# annotated as protein-coding.
gene_info = pd.read_csv('../../tables/Homo_sapiens.gene_info.gz', sep='\t', compression='gzip')
is_protein_coding = (gene_info['#tax_id'] == 9606) & (gene_info['type_of_gene'] == 'protein-coding')
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the unfiltered table (avoids SettingWithCopyWarning).
gene_info = gene_info[is_protein_coding].copy()
gene_info['Symbol'] = gene_info['Symbol'].str.upper()
# Upper-cased symbols and a symbol -> GeneID lookup used by later cells.
symbols = set(gene_info['Symbol'])
geneids = gene_info.set_index('Symbol')['GeneID'].to_dict()
gene_info

In [None]:
# Build an upper-cased alias -> official symbol lookup from gene_info.
synonym_table = gene_info[['Symbol', 'Synonyms']].copy()
# One row per alias: the Synonyms column is '|'-delimited.
synonym_table['Synonyms'] = synonym_table['Synonyms'].str.upper().str.split('|')
synonym_table = synonym_table.explode('Synonyms')
# '-' is the placeholder used when a gene has no aliases.
synonym_table = synonym_table[synonym_table['Synonyms'] != '-']
# When several genes share an alias, the gene appearing last in the table wins.
synonyms = synonym_table.set_index('Synonyms')['Symbol'].to_dict()
# Official symbols must always resolve to themselves. Without this, a symbol
# that is also listed as an alias of a later gene would be silently remapped
# to that other gene.
synonyms.update({symbol: symbol for symbol in gene_info['Symbol']})

In [None]:
def thresh(n):
    """Collapse a value to its sign: 1 when positive, -1 when negative.

    A value that is neither greater nor less than zero (for example 0 or
    NaN) is handed back unchanged.
    """
    if n < 0:
        return -1
    if n > 0:
        return 1
    return n

In [None]:
# Replace +inf entries using the per-row mean of the non-inf values.
# NOTE(review): the replacement value is a Series indexed by gene; presumably
# pandas applies it per column of the transposed frame — confirm. -inf values
# are left untouched.
df = rpe1essential_df.T.replace(float('inf'), rpe1essential_df[rpe1essential_df!=float('inf')].mean(axis=1)).T
# Upper-case the row labels, then translate them through `synonyms`; labels
# with no entry in the mapping become NaN.
df.index = df.index.map(str.upper).map(synonyms)
# Drop the unmapped rows with a boolean mask. Label-based selection
# (df.loc[df.index.dropna()]) returns every matching row once per occurrence
# of the label, so k rows sharing a symbol would be emitted k*k times.
df = df[df.index.notna()]
# Average rows that map to the same symbol (the `axis` argument of groupby is
# deprecated; grouping rows is the default).
df = df.groupby(level=0).mean()
df = df.rename_axis('Gene', axis=0).rename_axis('Gene Perturbation', axis=1)
df

In [None]:
# Keep only cells with |value| >= 1 and reshape them to a long edge table.
# The mask turns every other cell into NaN; .dropna() removes them explicitly
# because whether stack() discards NaN depends on the pandas version.
edges = df[df.abs()>=1].stack().dropna().reset_index()
edges.columns = ['Gene', 'Gene Perturbation ID', 'Z-Score']
# The perturbation name is the ID with its last '_'-separated field removed.
edges['Gene Perturbation'] = edges['Gene Perturbation ID'].map(lambda x: '_'.join(x.split('_')[:-1]).upper())
# Threshold is the sign of the score (+1 / -1).
edges['Threshold'] = edges['Z-Score'].map(thresh)
edges = edges[['Gene', 'Gene Perturbation', 'Gene Perturbation ID', 'Z-Score', 'Threshold']]
edges

## Process Data for SQL Ingestion

### Dataset

In [None]:
# Value tuple for the dataset record; the comment line below lists the field
# names in the same order as the tuple's values.
#(id, name, name_without_resource, description, association, gene_set_description, gene_sets_description, attribute_set_description, positive_association, negative_association, is_signed, is_continuous_valued, last_updated, directory, num_page_views, resource_fk, measurement_fk, dataset_group_fk, attribute_type_fk, attribute_group_fk, evidence_type, evidence_group, measurement_bias, attribute_type_plural)
(184, 'Replogle et al., Cell, 2022 RPE1 Essential Perturb-seq Gene Perturbation Signatures', 'RPE1 Essential Perturb-seq Gene Perturbation Signatures', 'Gene expression profiles in the RPE1 cell line following CRISPRi genetic perturbation of essential genes', 'gene-gene perturbation associations by differential expression of gene A following perturbation of gene B', 'genes differentially expressed following the {0} gene perturbation from the Replogle et al., Cell, 2022 RPE1 Essential Perturb-seq Gene Perturbation Signatures dataset.', 'sets of genes differentially expressed following gene perturbation from the Replogle et al., Cell, 2022 RPE1 Essential Perturb-seq Gene Perturbation Signatures dataset.', 'gene perturbations changing expression of {0} gene from the Replogle et al., Cell, 2022 RPE1 Essential Perturb-seq Gene Perturbation Signatures dataset.', 'increased expression', 'decreased expression', 1, 1, '2025-05-07', 'reploglerpe1essential', 0, 'resource', 16, 7, 27, 5, 'gene expression by RNA-seq', 'primary experimental data', 'high throughput, data driven', 'gene perturbations')

### Attribute

In [None]:
# Load the existing attribute table with lower-cased names, and build a
# name -> id lookup from it.
attributes = pd.read_csv('../../tables/attribute.tsv', sep='\t').assign(
    name_from_naming_authority=lambda table: table['name_from_naming_authority'].str.lower()
)
attributefks = attributes.set_index('name_from_naming_authority')['id'].to_dict()

In [None]:
# Load the 'TabC_RPE1_day7_library' sheet of mmc1.xlsx, indexed by its
# 'unique sgRNA pair ID' column.
# NOTE(review): pertmeta is not referenced again in the visible cells —
# confirm whether it is still needed.
pertmeta = pd.read_excel('mmc1.xlsx', sheet_name='TabC_RPE1_day7_library', index_col='unique sgRNA pair ID')
pertmeta

In [None]:
#(id, name_from_naming_authority, id_from_naming_authority, url_from_naming_authority, naming_authority_fk)
# Print one value tuple for every perturbation that is not yet present in
# attributefks, registering each new id in attributefks as it is assigned.
index = 510085

for pertid in edges['Gene Perturbation ID'].unique():
    pert = '_'.join(pertid.split('_')[:-1]).upper()
    lookup_key = pert.lower()
    if lookup_key in attributefks:
        continue
    target = " ".join(pert.split("_")[1:])
    row = (index, pert, pertid, f'genetic perturbation targeting {target}', 113)
    print(row, end=',\n')
    attributefks[lookup_key] = index
    index += 1

### Gene Set

In [None]:
#(id, name_from_dataset, id_from_dataset, url_from_dataset, dataset_fk, attribute_type, attribute_fk)
# Print one value tuple per perturbation ID and remember the id assigned to
# each perturbation name in genesetfks.
index = 138500000
genesetfks = {}

for pertid in edges['Gene Perturbation ID'].unique():
    pert = '_'.join(pertid.split('_')[:-1]).upper()
    target = " ".join(pert.split("_")[1:])
    row = (index, pert, pertid, f'genetic perturbation targeting {target}', 184, 27, attributefks[pert.lower()])
    print(row, end=',\n')
    genesetfks[pert] = index
    index += 1

### Association

In [None]:
# Load the existing gene table with upper-cased symbols and derive lookups.
genes = pd.read_csv('../../tables/gene.csv').assign(
    symbol=lambda table: table['symbol'].str.upper()
)
genelist = set(genes['symbol'])
genefks = genes.set_index('symbol')['id'].to_dict()

# GeneID and description Series keyed by symbol, with repeated values dropped.
info_by_symbol = gene_info.set_index('Symbol')
geneids = info_by_symbol['GeneID'].drop_duplicates()
genedescs = info_by_symbol['description'].drop_duplicates()

In [None]:
# First id to assign to the association rows written below.
index = 79000000

# Swap gene symbols and perturbation names for their foreign keys.
associations = edges.copy()
associations['Gene'] = [genefks[symbol.upper()] for symbol in associations['Gene']]
associations['Gene Perturbation'] = [genesetfks[name] for name in associations['Gene Perturbation']]

# Keep the four output columns under their table names, strongest positive
# scores first.
associations = (
    associations[['Gene', 'Gene Perturbation', 'Z-Score', 'Threshold']]
    .rename(columns={
        'Gene': 'gene_fk',
        'Gene Perturbation': 'gene_set_fk',
        'Z-Score': 'standardized_value',
        'Threshold': 'threshold_value',
    })
    .sort_values('standardized_value', ascending=False)
    .reset_index(drop=True)
)
# Shift the positional index so that it starts at `index`, and name it 'id'.
associations.index = associations.index + index
associations = associations.rename_axis('id')
associations.to_csv('../../harmonizome-update/reploglerpe1essential.csv')
associations

## Prepare Downloads

In [None]:
import os

# Root directory for all download files written by the cells below.
output_path = 'rpe1essentialdownloads'
# The writers below open files under output_path, under kg_serializations/
# and under kg_serializations/reploglerpe1essential_tsv/. Create the whole
# tree up front so they do not fail on a missing directory.
os.makedirs(f'{output_path}/kg_serializations/reploglerpe1essential_tsv', exist_ok=True)

### Ternary Matrix

In [None]:
# Gene x perturbation matrix of threshold values (+1 / -1), with 0 wherever a
# pair has no edge. The aggregation is given as the string 'max' instead of
# the builtin max: pandas warns that builtin callables will stop being
# translated to the equivalent groupby method in a future version.
ternarymatrix = pd.crosstab(edges['Gene'], edges['Gene Perturbation'], edges['Threshold'], aggfunc='max').fillna(0)
ternarymatrixT = ternarymatrix.T
ternarymatrix.to_csv(f'{output_path}/gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
ternarymatrix

### Gene Attribute Edge List

In [None]:
# Long-form edge list for download: the edge table plus a 'Gene ID' column,
# with the score columns renamed.
geneids = gene_info.set_index('Symbol')['GeneID'].to_dict()
edgelist = edges.copy()
edgelist['Gene ID'] = [geneids[symbol] for symbol in edgelist['Gene']]
edgelist = edgelist[
    ['Gene', 'Gene ID', 'Gene Perturbation', 'Gene Perturbation ID', 'Z-Score', 'Threshold']
].rename(columns={'Z-Score': 'Standardized Value', 'Threshold': 'Threshold Value'})
edgelist.to_csv(f'{output_path}/gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# Unique (Gene, Gene ID) pairs appearing in the edge list, renumbered from 0.
gene_columns = ['Gene', 'Gene ID']
geneslist = edgelist.loc[:, gene_columns].drop_duplicates()
geneslist = geneslist.reset_index(drop=True)
geneslist.to_csv(f'{output_path}/gene_list_terms.txt.gz', sep='\t', compression='gzip')
geneslist

### Attribute List

In [None]:
# Unique (perturbation name, perturbation ID) pairs appearing in the edge
# list, renumbered from 0.
attribute_columns = ['Gene Perturbation', 'Gene Perturbation ID']
attributeslist = edgelist.loc[:, attribute_columns].drop_duplicates()
attributeslist = attributeslist.reset_index(drop=True)
attributeslist.to_csv(f'{output_path}/attribute_list_entries.txt.gz', sep='\t', compression='gzip')
attributeslist

### Up Gene Set Library

In [None]:
# Write one GMT line per perturbation: term, empty description, then every
# gene whose ternary value is +1. Sets with fewer than 5 genes are skipped.
# Local names are chosen so the cell does not rebind the `attributes` table
# loaded earlier, and each membership mask is computed once.
with open(f'{output_path}/gene_set_library_up_crisp.gmt', 'w') as f:
    arr = ternarymatrix.to_numpy(dtype=np.int_)

    for i, term in enumerate(tqdm(ternarymatrix.columns)):
        members = ternarymatrix.index[arr[:, i] == 1]
        if len(members) >= 5:
            print(term, '', *members, sep='\t', end='\n', file=f)

### Down Gene Set Library

In [None]:
# Write one GMT line per perturbation: term, empty description, then every
# gene whose ternary value is -1. Sets with fewer than 5 genes are skipped.
# Local names are chosen so the cell does not rebind the `attributes` table
# loaded earlier, and each membership mask is computed once.
with open(f'{output_path}/gene_set_library_dn_crisp.gmt', 'w') as f:
    arr = ternarymatrix.to_numpy(dtype=np.int_)

    for i, term in enumerate(tqdm(ternarymatrix.columns)):
        members = ternarymatrix.index[arr[:, i] == -1]
        if len(members) >= 5:
            print(term, '', *members, sep='\t', end='\n', file=f)

### Up Attribute Set Library

In [None]:
# Write one GMT line per gene: term, empty description, then every
# perturbation whose ternary value for that gene is +1. Sets with fewer than
# 5 perturbations are skipped. Local names are chosen so the cell does not
# rebind the `genes` table loaded earlier, and each mask is computed once.
with open(f'{output_path}/attribute_set_library_up_crisp.gmt', 'w') as f:
    arr = ternarymatrixT.to_numpy(dtype=np.int_)

    for i, term in enumerate(tqdm(ternarymatrixT.columns)):
        members = ternarymatrixT.index[arr[:, i] == 1]
        if len(members) >= 5:
            print(term, '', *members, sep='\t', end='\n', file=f)

### Down Attribute Set Library

In [None]:
# Write one GMT line per gene: term, empty description, then every
# perturbation whose ternary value for that gene is -1. Sets with fewer than
# 5 perturbations are skipped. Local names are chosen so the cell does not
# rebind the `genes` table loaded earlier, and each mask is computed once.
with open(f'{output_path}/attribute_set_library_dn_crisp.gmt', 'w') as f:
    arr = ternarymatrixT.to_numpy(dtype=np.int_)

    for i, term in enumerate(tqdm(ternarymatrixT.columns)):
        members = ternarymatrixT.index[arr[:, i] == -1]
        if len(members) >= 5:
            print(term, '', *members, sep='\t', end='\n', file=f)

### Gene Similarity Matrix

In [None]:
# Pairwise cosine similarity between the rows (genes) of the ternary matrix:
# pdist yields condensed cosine distances, squareform expands them to a full
# matrix, and 1 - distance converts distance to similarity.
gene_vectors = ternarymatrix.to_numpy(dtype=np.int_)
gene_similarity_matrix = pd.DataFrame(
    data=1 - dist.squareform(dist.pdist(gene_vectors, 'cosine')),
    index=ternarymatrix.index,
    columns=ternarymatrix.index,
)
# Clear the axis names before writing.
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None
gene_similarity_matrix.to_csv(f'{output_path}/gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
# Pairwise cosine similarity between the rows (perturbations) of the
# transposed ternary matrix: condensed cosine distances are expanded to a
# full matrix and converted to similarity as 1 - distance.
attribute_vectors = ternarymatrixT.to_numpy(dtype=np.int_)
attribute_similarity_matrix = pd.DataFrame(
    data=1 - dist.squareform(dist.pdist(attribute_vectors, 'cosine')),
    index=ternarymatrixT.index,
    columns=ternarymatrixT.index,
)
# Clear the axis names before writing.
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None
attribute_similarity_matrix.to_csv(f'{output_path}/attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Gene Attribute Standardized Matrix

In [None]:
# Restrict the continuous matrix to the genes and perturbations that have at
# least one edge, then relabel each column by dropping the last
# '_'-separated field of the perturbation ID and upper-casing the rest.
# Requires `edges` to still be the edge DataFrame.
genes_with_edges = edges['Gene'].unique()
perts_with_edges = edges['Gene Perturbation ID'].unique()
standardizedmatrix = df.loc[genes_with_edges, perts_with_edges]
standardizedmatrix.columns = standardizedmatrix.columns.map(
    lambda pert_id: '_'.join(pert_id.split('_')[:-1]).upper()
)
standardizedmatrix.to_csv(f'{output_path}/gene_attribute_matrix_standardized.txt.gz', sep='\t', compression='gzip')
standardizedmatrix

### Knowledge Graph Serialization

In [None]:
# Build the knowledge-graph nodes (genes and gene perturbations) and edges
# from the download tables.
# NOTE: `edges` is rebound here from the edge DataFrame to a list of edge
# dicts. The serialization cells that follow read this list, so the name is
# kept; cells that need the DataFrame must be re-run from the cell that
# creates it.
nodes = {}
edges = []

# Iterating over plain row dicts avoids a label lookup (.loc) for every row.
for gene in geneslist.to_dict('records'):
    gene_id = int(gene['Gene ID'])
    nodes[gene_id] = {
        "type":"gene",
        "properties": {
            "id":gene_id,
            "label":gene['Gene']
        }}

for pert in attributeslist.to_dict('records'):
    nodes[pert['Gene Perturbation ID']] = {
        "type":"gene perturbation",
        "properties": {
            "label":pert['Gene Perturbation'],
            "id":pert['Gene Perturbation ID']
        }}

for edge in edgelist.to_dict('records'):
    # A threshold of exactly 1 is an "increases" edge; every other value is
    # recorded as a "decreases" edge with threshold -1.
    if edge['Threshold Value'] == 1:
        relation, threshold = "increases expression of", 1
    else:
        relation, threshold = "decreases expression of", -1
    pert_id = edge['Gene Perturbation ID']
    gene_id = int(edge['Gene ID'])
    edges.append({
        "source": pert_id,
        "relation": relation,
        "target": gene_id,
        "properties":{
            "id": pert_id+":"+str(edge['Gene ID']),
            "source_id": pert_id,
            "source_label":edge['Gene Perturbation'],
            "target_label":edge['Gene'],
            "target_id": gene_id,
            "directed":True,
            "standardized": float(edge['Standardized Value']),
            "threshold":threshold
        }})

#### RDF

In [None]:
# Serialize the edge list as prefixed triples: <perturbation id> RO:term gene:id .
with open(output_path+'/kg_serializations/reploglerpe1essential.rdf', 'w') as f:
    print('@prefix gene: <https://ncbi.nlm.nih.gov/gene/> .', file=f)
    # OBO term IRIs have the form http://purl.obolibrary.org/obo/RO_0003003;
    # the prefix must include the '/obo/' path segment to expand correctly.
    print('@prefix RO: <http://purl.obolibrary.org/obo/RO_> .', file=f)

    print('', file=f)
    for edge in edges:
        props = edge['properties']
        # RO:0003003 is written for threshold 1 edges and RO:0003002 for all
        # others (presumably the RO terms for increasing / decreasing
        # expression — confirm against the Relation Ontology).
        predicate = 'RO:0003003' if props['threshold'] == 1 else 'RO:0003002'
        print(f"<{props['source_id']}>", predicate, 'gene:'+str(props['target_id']), end=' .\n', file=f)

#### JSON

In [None]:
# Write the nodes and edges as a single JSON document. json.dump returns
# None, so its result is not bound to a name.
with open(output_path+'/kg_serializations/reploglerpe1essential.json', 'w') as f:
    json.dump(
        {
            "Version":"1",
            "nodes": nodes,
            "edges": edges
        }, indent=4, fp=f)

#### TSV

In [None]:
# Flatten the node dictionary into a (namespace, id, label) table, one row
# per node, and write it as TSV.
nodeframe = pd.DataFrame(nodes).transpose()
nodeframe['id'] = [props['id'] for props in nodeframe['properties']]
nodeframe['label'] = [props['label'] for props in nodeframe['properties']]
# Naming authority recorded for each node type.
namespace = {'gene':'NCBI Entrez', 'gene perturbation':'Replogle et al., Cell, 2022'}
nodeframe['namespace'] = [namespace[node_type] for node_type in nodeframe['type']]
nodeframe = nodeframe[['namespace', 'id', 'label']].reset_index(drop=True)
nodeframe.to_csv(output_path+'/kg_serializations/reploglerpe1essential_tsv/nodes.tsv', sep='\t')
nodeframe

In [None]:
# Flatten the edge dicts into a (source, relation, target, standardized,
# threshold) table and write it as TSV.
edgeframe = pd.DataFrame(edges)
edgeframe['standardized'] = [props['standardized'] for props in edgeframe['properties']]
edgeframe['threshold'] = [props['threshold'] for props in edgeframe['properties']]
edgeframe = edgeframe[['source', 'relation', 'target', 'standardized', 'threshold']]
edgeframe.to_csv(output_path+'/kg_serializations/reploglerpe1essential_tsv/edges.tsv', sep='\t')
edgeframe

## Prepare Visualizations

### Gene-Attribute Clustermap

In [None]:
# Clustered heatmap of the ternary gene x perturbation matrix, with a
# diverging colormap centered on 0 and tick labels switched off.
sns.clustermap(ternarymatrix, cmap='seismic', center=0, figsize=(10,10), xticklabels=False, yticklabels=False)

### Gene-Gene Similarity Clustermap

In [None]:
# Clustered heatmap of the gene-gene cosine similarity matrix, with a
# diverging colormap centered on 0.
sns.clustermap(gene_similarity_matrix, cmap='seismic', center=0)

### Attribute-Attribute Similarity Clustermap

In [None]:
# Clustered heatmap of the perturbation-perturbation cosine similarity
# matrix, with a diverging colormap centered on 0 and tick labels off.
sns.clustermap(attribute_similarity_matrix, cmap='seismic', center=0, xticklabels=False, yticklabels=False)

### UMAP

In [None]:
def load_gmt(file):
    """Parse GMT lines into an ordered mapping of term -> gene string.

    Each line is tab-separated: term, description (ignored), then genes.
    '_up' is appended to the term unless the term contains 'consensus'.

    Args:
        file: iterable of text lines (e.g. an open file).

    Returns:
        OrderedDict mapping each term to its unique genes joined by single
        spaces. Genes are sorted so the output is identical between runs
        (plain set iteration order varies with string hashing).
    """
    gmt = OrderedDict()
    for line in file:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        term, _description, *geneset = line.split('\t')
        if 'consensus' not in term:
            term = f'{term}_up'
        gmt[term] = ' '.join(sorted(set(geneset)))
    return gmt
# Parse the up-regulated gene set library; the context manager closes the
# file handle once parsing is done.
with open(f'{output_path}/gene_set_library_up_crisp.gmt', 'r') as gmt_file:
    libdict = load_gmt(gmt_file)
def load_gmt(file):
    """Parse GMT lines into an ordered mapping of '<term>_down' -> gene string.

    Each line is tab-separated: term, description (ignored), then genes.

    Args:
        file: iterable of text lines (e.g. an open file).

    Returns:
        OrderedDict mapping each suffixed term to its unique genes joined by
        single spaces. Genes are sorted so the output is identical between
        runs (plain set iteration order varies with string hashing).
    """
    gmt = OrderedDict()
    for line in file:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        term, _description, *geneset = line.split('\t')
        gmt[f'{term}_down'] = ' '.join(sorted(set(geneset)))
    return gmt
# Parse the down-regulated gene set library (the context manager closes the
# file handle) and merge it into the combined library.
with open(f'{output_path}/gene_set_library_dn_crisp.gmt', 'r') as gmt_file:
    downlibdict = load_gmt(gmt_file)
libdict.update(downlibdict)
# Directory that the UMAP HTML page is written to.
scatterdir = 'rpe1essentialimages'

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Embed gene sets in 2-D with UMAP and label them with Leiden clusters.

    Args:
        libdict: mapping of term name -> gene string; each value is treated
            as one document by the TF-IDF vectorizer.
        nneighbors: `n_neighbors` passed to `sc.pp.neighbors`.
        mindist: `min_dist` passed to `sc.tl.umap`.
        spread: `spread` passed to `sc.tl.umap`.
        maxdf: `max_df` passed to `TfidfVectorizer`.
        mindf: `min_df` passed to `TfidfVectorizer`.

    Returns:
        DataFrame with columns 'x', 'y' (UMAP coordinates), 'cluster',
        'term' and 'genes', with rows sorted by the Leiden label.
    """
    print("\tTF-IDF vectorizing gene set data...")
    vec = TfidfVectorizer(max_df=maxdf, min_df=mindf)
    X = vec.fit_transform(libdict.values())
    print(X.shape)
    adata = anndata.AnnData(X)
    adata.obs.index = libdict.keys()

    print("\tPerforming Leiden clustering...")
    ### the n_neighbors and min_dist parameters can be altered
    sc.pp.neighbors(adata, n_neighbors=nneighbors, use_rep='X')
    sc.tl.leiden(adata, resolution=1.0)
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    new_order = adata.obs.sort_values(by='leiden').index.tolist()
    # Subsetting an AnnData yields a view; copy it explicitly so that the
    # .obs assignment below modifies a real object instead of relying on the
    # implicit view-to-actual conversion.
    adata = adata[new_order, :].copy()
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    df = pd.DataFrame(adata.obsm['X_umap'])
    df.columns = ['x', 'y']

    df['cluster'] = adata.obs['leiden'].values
    df['term'] = adata.obs.index
    df['genes'] = [libdict[l] for l in df['term']]

    return df

In [None]:
def get_scatter_colors(df):
    """Map each distinct value of df['cluster'] to a Category20 color.

    Clusters are taken in order of first appearance. The palette is
    reordered to its even-indexed entries followed by its odd-indexed
    entries, and is reused cyclically after 20 clusters.
    """
    base = list(Category20[20])
    palette = base[::2] + base[1::2]
    cluster_names = pd.unique(df['cluster']).tolist()
    return {name: palette[position % 20] for position, name in enumerate(cluster_names)}

def get_scatterplot(scatterdf):
    """Build the Bokeh scatter figure for the gene set UMAP.

    Args:
        scatterdf: DataFrame with 'x', 'y', 'term' and 'cluster' columns.
            It is copied, so the caller's frame is not modified.

    Returns:
        The Bokeh figure, with one point per row colored by cluster and a
        hover tooltip showing the gene set, its coordinates and its cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Every <div> opened in the tooltip markup is closed again.
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000, 
        height=700, 
        tools=tools_emb
    )

    # Column names here are the ones referenced by the tooltip (@gene_set,
    # @x, @y, @cluster) and by the scatter glyph ('x', 'y', 'colors').
    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # hide axis tick marks and tick labels
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt' 

    plot_emb.output_backend = "svg"    
    
    plot_emb.title = 'Gene Sets in the Replogle et al., Cell, 2022 RPE1 Essential Perturb-seq Gene Perturbation Signatures Library'
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'
    
    plot_emb.scatter(
        'x', 
        'y', 
        size = 4, 
        source = source, 
        color = 'colors'
    )
    
    return plot_emb

In [None]:
## defaults: nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1
# Compute the embedding with the settings chosen for this dataset; spread is
# left at its default.
scatter_df = process_scatterplot(
    libdict,
    nneighbors=35,
    mindist=0.1,
    # spread=1.5,
    maxdf=0.5,
    mindf=2,
)

# Display Scatter Plot
plot = get_scatterplot(scatter_df)
output_notebook()
show(plot)

In [None]:
# Direct Bokeh's file output to umap.html inside `scatterdir`, then save
# `plot` there.
output_file(filename=f"{scatterdir}/umap.html", title = 'Gene Sets in Replogle et al., Cell, 2022 RPE1 Essential Perturb-seq Gene Perturbation Signatures Library')
save(plot)