# LINCS L1000 CRISPR KO Consensus Signatures
This notebook contains the script used to process the LINCS L1000 CMap CRISPR KO Consensus Signatures dataset for Harmonizome. The CRISPR Knockout mean expression matrix was downloaded from [SigCom LINCS](https://maayanlab.cloud/sigcom-lincs/#/Download). Genes and KO Genes were mapped to approved and up-to-date gene symbols, and up to 250 of the highest and lowest scored genes were kept for every KO gene.

In all, this processed dataset contains 2,517,262 associations between 9,551 genes and 5,049 gene perturbations.

In [None]:
import pandas as pd
import datetime
import numpy as np
import scipy.spatial.distance as dist
import seaborn as sns
import sys
import json
import scanpy as sc
from tqdm import tqdm

# UMAP
from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
output_notebook()

from IPython.display import display, HTML, Markdown
sys.setrecursionlimit(100000)

## Load and Pre-Process Data

In [None]:
# Load the CRISPR KO mean-coefficient matrix; the first TSV column becomes the index.
L1000CRISPR = pd.read_csv(
    'newdata/L1000CRISPR/xpr_mean_coeff_mat.tsv',
    sep='\t',
    index_col=0,
)
# Discard rows whose index label is missing (NaN).
L1000CRISPR = L1000CRISPR[L1000CRISPR.index.notna()]
L1000CRISPR

### Remove Unmapped Gene KOs

In [None]:
# Rows labelled with a 'BRDN...' identifier are the unmapped gene KOs; keep everything else.
is_mapped_ko = np.array(
    [not label.startswith('BRDN') for label in L1000CRISPR.index],
    dtype=bool,
)
L1000CRISPR = L1000CRISPR[is_mapped_ko]
L1000CRISPR

In [None]:
# Lookup dictionaries used by the cells below.

# Header-less mapping file: column 1 -> column 2.
genemapping = (
    pd.read_csv('mapping/mappingFile_2023.tsv', sep='\t', header=None)
    .set_index(1)[2]
    .to_dict()
)

# Approved symbol -> Entrez Gene ID.
geneids = (
    pd.read_csv('mapping/GeneSymbolsAndIDS_2023.tsv', sep='\t')
    .set_index('Human, Mouse, and Rat Approved Symbol')['Entrez Gene ID(supplied by NCBI)']
    .to_dict()
)

# Symbol -> description.
geneinfo = (
    pd.read_csv('tables/gene_info', sep='\t')
    .set_index('Symbol')['description']
    .to_dict()
)

In [None]:
# Keep only rows whose label has an entry in genemapping, then relabel each
# surviving row with its mapped symbol.
has_mapping = L1000CRISPR.index.map(genemapping.__contains__)
L1000CRISPR = L1000CRISPR[has_mapping]
L1000CRISPR.index = L1000CRISPR.index.map(genemapping.__getitem__)
L1000CRISPR

In [None]:
# Transpose so the former columns become rows, then apply the same
# filter-and-relabel step to them.
L1000CRISPR = L1000CRISPR.T
has_mapping = L1000CRISPR.index.map(genemapping.__contains__)
L1000CRISPR = L1000CRISPR[has_mapping]
L1000CRISPR.index = L1000CRISPR.index.map(genemapping.__getitem__)

# Order the columns alphabetically and name both axes: rows are 'Gene', columns are 'Gene KO'.
L1000CRISPR = (
    L1000CRISPR.sort_index(axis=1)
    .rename_axis('Gene', axis=0)
    .rename_axis('Gene KO', axis=1)
)
L1000CRISPR

In [None]:
# Reshape the wide Gene x Gene KO matrix into long form: one row per
# (Gene, Gene KO) pair, with the matrix value in a column named 0.
L1000CRISPR = L1000CRISPR.stack().reset_index()
L1000CRISPR

In [None]:
# For each Gene KO, keep at most the 250 genes with the largest positive values.
L1000CRISPRUp = (
    L1000CRISPR.loc[L1000CRISPR[0] > 0]
    .groupby('Gene KO')
    .apply(lambda ko_rows: ko_rows.sort_values(0, ascending=False).head(250))
    .reset_index(drop=True)
)
L1000CRISPRUp

In [None]:
# For each Gene KO, keep at most the 250 genes with the most negative values
# (ascending sort, so the lowest values come first).
L1000CRISPRDown = (
    L1000CRISPR.loc[L1000CRISPR[0] < 0]
    .groupby('Gene KO')
    .apply(lambda ko_rows: ko_rows.sort_values(0).head(250))
    .reset_index(drop=True)
)
L1000CRISPRDown

In [None]:
# Stack the up rows on top of the down rows and renumber the index from 0.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
L1000CRISPR = pd.concat([L1000CRISPRUp, L1000CRISPRDown]).reset_index(drop=True)
L1000CRISPR

## Process Data for SQL

### Dataset

In [None]:
# Row describing this dataset for the SQL import (presumably the dataset table — field
# order must match its schema; verify before inserting). Typo "diffeentially" fixed.
(145, 'LINCS L1000 CMAP CRISPR Knockout Consensus Signatures', 'CRISPR Knockout Consensus Signatures', 'gene association consensus signatures following CRISPR gene knockout', 'gene-gene associations by differential expression of gene A following perturbation of gene B', 'genes differentially expressed following the {0} gene perturbation from the LINCS L1000 CMAP CRISPR Knockout Consensus Signatures dataset.', 'sets of genes differentially expressed following gene perturbation from the LINCS L1000 CMAP CRISPR Knockout Consensus Signatures dataset.', 'gene perturbations changing expression of {0} gene from the LINCS L1000 CMAP CRISPR Knockout Consensus Signatures dataset.', 'increased expression', 'decreased expression', 1, 1, '2023-09-05', 'l1000crispr', 0, 50, 13, 7, 27, 5, 'gene expression by L1000 assay', 'primary experimental data', 'high throughput, data-driven', 'gene perturbations', 0)

### Gene

In [None]:
# Existing gene table, indexed by symbol; read once and reused for both the
# membership test and the symbol -> primary-key dictionary.
genelist = pd.read_csv('tables/gene.csv').set_index('symbol')
genefks = genelist['id'].to_dict()
geneinfo = pd.read_csv('tables/gene_info', sep='\t', index_col='Symbol').get(['GeneID','description'])
geneurl = 'https://ncbi.nlm.nih.gov/gene/'

# First primary key to hand out to genes that are not yet in the gene table.
index = 57253
for gene in L1000CRISPR['Gene'].unique():
    # The membership test compares the upper-cased symbol against the table's symbols.
    if gene.upper() not in genelist.index:
        # Print a tuple row (id, symbol, Entrez ID, description, NCBI URL) for the new gene.
        print((index, gene, geneids[gene], geneinfo.loc[gene, 'description'], geneurl+str(geneids[gene])), end=',\n')
        # Record the newly allocated primary key (the original stored the Entrez ID
        # here, unlike the attribute loop, which stores the new key).
        genefks[gene] = index
        index += 1


### Attribute

In [None]:
# Existing attribute table, indexed by the naming authority's name, restricted
# to rows whose naming_authority_fk is 18.
attributes = pd.read_csv('tables/attribute.csv', index_col='name_from_naming_authority')
attributes = attributes.loc[attributes['naming_authority_fk'].eq(18)]

# Symbol -> description dictionary (replaces any earlier value of geneinfo).
geneinfo = (
    pd.read_csv('tables/gene_info', sep='\t')
    .set_index('Symbol')['description']
    .to_dict()
)

In [None]:
# Attribute name -> primary key, seeded from the existing attribute table.
attributefks = attributes['id'].to_dict()

# First primary key to hand out to attributes missing from the table.
index = 365322

for attribute in L1000CRISPR['Gene KO'].unique():
    if attribute in attributes.index:
        continue
    attributefks[attribute] = index
    # Print a tuple row for the new attribute; the trailing 18 is the naming authority key.
    print(
        (
            index,
            attribute,
            geneids[attribute],
            geneinfo[attribute],
            geneurl + str(geneids[attribute]),
            18,
        ),
        end=',\n',
    )
    index += 1

### Gene Set

In [None]:
# Gene KO symbol -> gene set primary key; keys are handed out sequentially
# starting at 134600000, in order of first appearance.
genesetfks = {}
index = 134600000
for attribute in L1000CRISPR['Gene KO'].unique():
    genesetfks[attribute] = index
    # Print one tuple row per gene set, ending with the attribute's primary key.
    print(
        (
            index,
            attribute,
            geneids[attribute],
            geneinfo[attribute],
            geneurl + str(geneids[attribute]),
            145,
            27,
            attributefks[attribute],
        ),
        end=',\n',
    )
    index += 1

### Association

In [None]:
# (Re)load the approved symbol -> Entrez Gene ID dictionary.
geneids = (
    pd.read_csv('mapping/GeneSymbolsAndIDS_2023.tsv', sep='\t')
    .set_index('Human, Mouse, and Rat Approved Symbol')['Entrez Gene ID(supplied by NCBI)']
    .to_dict()
)

In [None]:
# Entrez Gene ID -> primary key of the gene table.
genefks = (
    pd.read_csv('tables/gene.csv')
    .set_index('ncbi_entrez_gene_id')['id']
    .to_dict()
)

# Threshold is +1 for strictly positive values and -1 for everything else.
L1000CRISPR['threshold'] = L1000CRISPR[0].gt(0).map({True: 1, False: -1})

# Build the association table on a copy so L1000CRISPR keeps its symbols.
associations = L1000CRISPR.copy()
associations['Gene'] = associations['Gene'].map(lambda symbol: genefks[geneids[symbol]])
associations['Gene KO'] = associations['Gene KO'].map(lambda symbol: genesetfks[symbol])
associations.columns = ['gene_fk', 'gene_set_fk', 'standardized_value', 'threshold_value']

# The row index becomes the 'id' column of the CSV, offset by 25000000.
associations = associations.rename_axis('id', axis=0)
associations.index += 25000000
associations.to_csv('harmonizome-update/l1000crispr.csv')

associations

In [None]:
# NOTE(review): `error` is not defined anywhere in the visible notebook, so evaluating
# it raises NameError — presumably a deliberate stop so that "Run All" halts before
# the download/visualization cells. Confirm with the author.
error

## Create Downloads

In [None]:
# Directory prefix for the download files written below. File names are appended
# by plain string concatenation, so the trailing slash must be kept.
output_path = 'newdata/L1000CRISPR/downloads/'

In [None]:
# Attach Entrez IDs: plain for genes, suffixed with '_KO' for the knockout attributes.
L1000CRISPR['Gene ID'] = L1000CRISPR['Gene'].map(lambda symbol: geneids[symbol])
L1000CRISPR['Gene KO ID'] = L1000CRISPR['Gene KO'].map(lambda symbol: f"{geneids[symbol]}_KO")

# Fix the column order and give the value/threshold columns their published names.
L1000CRISPR = L1000CRISPR[['Gene', 'Gene ID', 'Gene KO', 'Gene KO ID', 0, 'threshold']].rename(
    columns={0: 'Standardized Value', 'threshold': 'Threshold Value'}
)
L1000CRISPR

### Gene-Attribute Ternary Matrix

In [None]:
# Gene x Gene KO matrix of threshold values. Pairs with no association are
# filled with 0; where a pair occurs more than once the maximum threshold is kept.
binarymatrix = pd.crosstab(
    L1000CRISPR['Gene'],
    L1000CRISPR['Gene KO'],
    L1000CRISPR['Threshold Value'],
    aggfunc=np.max,
).fillna(0)
binarymatrixT = binarymatrix.transpose()
binarymatrix.to_csv(output_path+'gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
binarymatrix

### Gene-Attribute Edge List

In [None]:
# The edge list is a copy of the long-form table, written as gzip-compressed TSV.
# The same frame is iterated later to build the knowledge-graph edges.
edgelist = L1000CRISPR.copy()
edgelist.to_csv(output_path+'gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# One row per distinct (Gene, Gene ID) pair, renumbered from 0.
geneslist = (
    L1000CRISPR[['Gene', 'Gene ID']]
    .drop_duplicates()
    .reset_index(drop=True)
)
geneslist.to_csv(output_path+'gene_list_terms.txt.gz', sep='\t', compression='gzip')
geneslist

### Attribute List

In [None]:
# One row per distinct (Gene KO, Gene KO ID) pair, renumbered from 0.
attributeslist = (
    L1000CRISPR[['Gene KO', 'Gene KO ID']]
    .drop_duplicates()
    .reset_index(drop=True)
)
attributeslist.to_csv(output_path+'attribute_list_entries.txt.gz', sep='\t', compression='gzip')
attributeslist

### Up Gene Set Library

In [None]:
# Write one tab-separated line per Gene KO: the KO name followed by every gene
# whose threshold is +1. Sets with fewer than 5 genes are skipped.
with open(output_path+'gene_set_library_up_crisp.gmt', 'w') as f:
    arr = binarymatrix.to_numpy(dtype=np.int_)
    # Deliberately not named `attributes`: that name holds the attribute table
    # loaded in the Attribute section, and rebinding it breaks re-running those cells.
    attribute_names = binarymatrix.columns
    gene_symbols = binarymatrix.index

    for column, attribute_name in enumerate(tqdm(attribute_names)):
        # Compute the membership mask once and reuse it for the size test and the output.
        members = gene_symbols[arr[:, column] == 1]
        if len(members) >= 5:
            print(attribute_name, *members, sep='\t', end='\n', file=f)

### Down Gene Set Library

In [None]:
# Write one tab-separated line per Gene KO: the KO name followed by every gene
# whose threshold is -1. Sets with fewer than 5 genes are skipped.
with open(output_path+'gene_set_library_dn_crisp.gmt', 'w') as f:
    arr = binarymatrix.to_numpy(dtype=np.int_)
    # Deliberately not named `attributes`: that name holds the attribute table
    # loaded in the Attribute section, and rebinding it breaks re-running those cells.
    attribute_names = binarymatrix.columns
    gene_symbols = binarymatrix.index

    for column, attribute_name in enumerate(tqdm(attribute_names)):
        # Compute the membership mask once and reuse it for the size test and the output.
        members = gene_symbols[arr[:, column] == -1]
        if len(members) >= 5:
            print(attribute_name, *members, sep='\t', end='\n', file=f)

### Up Attribute Set Library

In [None]:
# Write one tab-separated line per gene: the gene symbol followed by every
# Gene KO whose threshold for that gene is +1. Sets with fewer than 5 entries are skipped.
with open(output_path+'attribute_set_library_up_crisp.gmt', 'w') as f:
    arr = binarymatrixT.to_numpy(dtype=np.int_)
    genes = binarymatrixT.columns
    perturbations = binarymatrixT.index

    for column, gene in enumerate(tqdm(genes)):
        hits = perturbations[arr[:, column] == 1]
        if len(hits) >= 5:
            print(gene, *hits, sep='\t', end='\n', file=f)

### Down Attribute Set Library

In [None]:
# Write one tab-separated line per gene: the gene symbol followed by every
# Gene KO whose threshold for that gene is -1. Sets with fewer than 5 entries are skipped.
with open(output_path+'attribute_set_library_dn_crisp.gmt', 'w') as f:
    arr = binarymatrixT.to_numpy(dtype=np.int_)
    genes = binarymatrixT.columns
    perturbations = binarymatrixT.index

    for column, gene in enumerate(tqdm(genes)):
        hits = perturbations[arr[:, column] == -1]
        if len(hits) >= 5:
            print(gene, *hits, sep='\t', end='\n', file=f)

### Gene Similarity Matrix

In [None]:
# Pairwise cosine similarity between gene rows: compute condensed cosine
# distances, expand them to a square matrix, and convert distance to similarity.
gene_cosine_distance = dist.squareform(
    dist.pdist(binarymatrix.to_numpy(dtype=np.int_), 'cosine')
)
gene_similarity_matrix = pd.DataFrame(
    data=1 - gene_cosine_distance,
    index=binarymatrix.index,
    columns=binarymatrix.index,
)

# Clear the axis names before writing the file.
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None
gene_similarity_matrix.to_csv(output_path+'gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
# Pairwise cosine similarity between Gene KO rows of the transposed matrix:
# condensed cosine distances -> square matrix -> similarity.
attribute_cosine_distance = dist.squareform(
    dist.pdist(binarymatrixT.to_numpy(dtype=np.int_), 'cosine')
)
attribute_similarity_matrix = pd.DataFrame(
    data=1 - attribute_cosine_distance,
    index=binarymatrixT.index,
    columns=binarymatrixT.index,
)

# Clear the axis names before writing the file.
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None
attribute_similarity_matrix.to_csv(output_path+'attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Knowledge Graph Serialization

In [None]:
# Build the knowledge-graph structures that the RDF / JSON / TSV cells serialize.
#   nodes: node id -> {"type", "properties"}; gene nodes are keyed by integer
#          Gene ID, perturbation nodes by their 'Gene KO ID' string.
#   edges: one dict per row of edgelist, pointing from the gene to the perturbation.
# Columns are iterated in lockstep with zip instead of a per-row `.loc` lookup,
# which is very slow on an edge list of this size.
nodes = {}
edges = []

for gene_id, gene_symbol in zip(geneslist['Gene ID'], geneslist['Gene']):
    gene_id = int(gene_id)
    nodes[gene_id] = {
        "type": "gene",
        "properties": {
            "id": gene_id,
            "label": gene_symbol
        }}

for ko_id, ko_symbol in zip(attributeslist['Gene KO ID'], attributeslist['Gene KO']):
    nodes[ko_id] = {
        "type": "gene perturbation",
        "properties": {
            "label": ko_symbol,
            "id": ko_id
        }}

for gene_id, gene_symbol, ko_id, ko_symbol, value, threshold in zip(
        edgelist['Gene ID'],
        edgelist['Gene'],
        edgelist['Gene KO ID'],
        edgelist['Gene KO'],
        edgelist['Standardized Value'],
        edgelist['Threshold Value']):
    # As in the original branches: a threshold of exactly 1 is a positive edge,
    # anything else is recorded as a negative edge with threshold -1.
    is_positive = threshold == 1
    edges.append({
        "source": int(gene_id),
        "relation": "positively regulated by" if is_positive else "negatively regulated by",
        "target": ko_id,
        "properties": {
            "id": str(gene_id)+":"+ko_id,
            "source_id": int(gene_id),
            "source_label": gene_symbol,
            "target_id": ko_id,
            "target_label": ko_symbol,
            "directed": True,
            "standardized_value": value,
            "threshold": 1 if is_positive else -1
        }})

#### RDF

In [None]:
# Write one triple per edge: gene, predicate, knocked-out gene (the '_KO'
# suffix is stripped from the target id). Edges with threshold 1 use
# RO:0002336, all others RO:0002335.
with open(output_path+'kg_serializations/l1000crispr.rdf', 'w') as f:
    # NOTE(review): these prefix lines are not valid Turtle (IRIs are normally
    # written as <...> and the declaration ends with '.'). Left unchanged because
    # existing consumers of this file may rely on the current form — confirm.
    print('@prefix gene: ncbi.nlm.nih.gov/gene/', file=f)
    print('@prefix RO: purl.obolibrary.org/RO_', file=f)

    print('', file=f)
    for edge in edges:
        props = edge['properties']
        # The original negative predicate carried a stray trailing space, which put a
        # double space in those lines; both branches now format identically.
        predicate = 'RO:0002336' if props['threshold'] == 1 else 'RO:0002335'
        subject = 'gene:' + str(props['source_id'])
        target = 'gene:' + props['target_id'].replace('_KO', '')
        print(subject, predicate, target, end=' .\n', file=f)

#### JSON

In [None]:
# Serialize the node and edge structures to an indented JSON document.
# json.dump writes to the file and returns None, so its result is not bound to a name.
with open(output_path+'kg_serializations/l1000crispr.json', 'w') as f:
    json.dump(
        {
            "Version": "1",
            "nodes": nodes,
            "edges": edges
        },
        indent=4,
        fp=f,
    )

#### TSV

In [None]:
# One row per node: transpose so each node id is a row, then pull 'id' and
# 'label' out of each node's properties dict.
nodeframe = pd.DataFrame(nodes).transpose()
nodeframe['id'] = [props['id'] for props in nodeframe['properties']]
nodeframe['label'] = [props['label'] for props in nodeframe['properties']]
nodeframe['namespace'] = 'NCBI Entrez'

# Keep only the published columns and renumber the rows from 0.
nodeframe = nodeframe[['namespace', 'id', 'label']].reset_index(drop=True)
nodeframe.to_csv(output_path+'kg_serializations/l1000crispr_tsv/nodes.tsv', sep='\t')
nodeframe

In [None]:
# One row per edge, with the standardized value and threshold lifted out of
# each edge's properties dict.
edgeframe = pd.DataFrame(edges)
edgeframe['standardized'] = [props['standardized_value'] for props in edgeframe['properties']]
edgeframe['threshold'] = [props['threshold'] for props in edgeframe['properties']]

# NOTE(review): the original computed 'standardized' but left it out of the column
# selection, so it never reached edges.tsv. It is written here on the assumption that
# the omission was unintended; drop it from the list if consumers expect four columns.
edgeframe = edgeframe[['source', 'relation', 'target', 'standardized', 'threshold']]
edgeframe.to_csv(output_path+'kg_serializations/l1000crispr_tsv/edges.tsv', sep='\t')
edgeframe

## Create Visualizations

### Gene-Attribute Clustered Heatmap

In [None]:
# Clustered heatmap of the gene x attribute threshold matrix, using the
# 'seismic' colormap centred on 0.
sns.clustermap(binarymatrix, cmap='seismic', center=0)

### Gene-Gene Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the gene-gene cosine similarity matrix, using the
# 'seismic' colormap centred on 0.
sns.clustermap(gene_similarity_matrix, cmap='seismic', center=0)

### Attribute-Attribute Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the attribute-attribute cosine similarity matrix, using
# the 'seismic' colormap centred on 0.
sns.clustermap(attribute_similarity_matrix, cmap='seismic', center=0)

### UMAP

In [None]:
def load_gmt(file, suffix='_down'):
    """Parse tab-separated gene set lines into an ordered term -> genes mapping.

    Each line is ``term<TAB>gene<TAB>gene...``. The term is stored with
    *suffix* appended, and its genes are de-duplicated and joined with single
    spaces in order of first appearance (deterministic, unlike iterating a set).

    Args:
        file: Iterable of text lines, e.g. an open file object.
        suffix: String appended to every term. The default is '_down' because
            that is what a bare ``load_gmt(file)`` call produced once the
            original cell had finished running.

    Returns:
        OrderedDict mapping ``term + suffix`` to a space-separated gene string.
    """
    gmt = OrderedDict()
    for line in file:
        term, *geneset = line.strip().split('\t')
        gmt[term + suffix] = ' '.join(dict.fromkeys(geneset))
    return gmt


# Load the up and down libraries into one dictionary. The files are opened
# with context managers so the handles are closed after reading.
with open('newdata/L1000CRISPR/downloads/gene_set_library_up_crisp.gmt', 'r') as up_file:
    libdict = load_gmt(up_file, suffix='_up')
with open('newdata/L1000CRISPR/downloads/gene_set_library_dn_crisp.gmt', 'r') as down_file:
    downlibdict = load_gmt(down_file, suffix='_down')
libdict.update(downlibdict)
scatterdir = 'newdata/L1000CRISPR/images/'

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Embed gene sets with TF-IDF + UMAP and label them with Leiden clusters.

    Args:
        libdict: Mapping of term -> space-separated gene string; the values are
            the documents given to the TF-IDF vectorizer.
        nneighbors: ``n_neighbors`` passed to ``sc.pp.neighbors``.
        mindist: ``min_dist`` passed to ``sc.tl.umap``.
        spread: ``spread`` passed to ``sc.tl.umap``.
        maxdf: ``max_df`` passed to ``TfidfVectorizer``.
        mindf: ``min_df`` passed to ``TfidfVectorizer``.

    Returns:
        DataFrame with columns 'x', 'y' (UMAP coordinates), 'cluster',
        'term' and 'genes', with rows ordered by Leiden cluster.
    """
    print("\tTF-IDF vectorizing gene set data...")
    vectorizer = TfidfVectorizer(max_df=maxdf, min_df=mindf)
    tfidf = vectorizer.fit_transform(libdict.values())
    print(tfidf.shape)
    adata = anndata.AnnData(tfidf)
    adata.obs.index = libdict.keys()

    print("\tPerforming Leiden clustering...")
    # n_neighbors and min_dist are the main knobs to tune here.
    sc.pp.neighbors(adata, n_neighbors=nneighbors)
    sc.tl.leiden(adata, resolution=1.0)
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    # Reorder the observations by cluster, then turn the labels into 'Cluster <label>'.
    terms_by_cluster = adata.obs.sort_values(by='leiden').index.tolist()
    adata = adata[terms_by_cluster, :]
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    embedding = pd.DataFrame(adata.obsm['X_umap'])
    embedding.columns = ['x', 'y']

    embedding['cluster'] = adata.obs['leiden'].values
    embedding['term'] = adata.obs.index
    embedding['genes'] = [libdict[term] for term in embedding['term']]

    return embedding

In [None]:
def get_scatter_colors(df):
    """Map each distinct value of df['cluster'] to a Category20 colour.

    Clusters are taken in order of first appearance. The palette is reordered
    so its even-indexed colours come before its odd-indexed ones, and it wraps
    around after 20 clusters.
    """
    palette = list(Category20[20])
    colors = palette[::2] + palette[1::2]
    clusters = pd.unique(df['cluster']).tolist()
    return {cluster: colors[position % 20] for position, cluster in enumerate(clusters)}

def get_scatterplot(scatterdf):
    """Build the Bokeh scatter plot of the UMAP embedding.

    Args:
        scatterdf: DataFrame with at least the columns 'x', 'y', 'term' and
            'cluster'. It is copied, so the caller's frame is not modified.

    Returns:
        The Bokeh figure, with one point per row coloured by cluster and a
        hover tooltip showing the gene set name, coordinates and cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Every row <div> is closed; the original left two of them open, which
    # made the tooltip markup malformed.
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000, 
        height=700, 
        tools=tools_emb
    )

    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # hide axis tick marks and tick labels
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt' 

    plot_emb.output_backend = "svg"    
    
    plot_emb.title = 'Gene Sets in LINCS L1000 CMAP CRISPR Knockout Consensus Signatures Library'
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'
    
    # The returned renderer is not needed, so it is not bound to a name.
    plot_emb.scatter(
        'x', 
        'y', 
        size = 4, 
        source = source, 
        color = 'colors'
    )
    
    return plot_emb

In [None]:
## defaults: nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1
# Override the defaults (nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1).
scatter_df = process_scatterplot(
    libdict,
    nneighbors=200,
    mindist=0.001,
    spread=1.5,
    maxdf=0.8,
    mindf=0.2,
)

# Display Scatter Plot
plot = get_scatterplot(scatter_df)
output_notebook()
show(plot)

In [None]:
# Save the plot as a standalone HTML file. Any trailing slash on scatterdir is
# stripped first so the path has no double '/' (the original produced 'images//umap.html').
output_file(
    filename=f"{scatterdir.rstrip('/')}/umap.html",
    title='Gene Sets in LINCS L1000 CMAP CRISPR Knockout Consensus Signatures Library',
)
save(plot)