# LINCS L1000 CMAP Chemical Perturbation Consensus Signatures
This notebook contains the processing script for the LINCS L1000 CMAP Chemical Perturbation Consensus Signatures dataset. The Chemical Perturbations Consensus Signatures file was downloaded from the [SigCom LINCS download page](https://maayanlab.cloud/sigcom-lincs/#/Download) and filtered using gene-wise z-scoring to include only associations with |z| ≥ 3. This left an edge list of 5,086,167 associations between 12,126 genes and 23,913 small molecule chemical perturbations.

In [None]:
import pandas as pd
import datetime
import math
import numpy as np
import scipy.spatial.distance as dist
import seaborn as sns
import sys
import json
import scanpy as sc
from tqdm import tqdm

# UMAP
from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
output_notebook()

from IPython.display import display, HTML, Markdown
sys.setrecursionlimit(100000)

## Load and pre-process Data

In [None]:
# Read the gzip-compressed, tab-separated coefficient matrix into a DataFrame.
chempert = pd.read_csv('cp_mean_coeff_mat.tsv.gz', compression='gzip', sep='\t')
chempert

In [None]:
# Index by the unnamed first column, transpose, and label the axes so that
# rows are 'Gene' and columns are 'Chemical Perturbation'.
chempert = (
    chempert
    .set_index('Unnamed: 0')
    .T
    .rename_axis('Gene', axis=0)
    .rename_axis('Chemical Perturbation', axis=1)
)
chempert

### Map Genes to Up-to-Date and Approved Gene Symbols

In [None]:
# genemapping: column 1 -> column 2 of the headerless mapping file.
genemapping = (
    pd.read_csv('../../mapping/mappingFile_2023.tsv', sep='\t', header=None)
    .set_index(1)[2]
    .to_dict()
)

# geneids: upper-cased approved symbol -> Entrez Gene ID.
# `.str.upper()` is used instead of `.apply(str.upper)` so that a missing
# symbol stays NaN rather than raising TypeError.
geneids = pd.read_csv('../../mapping/GeneSymbolsAndIDS_2023.tsv', sep='\t')
geneids['Human, Mouse, and Rat Approved Symbol'] = geneids['Human, Mouse, and Rat Approved Symbol'].str.upper()
geneids = geneids.set_index('Human, Mouse, and Rat Approved Symbol')['Entrez Gene ID(supplied by NCBI)'].to_dict()

# geneinfo: upper-cased symbol -> description.
geneinfo = pd.read_csv('../../tables/gene_info', sep='\t')
geneinfo['Symbol'] = geneinfo['Symbol'].str.upper()
geneinfo = geneinfo.set_index('Symbol')['description'].to_dict()

In [None]:
# Translate row labels through genemapping, then keep only the first row
# for each resulting label.
mapped_symbols = chempert.index.map(genemapping)
chempert.index = mapped_symbols
chempert = chempert.loc[~chempert.index.duplicated()]
chempert

### Apply Gene-Wise Z-Scoring and Filter

In [None]:
def zscore(gene):
    """Standardize one row of values.

    Parameters
    ----------
    gene : pandas.Series
        Values to standardize (here: one gene across all perturbations).

    Returns
    -------
    pandas.Series
        ``(gene - gene.mean()) / gene.std()`` with the same index as the
        input. ``Series.std`` is called with its pandas default arguments.
    """
    # Vectorized arithmetic; same result as applying the formula per element,
    # without a Python-level call for every value.
    return (gene - gene.mean()) / gene.std()

# Standardize every row, blank out entries with |z| < 3, and reshape the
# remaining entries into a long table sorted by value.
chempert = chempert.apply(zscore, axis=1)
strong_only = chempert.where(chempert.abs() >= 3)
chempert = strong_only.stack().sort_values().to_frame().reset_index()
# Discard rows whose gene label is missing.
chempert = chempert[chempert['Gene'].notna()]
chempert

## Process Data for SQL

### Dataset

In [None]:
# Row for the dataset table; the column order is listed on the next line.
#(id, name, name_without_resource, description, association, gene_set_description, gene_sets_description, attribute_set_description, positive_association, negative_association, is_signed, is_continuous_valued, last_updated, directory, num_page_views, resource_fk, measurement_fk, dataset_group_fk, attribute_type_fk, attribute_group_fk, evidence_type, evidence_group, measurement_bias, attribute_type_plural, is_archived)
# NOTE(review): the column list names 25 fields but the tuple below has 24 values.
# 'high throughput, datadriven, small molecule perturbations' may be measurement_bias and
# attribute_type_plural fused into one literal — confirm against the target table before inserting.
(150, 'LINCS L1000 CMAP Chemical Perturbation Consensus Signatures', 'Chemical Perturbations Consensus Signatures', 'gene association consensus signatures following small molecule perturbation', 'gene-small molecule associations by differential expression of gene following small molecule perturbation', 'genes differentially expressed following the {0} small molecule perturbation from the LINCS L1000 CMAP Chemical Perturbation Consensus Signatures dataset.', 'sets of genes differentially expressed following small molecule perturbations from the LINCS L1000 CMAP Chemical Perturbation Consensus Signatures dataset.', 'small molecule perturbations changing expression of {0} gene from the LINCS L1000 CMAP Chemical Perturbations Consensus Signatures dataset.', 'increased expression', 'decreased expression', 1, 1, '2023-10-27', 'l1000chempert', 0, 50, 13, 7, 12, 2, 'gene expression by L1000 assay', 'primary experimental data', 'high throughput, datadriven, small molecule perturbations', 0)

### Publication

In [None]:
# Row for the publication table; the column order is listed on the next line.
#(id, long_citation, short_citation, url, pmid, pubmed_url, first_author_last_name, first_author_initials, journal_abbreviation, year, title, volume, pages)
(149, 'Evangelista, JE et al. (2022) SigCom LINCS: data and metadata search engine for a million gene expression signatures. Nucleic Acids Res. 50:W697-709.', 'Evangelista, Nucleic Acids Res, 2022', 'dx.doi.org/10.1093/nar/gkac328', 35524556, 'https://ncbi.nlm.nih.gov/pubmed/35524556', 'Evangelista', 'JE', 'Nucleic Acids Res', 2022, 'SigCom LINCS: data and metadata search engine for a million gene expression signatures', 50, 'W697-709')

### Gene

In [None]:
# First id assigned to genes that are not already in the gene table.
# NOTE(review): 57620 is hard-coded; presumably the next free id in gene.csv — confirm before re-running.
index = 57620
genes = pd.read_csv('../../tables/gene.csv')
genes['symbol'] = genes['symbol'].apply(str.upper)
genefks = genes.set_index('symbol')['id'].to_dict()
geneslist = genes['symbol'].tolist()
# Membership is checked once per unique gene; a set makes each check O(1)
# instead of scanning the whole list every time.
known_symbols = set(geneslist)
chempert['Gene'] = chempert['Gene'].apply(str.upper)
for gene in chempert['Gene'].unique():
    if gene not in known_symbols:
        # Print a gene-table row for the new gene and remember its id for the association step.
        print((index, gene, geneids[gene], geneinfo[gene], 'https://ncbi.nlm.nih.gov/gene/'+str(geneids[gene])), end=',\n')
        genefks[gene] = index
        index += 1

### Attribute

In [None]:
# Print one attribute-table row per unique perturbation and record the id given to each.
first_attribute_id = 368645
unique_perturbations = chempert['Chemical Perturbation'].unique()
attributefks = {}
for index, perturbation in enumerate(unique_perturbations, start=first_attribute_id):
    print((index, perturbation, 73), end=',\n')
    attributefks[perturbation] = index
# Leave `index` pointing one past the last id used, as a running counter would.
index = first_attribute_id + len(unique_perturbations)

### Gene Set

In [None]:
# Print one gene-set-table row per unique perturbation and record the id given to each.
first_gene_set_id = 135100000
unique_perturbations = chempert['Chemical Perturbation'].unique()
genesetfks = {}
for index, perturbation in enumerate(unique_perturbations, start=first_gene_set_id):
    print((index, perturbation, 150, 12, attributefks[perturbation]), end=',\n')
    genesetfks[perturbation] = index
# Leave `index` pointing one past the last id used, as a running counter would.
index = first_gene_set_id + len(unique_perturbations)

### Association

In [None]:
def threshold(score):
    """Return the sign of ``score`` as an int.

    Returns 1 for a positive score, -1 for a negative score, and 0 when
    neither comparison holds.
    """
    if score > 0:
        return 1
    return -1 if score < 0 else 0

In [None]:
# Build the association table: swap gene and perturbation labels for their
# foreign keys, add the signed threshold, and drop rows with any missing value.
fk_lookups = {'Gene': genefks, 'Chemical Perturbation': genesetfks}
associations = chempert.copy()
for label_column, fk_lookup in fk_lookups.items():
    associations[label_column] = associations[label_column].map(fk_lookup)
associations = associations.set_axis(['gene_fk', 'gene_set_fk', 'standardized_value'], axis=1)
associations['threshold_value'] = associations['standardized_value'].map(threshold)
associations = associations.dropna().reset_index(drop=True)
# Offset the row index so that ids start at 32000000.
associations.index = associations.index + 32000000
associations.to_csv('../../harmonizome-update/l1000chempert.csv')
associations

## Create Downloads

In [None]:
output_path = 'downloads/'
# Rename the value column by label rather than overwriting all column names.
# Positional assignment raises a length-mismatch error if this cell is re-run
# after 'threshold' has been added; renaming by label is safe to repeat.
chempert = chempert.rename(columns={0: 'z'})
chempert['threshold'] = chempert['z'].apply(threshold)
chempert

### Gene Attribute Ternary Matrix

In [None]:
# Gene x perturbation matrix of threshold values; cells with no association become 0.
# aggfunc is passed as the string 'max': pandas currently maps the builtin `max`
# to the same groupby reduction but warns that this will change.
ternarymatrix = pd.crosstab(
    chempert['Gene'],
    chempert['Chemical Perturbation'],
    chempert['threshold'],
    aggfunc='max',
).fillna(0)
ternarymatrixT = ternarymatrix.T
ternarymatrix.to_csv(output_path+'gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
ternarymatrix

### Gene Attribute Edge List

In [None]:
# Long-format edge list with a Gene ID looked up in geneids for every row.
edgelist = chempert.copy()
edgelist['Gene ID'] = [geneids[symbol] for symbol in edgelist['Gene']]
edgelist = edgelist[['Gene', 'Gene ID', 'Chemical Perturbation', 'z', 'threshold']]
edgelist = edgelist.rename(columns={'z': 'Z-score', 'threshold': 'Threshold Value'})
edgelist.to_csv(output_path+'gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# Unique (Gene, Gene ID) pairs found in the edge list, renumbered from 0.
gene_columns = edgelist[['Gene', 'Gene ID']]
geneslist = gene_columns.drop_duplicates().reset_index(drop=True)
geneslist.to_csv(output_path+'gene_list_terms.txt.gz', sep='\t', compression='gzip')
geneslist

### Attribute List

In [None]:
# Unique perturbation names found in the edge list, renumbered from 0.
perturbation_column = edgelist[['Chemical Perturbation']]
attributeslist = perturbation_column.drop_duplicates().reset_index(drop=True)
attributeslist.to_csv(output_path+'attribute_list_entries.txt.gz', sep='\t', compression='gzip')
attributeslist

### Up Gene Set Library

In [None]:
# One GMT line per perturbation: its name followed by the genes marked +1.
# Perturbations with fewer than 5 such genes are left out.
with open(output_path+'gene_set_library_up_crisp.gmt', 'w') as f:
    arr = ternarymatrix.reset_index(drop=True).to_numpy(dtype=np.int_)
    attributes = ternarymatrix.columns

    w, h = arr.shape
    for i in tqdm(range(h)):
        up_genes = ternarymatrix.index[arr[:, i] == 1]
        if len(up_genes) < 5:
            continue
        f.write('\t'.join(map(str, (attributes[i], *up_genes))) + '\n')

### Down Gene Set Library

In [None]:
# One GMT line per perturbation: its name followed by the genes marked -1.
# Perturbations with fewer than 5 such genes are left out.
with open(output_path+'gene_set_library_dn_crisp.gmt', 'w') as f:
    arr = ternarymatrix.reset_index(drop=True).to_numpy(dtype=np.int_)
    attributes = ternarymatrix.columns

    w, h = arr.shape
    for i in tqdm(range(h)):
        down_genes = ternarymatrix.index[arr[:, i] == -1]
        if len(down_genes) < 5:
            continue
        f.write('\t'.join(map(str, (attributes[i], *down_genes))) + '\n')

### Up Attribute Set Library

In [None]:
# One GMT line per gene: its symbol followed by the perturbations marked +1.
# Genes with fewer than 5 such perturbations are left out.
with open(output_path+'attribute_set_library_up_crisp.gmt', 'w') as f:
    arr = ternarymatrixT.reset_index(drop=True).to_numpy(dtype=np.int_)
    genes = ternarymatrixT.columns

    w, h = arr.shape
    for i in tqdm(range(h)):
        up_perturbations = ternarymatrixT.index[arr[:, i] == 1]
        if len(up_perturbations) < 5:
            continue
        f.write('\t'.join(map(str, (genes[i], *up_perturbations))) + '\n')

### Down Attribute Set Library

In [None]:
# One GMT line per gene: its symbol followed by the perturbations marked -1.
# Genes with fewer than 5 such perturbations are left out.
with open(output_path+'attribute_set_library_dn_crisp.gmt', 'w') as f:
    arr = ternarymatrixT.reset_index(drop=True).to_numpy(dtype=np.int_)
    genes = ternarymatrixT.columns

    w, h = arr.shape
    for i in tqdm(range(h)):
        down_perturbations = ternarymatrixT.index[arr[:, i] == -1]
        if len(down_perturbations) < 5:
            continue
        f.write('\t'.join(map(str, (genes[i], *down_perturbations))) + '\n')

### Gene Similarity Matrix

In [None]:
# Pairwise cosine similarity between gene rows of the ternary matrix,
# computed as 1 - cosine distance.
gene_similarity_matrix = dist.pdist(ternarymatrix.to_numpy(dtype=np.int_), 'cosine')
gene_similarity_matrix = dist.squareform(gene_similarity_matrix)
gene_similarity_matrix = 1 - gene_similarity_matrix

# Label both axes with an unnamed copy of the gene index. Assigning
# `.index.name = None` on the new frame would also clear the name on
# ternarymatrix, because the frame would hold the very same Index object.
gene_labels = ternarymatrix.index.rename(None)
gene_similarity_matrix = pd.DataFrame(data=gene_similarity_matrix, index=gene_labels, columns=gene_labels)
gene_similarity_matrix.to_csv(output_path+'gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
# Pairwise cosine similarity between perturbation rows of the transposed
# ternary matrix, computed as 1 - cosine distance.
attribute_similarity_matrix = dist.pdist(ternarymatrixT.to_numpy(dtype=np.int_), 'cosine')
attribute_similarity_matrix = dist.squareform(attribute_similarity_matrix)
attribute_similarity_matrix = 1 - attribute_similarity_matrix

# Label both axes with an unnamed copy of the perturbation index. Assigning
# `.index.name = None` on the new frame would also clear the name on
# ternarymatrixT, because the frame would hold the very same Index object.
attribute_labels = ternarymatrixT.index.rename(None)
attribute_similarity_matrix = pd.DataFrame(data=attribute_similarity_matrix, index=attribute_labels, columns=attribute_labels)
attribute_similarity_matrix.to_csv(output_path+'attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Gene Attribute Standardized Matrix

In [None]:
# Gene x perturbation matrix of z-scores; cells with no association become 0.
# aggfunc is passed as the string 'max': pandas currently maps the builtin `max`
# to the same groupby reduction but warns that this will change.
standardizedmatrix = pd.crosstab(
    chempert['Gene'],
    chempert['Chemical Perturbation'],
    chempert['z'],
    aggfunc='max',
).fillna(0)
standardizedmatrix.to_csv(output_path+'gene_attribute_matrix_standardized.txt.gz', sep='\t', compression='gzip')
standardizedmatrix

### Knowledge Graph Serialization

In [None]:
# Reload the gene and attribute lists from the downloads folder, using the
# column labelled 'Unnamed: 0' as the index.
geneslist = pd.read_csv('downloads/gene_list_terms.txt.gz', sep='\t', compression='gzip', index_col='Unnamed: 0')
attributeslist = pd.read_csv('downloads/attribute_list_entries.txt.gz', sep='\t', compression='gzip', index_col='Unnamed: 0')

In [None]:
# Reload the edge list from the downloads folder, using the column labelled
# 'Unnamed: 0' as the index.
edgelist = pd.read_csv('downloads/gene_attribute_edges.txt.gz', sep='\t', compression='gzip', index_col='Unnamed: 0')
edgelist

In [None]:
# Build the knowledge-graph containers:
#   nodes: node key -> {"type", "properties"}; genes are keyed by integer
#          Gene ID, perturbations by their name.
#   edges: one dict per edge-list row, gene -> perturbation.
nodes = {}
edges = []

# Columns are iterated directly instead of calling `.loc` once per row.
# The loop variables are deliberately not called `gene`, `chempert` or
# `edge`: the name `chempert` is the main DataFrame of this notebook and
# must not be overwritten here.
for gene_symbol, gene_id in zip(geneslist['Gene'], geneslist['Gene ID']):
    nodes[int(gene_id)] = {
        "type":"gene",
        "properties": {
            "id":int(gene_id),
            "label":gene_symbol
        }}

for perturbation_name in attributeslist['Chemical Perturbation']:
    nodes[perturbation_name] = {
        "type":"chemical perturbation",
        "properties": {
            "label":perturbation_name,
            "id":perturbation_name
        }}

edge_columns = zip(
    edgelist['Gene'],
    edgelist['Gene ID'],
    edgelist['Chemical Perturbation'],
    edgelist['Z-score'],
    edgelist['Threshold Value'],
)
for gene_symbol, gene_id, perturbation_name, z_value, threshold_value in edge_columns:
    # A threshold of exactly 1 is a positive edge; every other value is
    # treated as negative.
    is_positive = threshold_value == 1
    edges.append({
        "source": int(gene_id),
        "relation": "positively regulated by" if is_positive else "negatively regulated by",
        "target": perturbation_name,
        "properties":{
            "id":str(gene_id)+":"+perturbation_name,
            "source_id":int(gene_id),
            "source_label":gene_symbol,
            "target_id":perturbation_name,
            "target_label":perturbation_name,
            "directed":True,
            "standardized_value":z_value,
            "threshold":1 if is_positive else -1
        }})

#### RDF

In [None]:
# Folder prefix used by the serialization cells below.
output_path='downloads/'

In [None]:
# Write one "subject predicate object ." line per edge.
with open(output_path+'kg_serializations/l1000chempert.rdf', 'w') as f:
    # NOTE(review): these prefix lines are not W3C Turtle syntax (no <IRI>
    # brackets, no terminating '.'). They are left unchanged because the
    # consumers of this file are not visible here — confirm the intended format.
    print('@prefix gene: ncbi.nlm.nih.gov/gene/', file=f)
    print('@prefix RO: purl.obolibrary.org/RO_', file=f)

    print('', file=f)
    for edge in edges:
        # Threshold 1 uses RO:0002336, anything else RO:0002335. The predicate
        # carries no trailing space, so both kinds of line are separated by
        # single spaces.
        predicate = 'RO:0002336' if edge["properties"]["threshold"]==1 else 'RO:0002335'
        print('gene:'+str(edge['properties']['source_id']), predicate, edge['properties']['target_id'], end=' .\n', file=f)

#### JSON

In [None]:
# Serialize the graph as a single JSON document with 4-space indentation.
# json.dump returns None, so its result is not stored.
with open(output_path+'kg_serializations/l1000chempert.json', 'w') as f:
    json.dump(
        {
            "Version":"1", 
            "nodes": nodes,
            "edges": edges
        }, indent=4, fp=f)

#### TSV

In [None]:
# Flatten the node dictionary into a (namespace, id, label) table and write it as TSV.
namespace = {'gene':'NCBI Entrez', 'chemical perturbation':'LINCS L1000 CMAP'}
nodeframe = pd.DataFrame(nodes).T
node_properties = nodeframe['properties']
nodeframe['id'] = [properties['id'] for properties in node_properties]
nodeframe['label'] = [properties['label'] for properties in node_properties]
nodeframe['namespace'] = [namespace[node_type] for node_type in nodeframe['type']]
nodeframe = nodeframe[['namespace', 'id', 'label']].reset_index(drop=True)
nodeframe.to_csv(output_path+'kg_serializations/l1000chempert_tsv/nodes.tsv', sep='\t')
nodeframe

In [None]:
# Flatten the edge list into a (source, relation, target, threshold) table and write it as TSV.
edgeframe = pd.DataFrame(edges)
edge_properties = edgeframe['properties']
# NOTE(review): 'standardized' is extracted here but is not among the columns
# selected below, so it never reaches edges.tsv — confirm whether that is intended.
edgeframe['standardized'] = [properties['standardized_value'] for properties in edge_properties]
edgeframe['threshold'] = [properties['threshold'] for properties in edge_properties]
edgeframe = edgeframe[['source', 'relation', 'target', 'threshold']]
edgeframe.to_csv(output_path+'kg_serializations/l1000chempert_tsv/edges.tsv', sep='\t')
edgeframe

## Create Visualizations

In [None]:
# Reload the matrices written to the downloads folder. The ternary matrix is
# indexed by its 'Gene' column, the similarity matrices by 'Unnamed: 0'.
ternarymatrix = pd.read_csv('downloads/gene_attribute_matrix.txt.gz', sep='\t', compression='gzip', index_col='Gene')
gene_similarity_matrix = pd.read_csv('downloads/gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip', index_col='Unnamed: 0')
attribute_similarity_matrix = pd.read_csv('downloads/attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip', index_col='Unnamed: 0')

### Gene Attribute Clustered Heatmap

In [None]:
# Clustered heatmap of the ternary matrix; diverging colormap centered on 0.
sns.clustermap(ternarymatrix, cmap='seismic', center=0)

### Gene Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the gene similarity matrix; diverging colormap centered on 0.
sns.clustermap(gene_similarity_matrix, cmap='seismic', center=0)

### Attribute Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the attribute similarity matrix; diverging colormap centered on 0.
sns.clustermap(attribute_similarity_matrix, cmap='seismic', center=0)

### UMAP

In [None]:
def load_gmt(file, suffix='_down'):
    """Parse tab-separated GMT lines into an ordered term -> genes mapping.

    Parameters
    ----------
    file : iterable of str
        Lines of the form ``term<TAB>gene1<TAB>gene2...``.
    suffix : str, optional
        Text appended to every term to form its key. The default is
        ``'_down'``, which is what a one-argument call produced once this
        cell had run when the function was defined twice.

    Returns
    -------
    collections.OrderedDict
        ``term + suffix`` -> the line's genes joined by single spaces, with
        duplicates removed and first-seen order kept.
    """
    gmt = OrderedDict()
    for line in file:
        term, *geneset = line.strip().split('\t')
        # dict.fromkeys removes duplicates like set() but keeps file order,
        # so the joined string is the same on every run.
        gmt[term + suffix] = ' '.join(dict.fromkeys(geneset))
    return gmt

# Load both libraries with one function, closing each file after reading.
with open('downloads/gene_set_library_up_crisp.gmt', 'r') as up_file:
    libdict = load_gmt(up_file, suffix='_up')
with open('downloads/gene_set_library_dn_crisp.gmt', 'r') as down_file:
    downlibdict = load_gmt(down_file, suffix='_down')
libdict.update(downlibdict)
scatterdir = 'images/'

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Embed a gene set library in 2-D with TF-IDF, Leiden clustering and UMAP.

    Parameters
    ----------
    libdict : mapping of str -> str
        Term -> space-separated genes; the values are the TF-IDF documents.
    nneighbors : int
        Passed to ``sc.pp.neighbors`` as ``n_neighbors``.
    mindist, spread : float
        Passed to ``sc.tl.umap`` as ``min_dist`` and ``spread``.
    maxdf, mindf
        Passed to ``TfidfVectorizer`` as ``max_df`` and ``min_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``x``, ``y``, ``cluster``, ``term`` and ``genes``, with rows
        ordered by Leiden cluster.
    """
    print("\tTF-IDF vectorizing gene set data...")
    tfidf_matrix = TfidfVectorizer(max_df=maxdf, min_df=mindf).fit_transform(libdict.values())
    print(tfidf_matrix.shape)
    adata = anndata.AnnData(tfidf_matrix)
    adata.obs.index = libdict.keys()

    print("\tPerforming Leiden clustering...")
    # n_neighbors and min_dist are the main knobs to tune here
    sc.pp.neighbors(adata, n_neighbors=nneighbors)
    sc.tl.leiden(adata, resolution=1.0)
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    # Reorder observations by Leiden label, then prefix each label with 'Cluster '.
    terms_by_cluster = adata.obs.sort_values(by='leiden').index.tolist()
    adata = adata[terms_by_cluster, :]
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    embedding = pd.DataFrame(adata.obsm['X_umap'])
    embedding.columns = ['x', 'y']
    embedding['cluster'] = adata.obs['leiden'].values
    embedding['term'] = adata.obs.index
    embedding['genes'] = [libdict[term] for term in embedding['term']]

    return embedding

In [None]:
def get_scatter_colors(df):
    """Assign a Category20 colour to each distinct value of ``df['cluster']``.

    Colours are taken from the even palette positions first, then the odd
    ones, and wrap around after 20 clusters.

    Returns
    -------
    dict
        Cluster label -> colour, in order of first appearance.
    """
    palette = Category20[20]
    colors = [*palette[::2], *palette[1::2]]
    clusters = pd.unique(df['cluster']).tolist()
    return {cluster: colors[position % 20] for position, cluster in enumerate(clusters)}

def get_scatterplot(scatterdf):
    """Build the Bokeh scatter plot of the gene set embedding.

    Parameters
    ----------
    scatterdf : pandas.DataFrame
        Must provide the columns ``x``, ``y``, ``term`` and ``cluster``.
        The frame is copied and not modified.

    Returns
    -------
    bokeh figure
        A 1000x700 SVG-backed figure with one point per row, coloured by
        cluster, and a hover tooltip showing gene set, coordinates and cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Every tooltip row is its own closed <div>; the inner rows were
    # previously opened but never closed.
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000, 
        height=700, 
        tools=tools_emb
    )

    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # hide tick marks and shrink tick labels to nothing (axis titles are set below)
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt' 

    plot_emb.output_backend = "svg"    
    
    plot_emb.title = 'Gene Sets in the LINCS L1000 CMAP Chemical Perturbation Consensus Signatures Library'
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'
    
    # The renderer returned by scatter() is not needed afterwards.
    plot_emb.scatter(
        'x', 
        'y', 
        size = 4, 
        source = source, 
        color = 'colors'
    )
    
    return plot_emb

In [None]:
# process_scatterplot defaults: nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1
scatter_df = process_scatterplot(
    libdict,
    nneighbors=100,
    mindist=0.1,
    spread=10,
    maxdf=0.1,
    mindf=5,
)

# Build the figure and render it inline in the notebook.
plot = get_scatterplot(scatter_df)
output_notebook()
show(plot)

In [None]:
# Write the plot as a standalone HTML page under scatterdir.
# The page title spells "Perturbation" correctly (it was "Pertubation").
output_file(filename=f"{scatterdir}/l1000chempert.html", title = 'Gene Sets in the LINCS L1000 CMAP Chemical Perturbation Consensus Signatures Library')
save(plot)