In [1]:
import os
import pathlib
import hashlib
import datetime
import json

import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import decoupler
import requests

In [2]:
# Let pandas show up to 300 rows before truncating a displayed frame.
pd.set_option('display.max_rows', 300)

In [3]:
# Select the "retina" figure format for the inline matplotlib backend.
%config InlineBackend.figure_format = "retina"

In [4]:
# Nick needs this because he's using Firefox, which apparently no one uses anymore.
# The IPython HTML class is reached through its module instead of being imported by
# name: later cells bind the name `HTML` to a page template string, and a
# `from IPython.display import HTML` here would overwrite that template whenever
# this cell is re-run.
import IPython.display
from IPython.display import display
display(IPython.display.HTML("<style>.jp-OutputArea-output {display:flex}</style>"))

In [5]:
def sanitize_name(name):
    """Return ``name`` after a fixed series of character substitutions.

    Applied one after another, in this order: spaces become ``_``, asterisks
    are removed, semicolons become ``_and`` and slashes become ``_``.
    """
    substitutions = ((' ', '_'), ('*', ''), (';', '_and'), ('/', '_'))
    cleaned = name
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [6]:
# Fetch the MSigDB resource for human through decoupler and keep the rows of the
# 'hallmark' collection, one row per (geneset, genesymbol) pair.
msigdb = decoupler.get_resource('MSigDB', organism='human')
is_hallmark = msigdb['collection'] == 'hallmark'
hallmark = msigdb.loc[is_hallmark].drop_duplicates(subset=['geneset', 'genesymbol'])

  from .autonotebook import tqdm as notebook_tqdm


**NB!** Some genes appear under their new names in HALLMARK (e.g. `SEPP1` is listed as `SELENOP`, as checked in the next two cells).

In [7]:
# Number of gene-set rows that use the symbol SEPP1.
(hallmark['genesymbol'] == 'SEPP1').sum()

0

In [8]:
# Number of gene-set rows that use the symbol SELENOP.
(hallmark['genesymbol'] == 'SELENOP').sum()

2

In [9]:
# Where the pulmonary-fibrosis gene set is fetched from, the local file it is cached
# in, and the SHA1 the cached contents are compared against.
# NOTE(review): "Assocation" is spelled this way in the URL on purpose as far as this
# notebook is concerned — presumably it mirrors the upstream dataset name; verify
# against the Harmonizome API before "correcting" it.
PF_GENESET_URL = '/'.join([
    'https://maayanlab.cloud/Harmonizome/api/1.0/gene_set',
    'pulmonary+fibrosis',
    'DISEASES+Text-mining+Gene-Disease+Assocation+Evidence+Scores',
])
PF_GENESET_FILE = 'Harmonizome_PF_set.json'
PF_GENESET_SHA1 = '3f3dd4f618e335382982d16e0afae9f2240dcbd6'

In [10]:
# Fetch the PF gene set once and cache it on disk. The response is retrieved and
# checked BEFORE the cache file is opened, so a failed request cannot leave an empty
# or partial file behind that later runs would mistake for a valid cache.
if not os.path.exists(PF_GENESET_FILE):
    response = requests.get(PF_GENESET_URL, timeout=60)
    response.raise_for_status()  # do not cache an HTTP error page as the gene set
    contents = response.text
    with open(PF_GENESET_FILE, 'w') as f:
        f.write(contents)
    print(f'Downloaded PF geneset with sha1 {hashlib.sha1(contents.encode()).hexdigest()}')

# Always read back from disk so the checksum covers what is actually cached.
with open(PF_GENESET_FILE) as f:
    contents = f.read()
if hashlib.sha1(contents.encode()).hexdigest() != PF_GENESET_SHA1:
    # Only a notice: processing continues with whatever the cache file contains.
    print('SHA1 of PF gene set does not match the stored one')

# Keep just the gene symbol of every association in the JSON document.
pf_payload = json.loads(contents)
pf_genes = [x['gene']['symbol'] for x in pf_payload['associations']]

In [11]:
# Append the Harmonizome pulmonary-fibrosis genes to the gene-set table as one extra
# set named 'Harmonizome_PF'. Rows of that set which are already present are dropped
# first and (geneset, genesymbol) pairs are de-duplicated, so re-running this cell
# does not append the same genes a second time. The index is rebuilt because the two
# pieces would otherwise contribute overlapping labels.
pf_geneset = pd.DataFrame(dict(genesymbol=pf_genes, collection='Harmonizome', geneset='Harmonizome_PF'))
hallmark = (
    pd.concat([hallmark[hallmark.geneset.ne('Harmonizome_PF')], pf_geneset])
    .drop_duplicates(['geneset', 'genesymbol'])
    .reset_index(drop=True)
)

In [12]:
# Data directory for this analysis, given relative to the working directory.
DATA = pathlib.Path('../../data') / '31_bal-object'

In [13]:
# Input directory; the main loop expects one sub-directory per cell type in here.
BASE = DATA.joinpath('pseudobulk-gsva')

In [14]:
# Output directory for GSVA scores, metadata and the generated HTML pages.
OUTDIR = DATA.joinpath('gsva')

In [15]:
# Create the output directory (and any missing parents); fine if it already exists.
OUTDIR.mkdir(parents=True, exist_ok=True)

In [16]:
# Clinical table; later cells use its study_code, patient_on_mmf and fvc_pred columns.
clinical_data = pd.read_csv(filepath_or_buffer='../00clinical-v2.csv')

In [18]:
# Sample sheet; its 'Sample' and 'External Sample ID' columns are used below.
samples = pd.read_csv(filepath_or_buffer="../00all-samples.csv")

In [19]:
# Attach the external sample ID to the clinical rows: study_code is matched against
# samples.Sample, and how='left' keeps clinical rows that have no match.
sample_ids = samples.loc[:, ['Sample', 'External Sample ID']]
clinical_data = pd.merge(
    clinical_data,
    sample_ids,
    how='left',
    left_on='study_code',
    right_on='Sample',
)

In [20]:
# Turn the True/False values of patient_on_mmf into the labels 'MMF' / 'Naive';
# values that are not a key of the mapping come out as NaN.
mmf_labels = {True: 'MMF', False: 'Naive'}
clinical_data['mmf'] = clinical_data['patient_on_mmf'].map(mmf_labels)

In [21]:
# Series of fvc_pred values indexed by external sample ID.
sample_to_fvc = clinical_data.set_index('External Sample ID')['fvc_pred']

In [None]:
# Page template for the interactive Morpheus heatmap of a single cell type.
# It is filled with the `%` operator and a mapping that provides `ct`, which is
# substituted into the page title, the heading and the heatmap name. The page loads
# "gsva.tsv" as its dataset and joins "meta.tsv" onto the dataset columns through the
# "External Sample ID" field; both are given as relative paths.
# NOTE(review): this rebinds the name `HTML`, which an earlier cell imports from
# IPython.display, and a later cell rebinds it again to a second template — which
# template `HTML` refers to therefore depends on the order the cells were run in.
HTML = """
<!DOCTYPE html>
<html>
<head>
    <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
    <meta http-equiv="content-type" content="text/html; charset=utf8"/>
    <title>%(ct)s GSVA exploration</title>
    <style type="text/css">
        html {
          margin: 0; padding: 0;
          font-size: 20px; font-family: Helvetica, Verdana, sans-serif;
        }
        body {margin: 0; padding: 10px 10px;}
        a {color: #1385cb}
        a:visited {color: #0e74bc}
        .header {margin-bottom: 10px;}
        .header h2, .header h3 {font-weight: normal; text-align: center; margin: 0 0 6px 0;}
        #morpheus-container {
            width: auto;
            height: auto;
            min-width: 1200px;
            min-height: 800px;
            visibility: hidden;
        }
      </style>
    <link rel="stylesheet" href="https://software.broadinstitute.org/morpheus/css/morpheus-latest.min.css">
    <script>if (typeof module === 'object') {
    window.module = module;
    module = undefined;

    }</script>

    <script type="text/javascript" src="https://software.broadinstitute.org/morpheus/js/morpheus-external-latest.min.js"></script>
    <script src="https://software.broadinstitute.org/morpheus/js/morpheus-latest.min.js"></script>
</head>
<body>
    <div class="header">
        <h2>%(ct)s GSVA scores</h2>
    </div>
    <div id="morpheus-container"></div>

    <script type="text/javascript">
        window.heatmap = new morpheus.HeatMap({
            el: document.querySelector('#morpheus-container'),
            name: "%(ct)s",
            dataset: "gsva.tsv",
            rows: [
                {field: "id", display: ["text"]}
            ],
            columns: [
                {field: "id", display: ["color"]},
                {field: "n_cells", display: ["color"]},
                {field: "Sex", display: ["color"]},
                {field: "Study", display: ["color"]},
                {field: "Chemistry", display: ["color"]},
                {field: "Status", display: ["color"]},
                {field: "fvc_pred", display: ["color"]},
                {field: "mmf", display: ["color"]}
            ],
            columnColorModel: {
                "perturbation_groups": {
                    'Healthy': "#5cd061",
                    'NPC': "#256b33",
                    'SARS-CoV-2' : "#e51d23",
                    'SARS-CoV-2; Gram+' : "#ffc326",
                    'Pseudomonas aeruginosa; SARS-CoV-2': "#9858d6",
                    'Pseudomonas; SARS-CoV-2': "#9858d6",
                    'Gram+': "#013265",
                    'Pseudomonas aeruginosa': "#fb4e93",
                    'Pseudomonas': "#fb4e93",
                    'Gram-*; Gram+': "#1ceaf9",
                    'Gram-*': "#770c2e"
                },
                "pathogen_groups": {
                    'Healthy': "#5cd061",
                    'NPC': "#256b33",
                    'SARS-CoV-2' : "#e51d23",
                    'SARS-CoV-2; Gram+' : "#ffc326",
                    'Pseudomonas aeruginosa; SARS-CoV-2': "#9858d6",
                    'Pseudomonas; SARS-CoV-2': "#9858d6",
                    'Gram+': "#013265",
                    'Pseudomonas aeruginosa': "#fb4e93",
                    'Pseudomonas': "#fb4e93",
                    'Gram-*; Gram+': "#1ceaf9",
                    'Gram-*': "#770c2e"
                },
                "days_on_ventilator": {
                    "values": [0, 10],
                    "colors": ["#f7fbff", "#08306b"]
                },
                "Chemistry": {
                    "10x 3' V2": "#56407f",
                    "10x 3' V3": "#ed6146"
                },
                "mmf": {
                    "Naive": "#227f69",
                    "MMF": "#b54519"
                }
            },
            columnAnnotations: [{
                file: "meta.tsv",
                datasetField: "id",
                fileField: "External Sample ID"
            }],
            /*colorScheme: { // optional color scheme. default is relative
                type: 'fixed',
                map: [{
                    value: -2,
                    color: '#0000ff'
                }, {
                    value: 0,
                    color: '#ffffff'
                }, {
                    value: 2,
                    color: '#ff0000'
                }]
            },*/
            tools: [{ name: "Hierarchical Clustering", params: {cluster: "Rows and columns", background: false} }]
        });
        window.onload = function() {
            /*morpheus.HClusterTool.execute({
                heatMap: window.heatmap,
                project: window.heatmap.getProject(),
                input:
            });*/
            window.setTimeout(function() {
                window.heatmap.fitToWindow({fitRows: true, fitColumns: true, repaint: true});
                window.heatmap.fitToWindow({fitRows: true, fitColumns: true, repaint: true});
                document.getElementById('morpheus-container').style.visibility = 'visible';
            }, 800)
        }
    </script>
</body>
</html>
"""

In [23]:
# Template of the top-level index page. Filled with the `%` operator and a 2-tuple:
# first the "Generated on" timestamp text, then the concatenated table rows.
TABLE_HTML = """
<!DOCTYPE html>
<html>
<head>
  <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
  <meta http-equiv="content-type" content="text/html; charset=utf8"/>
  <style type="text/css">
    html {
      margin: 0; padding: 0;
      font-size: 20px; font-family: Helvetica, Verdana, sans-serif;
    }
    .header {margin-bottom: 10px;}
    .header h2, .header h3 {font-weight: normal; text-align: center; margin: 0 0 6px 0;}
    body {margin: 0; padding: 50px 100px;}
    a {color: #1385cb}
    a:visited {color: #0e74bc}
    table {border-collapse: collapse; border-top: 1px solid #ccc; border-left: 1px solid #ccc;}
    table td, table th {padding: 2px 5px; border-bottom: 1px solid #ccc; border-right: 1px solid #ccc;}
    table td {font-size: 14px;}
  </style>
</head>
<body>
<div class="header">
    <h2>GSVA analysis</h2>
    Generated on %s
</div>
<table>
%s
</table>
</body>
</html>
"""

# Template of one index-table row. Filled with a 2-tuple: the row heading, then the
# link target (href) of the corresponding heatmap page.
TABLE = """
<tr>
    <th>%s</th>
    <td><a href="%s" target="_blank">HALLMARK GSVA scores</a></td>
</tr>
"""

In [24]:
%%time
table = ''
ct_gsva = {}
for ct in sorted(os.listdir(BASE)):
    if not (BASE / ct / 'data' / 'transformed.tsv').exists():
        print(f'{ct} does not have counts')
        continue
    pseudobulk = pd.read_table(BASE / ct / 'data' / 'transformed.tsv', delim_whitespace=True).T
    metadata = pd.read_csv(BASE / ct / 'data' / f'{ct}-meta.csv', index_col=0)
    metadata.index = metadata.index.str.replace(r'\_[ATCG]+$', '', regex=True)
    if metadata.shape[0] < 10:
        print(f'{ct} has less than 10 samples')
        continue
    if (OUTDIR / ct / 'gsva.tsv').exists():
        gsva = pd.read_table(OUTDIR / ct / 'gsva.tsv', index_col=0)
    else:
        try:
            gsva = decoupler.run_gsva(
                pseudobulk,
                net=hallmark,
                source='geneset',
                target='genesymbol',
                kcdf=True,
                seed=1066
            ).T
        except:
            print(f'{ct} had error in GSVA estimation')
            continue
    metadata = metadata.merge(
        clinical_data[['External Sample ID', 'fvc_pred', 'mmf']],
        on='External Sample ID',
        how='left'
    )
    # Copy value for SSc8 from SSc6 (repeat sample)
    metadata.loc[metadata['External Sample ID'].eq('SSc_SSc8'), 'mmf'] = 'Naive'
    gsva.index = gsva.index.str.replace('HALLMARK_', '')
    ct_gsva[ct] = (gsva, metadata)
    os.makedirs(OUTDIR / ct, exist_ok=True)
    gsva.to_csv(OUTDIR / ct / 'gsva.tsv', sep='\t')
    metadata.to_csv(OUTDIR / ct / 'meta.tsv', sep='\t')
    html = HTML % dict(
        ct=ct,
    )
    table += TABLE % (
        ct, ct
    )
    with open(OUTDIR / ct / 'index.html', 'w') as f:
        f.write(html)
html = TABLE_HTML % (
    datetime.datetime.now().strftime('%d %b %Y, %I:%M%p'),
    table
)
with open(OUTDIR / 'index.html', 'w') as out:
    out.write(html)

.DS_Store does not have counts
._.DS_Store does not have counts
Activated_monocytes does not have counts
CCR7+_DC has less than 10 samples
CD8_TEM_cells has less than 10 samples
Ciliated_cells does not have counts
Mast_cells does not have counts
Proliferating_T_cells does not have counts
Secretory_cells does not have counts
gd_NKT_cells has less than 10 samples
pDC does not have counts
CPU times: user 2.06 s, sys: 119 ms, total: 2.18 s
Wall time: 6.92 s


In [25]:
# Collect GSVA scores and metadata of every cell type whose name starts with 'MoAM'
# or 'Mono', for one combined heatmap. Score columns get the cell type as a prefix
# and the metadata gets a `cell_type` column.
# Prefixed / annotated COPIES are collected: the frames stored in `ct_gsva` stay
# untouched, so re-running this cell does not stack the prefix a second time.
moam_gsva = []
moam_meta = []
for ct, (gsva, meta) in ct_gsva.items():
    if not ct.startswith(('MoAM', 'Mono')):
        continue
    moam_gsva.append(gsva.add_prefix(ct + '_'))
    moam_meta.append(meta.assign(cell_type=ct))

In [26]:
# Combine the collected pieces: score frames side by side, metadata frames stacked.
# NOTE: the lists are replaced by DataFrames under the same names, so the cell that
# builds the lists has to be re-run before this cell can be run again.
moam_gsva = pd.concat(moam_gsva, axis='columns')
moam_meta = pd.concat(moam_meta, axis='index')

In [27]:
# Build "<cell type>_<external sample ID>" ids, the same scheme used for the
# prefixed score columns.
moam_meta['id'] = moam_meta['cell_type'] + '_' + moam_meta['External Sample ID']

In [None]:
# Page template for the combined Morpheus heatmap. Compared with the per-cell-type
# template defined earlier under the same name, it additionally shows a "cell_type"
# column annotation and joins "meta.tsv" onto the dataset columns through the "id"
# field instead of "External Sample ID". Filled with the `%` operator and a mapping
# that provides `ct`.
# NOTE(review): this rebinds `HTML`; once this cell has run, re-running the
# per-cell-type loop renders its pages with THIS template unless the earlier
# template cell is executed again first.
HTML = """
<!DOCTYPE html>
<html>
<head>
    <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
    <meta http-equiv="content-type" content="text/html; charset=utf8"/>
    <title>%(ct)s GSVA exploration</title>
    <style type="text/css">
        html {
          margin: 0; padding: 0;
          font-size: 20px; font-family: Helvetica, Verdana, sans-serif;
        }
        body {margin: 0; padding: 10px 10px;}
        a {color: #1385cb}
        a:visited {color: #0e74bc}
        .header {margin-bottom: 10px;}
        .header h2, .header h3 {font-weight: normal; text-align: center; margin: 0 0 6px 0;}
        #morpheus-container {
            width: auto;
            height: auto;
            min-width: 1200px;
            min-height: 800px;
            visibility: hidden;
        }
      </style>
    <link rel="stylesheet" href="https://software.broadinstitute.org/morpheus/css/morpheus-latest.min.css">
    <script>if (typeof module === 'object') {
    window.module = module;
    module = undefined;

    }</script>

    <script type="text/javascript" src="https://software.broadinstitute.org/morpheus/js/morpheus-external-latest.min.js"></script>
    <script src="https://software.broadinstitute.org/morpheus/js/morpheus-latest.min.js"></script>
</head>
<body>
    <div class="header">
        <h2>%(ct)s GSVA scores</h2>
    </div>
    <div id="morpheus-container"></div>

    <script type="text/javascript">
        window.heatmap = new morpheus.HeatMap({
            el: document.querySelector('#morpheus-container'),
            name: "%(ct)s",
            dataset: "gsva.tsv",
            rows: [
                {field: "id", display: ["text"]}
            ],
            columns: [
                {field: "id", display: ["color"]},
                {field: "n_cells", display: ["color"]},
                {field: "Sex", display: ["color"]},
                {field: "Study", display: ["color"]},
                {field: "Chemistry", display: ["color"]},
                {field: "Status", display: ["color"]},
                {field: "fvc_pred", display: ["color"]},
                {field: "mmf", display: ["color"]},
                {field: "cell_type", display: ["color", "text"]}
            ],
            columnColorModel: {
                "perturbation_groups": {
                    'Healthy': "#5cd061",
                    'NPC': "#256b33",
                    'SARS-CoV-2' : "#e51d23",
                    'SARS-CoV-2; Gram+' : "#ffc326",
                    'Pseudomonas aeruginosa; SARS-CoV-2': "#9858d6",
                    'Pseudomonas; SARS-CoV-2': "#9858d6",
                    'Gram+': "#013265",
                    'Pseudomonas aeruginosa': "#fb4e93",
                    'Pseudomonas': "#fb4e93",
                    'Gram-*; Gram+': "#1ceaf9",
                    'Gram-*': "#770c2e"
                },
                "pathogen_groups": {
                    'Healthy': "#5cd061",
                    'NPC': "#256b33",
                    'SARS-CoV-2' : "#e51d23",
                    'SARS-CoV-2; Gram+' : "#ffc326",
                    'Pseudomonas aeruginosa; SARS-CoV-2': "#9858d6",
                    'Pseudomonas; SARS-CoV-2': "#9858d6",
                    'Gram+': "#013265",
                    'Pseudomonas aeruginosa': "#fb4e93",
                    'Pseudomonas': "#fb4e93",
                    'Gram-*; Gram+': "#1ceaf9",
                    'Gram-*': "#770c2e"
                },
                "days_on_ventilator": {
                    "values": [0, 10],
                    "colors": ["#f7fbff", "#08306b"]
                },
                "Chemistry": {
                    "10x 3' V2": "#56407f",
                    "10x 3' V3": "#ed6146"
                },
                "mmf": {
                    "Naive": "#227f69",
                    "MMF": "#b54519"
                }
            },
            columnAnnotations: [{
                file: "meta.tsv",
                datasetField: "id",
                fileField: "id"
            }],
            /*colorScheme: { // optional color scheme. default is relative
                type: 'fixed',
                map: [{
                    value: -2,
                    color: '#0000ff'
                }, {
                    value: 0,
                    color: '#ffffff'
                }, {
                    value: 2,
                    color: '#ff0000'
                }]
            },*/
            tools: [{ name: "Hierarchical Clustering", params: {cluster: "Rows and columns", background: false} }]
        });
        window.onload = function() {
            /*morpheus.HClusterTool.execute({
                heatMap: window.heatmap,
                project: window.heatmap.getProject(),
                input:
            });*/
            window.setTimeout(function() {
                window.heatmap.fitToWindow({fitRows: true, fitColumns: true, repaint: true});
                window.heatmap.fitToWindow({fitRows: true, fitColumns: true, repaint: true});
                document.getElementById('morpheus-container').style.visibility = 'visible';
            }, 800)
        }
    </script>
</body>
</html>
"""

In [29]:
# Write the combined MoAM + Mono heatmap (scores, metadata, page) to OUTDIR/all-moam
# and regenerate the top-level index so that it links to it.
moam_dir = OUTDIR / 'all-moam'
os.makedirs(moam_dir, exist_ok=True)
moam_gsva.to_csv(moam_dir / 'gsva.tsv', sep='\t')
moam_meta.to_csv(moam_dir / 'meta.tsv', sep='\t')
html = HTML % dict(
    ct='All MoAM + Mono',
)
with open(moam_dir / 'index.html', 'w') as f:
    f.write(html)

# Append the index row only once, so re-running this cell does not list the combined
# heatmap several times.
moam_row = TABLE % (
    'All MoAM + Mono', 'all-moam'
)
if moam_row not in table:
    table += moam_row
html = TABLE_HTML % (
    datetime.datetime.now().strftime('%d %b %Y, %I:%M%p'),
    table
)
with open(OUTDIR / 'index.html', 'w') as out:
    out.write(html)