  # BioGPS (Human Cell Line)

  Author: Moshe Silverstein <br/>
  Date: 8-17 <br/>
  Data Source: http://biogps.org/downloads/

  Reviewer: Charles Dai <br>
  Updated: 6-20

In [1]:
# appyter init
# The lambda is a zero-argument callable (its default argument is the builtin
# `globals`) that returns this notebook's globals() when appyter invokes it.
# Presumably this is what makes the %%appyter cell magics used below available.
from appyter import magic
magic.init(lambda _=globals: _())

In [2]:
import sys
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import harmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [3]:
# from clustergrammer_widget import *
# net = Network(clustergrammer_widget)

In [4]:
%load_ext autoreload
%autoreload 2

 ### Python Version

In [5]:
# Display the interpreter version string the notebook is running under.
sys.version

'3.8.0 (default, Oct 28 2019, 16:14:01) \n[GCC 8.3.0]'

 # Initialization

 ### Choose Database

In [6]:
%%appyter code_eval

{# Form field selecting which BioGPS data set is processed. Each choice maps a
   human-readable label to a quoted Python string literal, which later template
   cells splice into code through the `database` variable.
   NOTE(review): the default here is the U133A Gene Atlas, whereas the default
   `data_set` file in the Load Data cell is NCI60_U133A_20070815.raw.csv.
   Confirm that these two defaults are meant to pair up. #}
{% set database = ChoiceField(
    name='database',
    label='Attributes',
    choices={
        'Human U133A/GNF1H Gene Atlas': "'U133A'",
        'Human NCI60 Cell Lines': "'NCI60'"
    },
    default='Human U133A/GNF1H Gene Atlas',
    section='data'
) %}

 ### Load Mapping Dictionaries

In [None]:
# Fetch the two mapping dictionaries used later: `symbol_lookup` is passed to
# uf.mapgenesymbols and `geneid_lookup` to uf.createGeneList.
symbol_lookup, geneid_lookup = lookup.get_lookups()

 ### Output Path

In [8]:
%%appyter code_exec

# Prefix shared by every output file name, suffixed with the selected database.
output_name = 'bioGPS-' + {{database}}

# Directory that all outputs of this run are written to.
path = 'Output/BioGPS-' + {{database}}
# exist_ok=True replaces the check-then-create pair: it is safe to re-run the
# cell and avoids the race between os.path.exists() and os.makedirs().
os.makedirs(path, exist_ok=True)

```python
output_name = 'bioGPS-' + 'U133A'
path = 'Output/BioGPS-' + 'U133A'
if not os.path.exists(path):
    os.makedirs(path)
```

In [9]:
%%appyter hide_code
{# Declares the 'data' form section that the fields with section='data' are
   grouped under. The tag must close with exactly `%}`: a surplus `}` is emitted
   verbatim into the rendered cell. #}
{% do SectionField(
    name='data',
    title='Load Data',
    subtitle='Upload Files from the BioGPS Data Sets',
) %}

```python

}
```

 # Load Data

In [11]:
%%appyter code_exec

# Load the expression matrix from the selected CSV file (name must match
# `.*\.csv$`); the first column of the file becomes the row index.
matrix = pd.read_csv({{FileField(
    constraint='.*\.csv$',
    name='data_set', 
    label='Data Set', 
    default='Input/BioGPS/NCI60_U133A_20070815.raw.csv',
    section='data')
}}, sep=',', index_col=0)

```python

matrix = pd.read_csv('Input/BioGPS/NCI60_U133A_20070815.raw.csv', sep=',', index_col=0)
```

In [12]:
# Preview the first rows of the raw matrix as loaded.
matrix.head()

Unnamed: 0,huh-7,huh-7.1,HEK293,HEK293.1,HL60,HL60.1,HEK 293T,HEK 293T.1,DU145,ACHN,...,NCI H226.1,U87,U87.1,U118,U118.1,U138,U138.1,HEK 293 T-rex,HEK 293 T-rex .1,HT29
1007_s_at,31.727,55.551,360.731,334.639,33.483,49.691,50.638,55.918,346.27,309.175,...,184.953,211.477,210.273,242.274,302.585,320.841,274.376,164.18,162.016,258.504
1053_at,402.633,441.622,557.782,869.346,336.006,114.442,662.357,435.929,593.534,556.449,...,554.368,280.474,303.285,298.215,373.033,474.686,447.062,744.322,679.515,624.654
117_at,9.968,10.178,9.565,8.118,13.902,17.116,10.039,8.978,7.896,7.811,...,10.875,8.442,8.697,8.862,8.054,7.47,8.098,18.473,11.431,9.929
121_at,117.051,115.552,117.618,111.873,118.448,124.955,112.094,109.924,107.586,14698.486,...,111.738,111.995,110.149,114.154,110.82,213.564,229.69,124.694,118.34,116.772
1255_g_at,4.341,4.278,4.32,4.062,4.525,4.38,4.262,4.124,4.034,3.857,...,4.17,4.115,3.959,4.116,4.031,3.851,3.911,4.254,4.501,4.498


In [13]:
# (rows, columns) of the raw matrix.
matrix.shape

(22283, 108)

 ## Load Probe Annotations

In [15]:
%%appyter code_exec

# Load the probe annotation table, keeping only 'Probe Set ID' (used as the
# index) and 'Gene Symbol'.
# skiprows=25 drops the lines ahead of the column header, presumably the
# annotation file's comment preamble. TODO confirm if the annotation release changes.
gene_meta = pd.read_csv({{FileField(
    constraint='.*\.csv$',
    name='gene_meta', 
    label='Probe Annotations', 
    default='Input/BioGPS/HG-U133A.na36.annot.csv',
    section='data')
}}, sep=',', skiprows=25, 
    usecols=['Probe Set ID', 'Gene Symbol'], 
    index_col=0)

```python
gene_meta = pd.read_csv('Input/BioGPS/HG-U133A.na36.annot.csv', sep=',', skiprows=25,
    usecols=['Probe Set ID', 'Gene Symbol'],
    index_col=0)
```

In [16]:
# Preview the probe-to-gene-symbol annotation table.
gene_meta.head()

Unnamed: 0_level_0,Gene Symbol
Probe Set ID,Unnamed: 1_level_1
1007_s_at,DDR1
1053_at,RFC2
117_at,HSPA6
121_at,PAX8
1255_g_at,GUCA1A


In [17]:
# (rows, columns) of the annotation table.
gene_meta.shape

(22283, 1)

 # Pre-process Data

 ## Map Gene to Row

In [20]:
# Look up the gene symbol of every probe ID currently indexing the matrix
# (probes absent from the annotation table come back as NaN), then use those
# symbols as the new, unnamed row index.
probe_symbols = gene_meta['Gene Symbol'].reindex(matrix.index)
matrix.index = probe_symbols
matrix.index.name = None
matrix.head()

Unnamed: 0,huh-7,huh-7.1,HEK293,HEK293.1,HL60,HL60.1,HEK 293T,HEK 293T.1,DU145,ACHN,...,NCI H226.1,U87,U87.1,U118,U118.1,U138,U138.1,HEK 293 T-rex,HEK 293 T-rex .1,HT29
DDR1,31.727,55.551,360.731,334.639,33.483,49.691,50.638,55.918,346.27,309.175,...,184.953,211.477,210.273,242.274,302.585,320.841,274.376,164.18,162.016,258.504
RFC2,402.633,441.622,557.782,869.346,336.006,114.442,662.357,435.929,593.534,556.449,...,554.368,280.474,303.285,298.215,373.033,474.686,447.062,744.322,679.515,624.654
HSPA6,9.968,10.178,9.565,8.118,13.902,17.116,10.039,8.978,7.896,7.811,...,10.875,8.442,8.697,8.862,8.054,7.47,8.098,18.473,11.431,9.929
PAX8,117.051,115.552,117.618,111.873,118.448,124.955,112.094,109.924,107.586,14698.486,...,111.738,111.995,110.149,114.154,110.82,213.564,229.69,124.694,118.34,116.772
GUCA1A,4.341,4.278,4.32,4.062,4.525,4.38,4.262,4.124,4.034,3.857,...,4.17,4.115,3.959,4.116,4.031,3.851,3.911,4.254,4.501,4.498


 ## Revert Duplicate Column Names

In [21]:
# pandas de-duplicates repeated CSV headers by appending '.1', '.2', ... to them.
# Strip only that trailing numeric suffix (anchored at the end of the name) so
# dots elsewhere in a sample name survive, and trim surrounding whitespace so
# that e.g. 'HEK 293 T-rex .1' collapses to 'HEK 293 T-rex' rather than
# 'HEK 293 T-rex ', which would not be recognised as a duplicate column when
# columns are merged later.
matrix.columns = (
    matrix.columns
    .str.replace(r'\.\d+$', '', regex=True)
    .str.strip()
)
matrix.head()

Unnamed: 0,huh-7,huh-7.1,HEK293,HEK293.1,HL60,HL60.1,HEK 293T,HEK 293T.1,DU145,ACHN,...,NCI H226,U87,U87.1,U118,U118.1,U138,U138.1,HEK 293 T-rex,HEK 293 T-rex.1,HT29
DDR1,31.727,55.551,360.731,334.639,33.483,49.691,50.638,55.918,346.27,309.175,...,184.953,211.477,210.273,242.274,302.585,320.841,274.376,164.18,162.016,258.504
RFC2,402.633,441.622,557.782,869.346,336.006,114.442,662.357,435.929,593.534,556.449,...,554.368,280.474,303.285,298.215,373.033,474.686,447.062,744.322,679.515,624.654
HSPA6,9.968,10.178,9.565,8.118,13.902,17.116,10.039,8.978,7.896,7.811,...,10.875,8.442,8.697,8.862,8.054,7.47,8.098,18.473,11.431,9.929
PAX8,117.051,115.552,117.618,111.873,118.448,124.955,112.094,109.924,107.586,14698.486,...,111.738,111.995,110.149,114.154,110.82,213.564,229.69,124.694,118.34,116.772
GUCA1A,4.341,4.278,4.32,4.062,4.525,4.38,4.262,4.124,4.034,3.857,...,4.17,4.115,3.959,4.116,4.031,3.851,3.911,4.254,4.501,4.498


 ## Save Unfiltered Matrix to file

In [None]:
# Persist the symbol-indexed matrix before any filtering, gzip-compressed.
# NOTE(review): dtype=np.float32 is presumably the dtype the values are cast to
# on save; confirm against uf.saveData.
uf.saveData(matrix, path, output_name + '_matrix_unfiltered',
            compression='gzip', dtype=np.float32)

 # Filter Data

 ## Remove Data that is More Than 95% Missing and Impute Missing Data

In [22]:
# Per the section heading: drop data that is more than 95% missing and impute
# the remaining gaps. The threshold and imputation rule live in
# uf.removeAndImpute. Rebinds `matrix`, so every later cell sees the filtered frame.
matrix = uf.removeAndImpute(matrix)
matrix.head()

KeyboardInterrupt: 

In [None]:
# (rows, columns) after the remove-and-impute step.
matrix.shape

 ## Map Gene Symbols to Up-to-date Approved Gene Symbols

In [None]:
# Per the section heading: translate row labels to up-to-date approved gene
# symbols using `symbol_lookup`.
# NOTE(review): presumably rows without a mapping are dropped; the shape
# displayed below lets you verify.
matrix = uf.mapgenesymbols(matrix, symbol_lookup)
matrix.shape

 ## Merge Duplicate Genes By Rows and Duplicate Columns

In [None]:
# Collapse duplicate labels using the 'mean' method: first along rows
# (several probes mapping to one gene), then along columns (replicate samples
# sharing a name).
matrix = uf.merge(matrix, 'row', 'mean')
matrix = uf.merge(matrix, 'column', 'mean')
matrix.shape

 ## Log2 Transform

In [None]:
# Log2-transform the matrix.
# NOTE(review): how zero or negative values are handled is up to uf.log2; confirm.
matrix = uf.log2(matrix)
matrix.head()

 ## Normalize Matrix (Quantile Normalize the Matrix by Column)

In [None]:
# Per the section heading: quantile-normalize the matrix by column.
matrix = uf.quantileNormalize(matrix)
matrix.head()

 ## Normalize Matrix (Z-Score the Rows)

In [None]:
# Z-score along the 'row' axis, i.e. each gene is standardised across samples
# (per the section heading).
matrix = uf.zscore(matrix, 'row')
matrix.head()

 ## Histogram of First Sample

In [None]:
# Distribution of the processed values in the first column (sample), 100 bins.
matrix.iloc[:, 0].hist(bins=100)

 ## Histogram of First Gene

In [None]:
# Distribution of the processed values in the first row (gene), 100 bins.
matrix.iloc[0, :].hist(bins=100)

 ## Save Filtered Matrix

In [None]:
# Persist the filtered, normalised matrix as a gzip-compressed TSV.
uf.saveData(matrix, path, output_name + '_matrix_filtered', 
            ext='tsv', compression='gzip')

 # Analyze Data

 ## Create Gene List

In [None]:
# Build the gene list from the matrix with the help of `geneid_lookup`.
# NOTE(review): presumably this pairs each gene symbol with its gene ID; confirm
# against uf.createGeneList.
gene_list = uf.createGeneList(matrix, geneid_lookup)
gene_list.head()

In [None]:
# (rows, columns) of the gene list.
gene_list.shape

In [None]:
# Persist the gene list as a gzip-compressed TSV; index=False is forwarded to
# uf.saveData, presumably to leave the frame's index out of the file.
uf.saveData(gene_list, path, output_name + '_gene_list',
            ext='tsv', compression='gzip', index=False)

 ## Create Attribute List

In [None]:
# Build the attribute list from the matrix. NOTE(review): presumably derived
# from the matrix columns (the samples); confirm against uf.createAttributeList.
attribute_list = uf.createAttributeList(matrix)
attribute_list.head()

In [None]:
# (rows, columns) of the attribute list.
attribute_list.shape

In [None]:
# Persist the attribute list as a gzip-compressed TSV.
uf.saveData(attribute_list, path, output_name + '_attribute_list',
            ext='tsv', compression='gzip')

 ## Create Matrix of Standardized Values (values between -1 and 1)

In [None]:
# Derive the standardized matrix; per the section heading its values lie
# between -1 and 1. The transformation itself is defined in
# uf.createStandardizedMatrix.
standard_matrix = uf.createStandardizedMatrix(matrix)
standard_matrix.head()

In [None]:
# Persist the standardized matrix as a gzip-compressed TSV.
uf.saveData(standard_matrix, path, output_name + '_standard_matrix',
            ext='tsv', compression='gzip')

 ## Plot of a Single Cell Type, Normalized Value vs. Standardized Value

In [None]:
# Scatter the first column of the normalised matrix against the first column
# of the standardized matrix to show how the two value scales relate.
first_attribute = standard_matrix.columns[0]
fig, ax = plt.subplots()
ax.plot(matrix[matrix.columns[0]], standard_matrix[first_attribute], 'bo')
ax.set_xlabel('Normalized Values')
ax.set_ylabel('Standardized Values')
ax.set_title(first_attribute)
ax.grid(True)

 ## Create Ternary Matrix

In [None]:
# Derive the ternary matrix from the standardized matrix.
# NOTE(review): presumably entries are -1 / 0 / 1 (down / neutral / up); the
# thresholding rule is defined in uf.createTernaryMatrix.
ternary_matrix = uf.createTernaryMatrix(standard_matrix)
ternary_matrix.head()

In [None]:
# Persist the ternary matrix as a gzip-compressed TSV.
uf.saveData(ternary_matrix, path, output_name + '_ternary_matrix',
            ext='tsv', compression='gzip')

 ## Create Gene and Attribute Set Libraries

In [None]:
# Write the "up" gene set library built from the ternary matrix into `path`.
uf.createUpGeneSetLib(ternary_matrix, path, output_name + '_gene_up_set')

In [None]:
# Write the "down" gene set library built from the ternary matrix into `path`.
uf.createDownGeneSetLib(ternary_matrix, path, output_name + '_gene_down_set')

In [None]:
# Write the "up" attribute set library built from the ternary matrix into `path`.
uf.createUpAttributeSetLib(ternary_matrix, path, 
                           output_name + '_attribute_up_set')

In [None]:
# Write the "down" attribute set library built from the ternary matrix into `path`.
uf.createDownAttributeSetLib(ternary_matrix, path, 
                             output_name + '_attribute_down_set')

 ## Create Attribute Similarity Matrix

In [None]:
# Cosine similarity between attributes: the matrix is transposed first so that
# the attributes (its columns) are the items being compared.
attribute_similarity_matrix = uf.createSimilarityMatrix(matrix.T, 'cosine')
attribute_similarity_matrix.head()

In [None]:
# Persist the attribute similarity matrix with 'npz' compression.
# NOTE(review): symmetric=True and dtype=np.float32 are forwarded to
# uf.saveData, presumably to exploit symmetry and reduce precision for a
# smaller file; confirm against the helper.
uf.saveData(attribute_similarity_matrix, path,
            output_name + '_attribute_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

In [None]:
# net.load_df(attribute_similarity_matrix.iloc[:,:].copy())
# net.filter_N_top('row', rank_type='sum', N_top=300)
# net.cluster()
# net.widget()

 ## Create Gene Similarity Matrix

In [None]:
# Cosine similarity between genes: the matrix is passed untransposed, so its
# rows (genes) are the items being compared.
gene_similarity_matrix = uf.createSimilarityMatrix(matrix, 'cosine')
gene_similarity_matrix.head()

In [None]:
# Persist the gene similarity matrix with 'npz' compression.
# NOTE(review): symmetric=True and dtype=np.float32 are forwarded to
# uf.saveData, presumably to exploit symmetry and reduce precision for a
# smaller file; confirm against the helper.
uf.saveData(gene_similarity_matrix, path, 
            output_name + '_gene_similarity_matrix',
            compression='npz', symmetric=True, dtype=np.float32)

 ## Create Gene-Attribute Edge List

In [None]:
# Write the gene-attribute edge list into `path`, built from the standardized
# matrix together with the attribute and gene lists created above.
uf.createGeneAttributeEdgeList(standard_matrix, attribute_list, gene_list, 
                               path, output_name + '_gene_attribute_edge_list')

 # Create Downloadable Save File

In [None]:
# Bundle the contents of the output directory into a downloadable archive.
# NOTE(review): the link below expects ./output_archive.zip; confirm that
# uf.createArchive writes to that location.
uf.createArchive(path)

 ### Link to download output files: [click here](./output_archive.zip)