In [1]:
import numpy as np
import pandas as pd

In [2]:
def serialize(df, name, how='columns'):
    """Write ``df`` to ``<name>.js`` as a JavaScript variable assignment.

    The file is created in (or overwrites an existing file in) the current
    working directory and contains a single line of the form
    ``var <name> = <json>;``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to export; it is converted with ``df.to_json(orient=how)``.
    name : str
        Used both as the output file stem and as the JavaScript variable name.
    how : str, default 'columns'
        Passed through as the ``orient`` argument of ``DataFrame.to_json``.

    Returns
    -------
    None
    """
    # The context manager guarantees the handle is flushed and closed even if
    # to_json() or write() raises; a bare open(...).write(...) leaves that to
    # the garbage collector.
    with open(f"{name}.js", 'w') as js_file:
        js_file.write(f"var {name} = {df.to_json(orient=how)};\n")

## Differential Proteomics

In [8]:
# Spreadsheet with one row per protein (Uniprot, Protein, log2FC, score, FDR,
# log_oddsDE and Symbol columns).
protein_fc_workbook = '20170323_ALS_CTR_iMNs_protein_log2FC.xls'
diff_proteins = pd.read_excel(protein_fc_workbook)
diff_proteins.head()

Unnamed: 0,Uniprot,Protein,log2FC,score,FDR,log_oddsDE,Symbol
0,Q9NX47,MARH5_HUMAN,-0.148362,0.000435,0.317977,-7.74037,MARCH5
1,Q15019,SEPT2_HUMAN,-0.993235,1.0,0.0,184.098,SEPT2
2,Q9UH03,SEPT3_HUMAN,-0.785533,1.0,0.0,53.9605,SEPT3
3,Q99719,SEPT5_HUMAN,0.118518,0.00054,0.313857,-7.52392,SEPT5
4,Q14141,SEPT6_HUMAN,-0.619176,0.996636,0.000119,5.69116,SEPT6


In [9]:
# Index by gene symbol, keep only the statistics columns, turn blank cells
# into NaN, shorten the column names and drop rows missing logFC or q.
diff_proteins = (
    diff_proteins
    .set_index('Symbol')
    .loc[:, ['log2FC', 'FDR', 'score', 'log_oddsDE']]
    .replace(' ', np.nan)
    .rename(columns={'log2FC': 'logFC', 'FDR': 'q'})
    .dropna(subset=['logFC', 'q'], how='any')
)
# When a symbol occurs more than once, only its first row is retained.
is_repeat_symbol = diff_proteins.index.duplicated(keep='first')
diff_proteins = diff_proteins[~is_repeat_symbol]

# Replace q by -log10(q). A q of exactly 0 becomes +inf, so the values are
# clipped to the largest finite value (and to 0 from below) before rounding.
neg_log10_q = -np.log10(diff_proteins['q'])
human_max_finite_log_qVal = np.max(neg_log10_q[np.isfinite(neg_log10_q)])
diff_proteins['q'] = np.around(
    np.clip(neg_log10_q, 0, human_max_finite_log_qVal),
    decimals=2,
)
print("max q-val: " + str(human_max_finite_log_qVal))

# Round every remaining column to two decimals to keep the export small.
diff_proteins = diff_proteins.round(2)
diff_proteins.head()

max q-val: 18.42829116819131


  


Unnamed: 0_level_0,logFC,q,score,log_oddsDE
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MARCH5,-0.15,0.5,0.0,-7.74
SEPT2,-0.99,18.43,1.0,184.1
SEPT3,-0.79,18.43,1.0,53.96
SEPT5,0.12,0.5,0.0,-7.52
SEPT6,-0.62,3.92,1.0,5.69


In [10]:
# Export only the fold-change and transformed q columns, keyed by symbol.
serialize(df=diff_proteins.loc[:, ['logFC', 'q']], name='diff_proteins', how='index')

## Differential Transcriptomics

In [24]:
# CSV with one row per gene symbol (ID, log ratio, p-value, q-value and
# Entrez gene name columns).
transcript_de_csv = 'ALSvsCtrl_DEseq_iMN_18JAN2016_converted_symbols.csv'
diff_transcripts = pd.read_csv(transcript_de_csv)
diff_transcripts.head()

Unnamed: 0,Symbol,ID,Exp Log Ratio,Exp p-value,Exp False Discovery Rate (q-value),Entrez Gene Name
0,A1BG,ENSG00000121410,-0.168,0.608,0.919,alpha-1-B glycoprotein
1,A1BG-AS1,ENSG00000268895,0.348,0.265,0.736,A1BG antisense RNA 1
2,A1CF,ENSG00000148584,-0.159,0.842,,APOBEC1 complementation factor
3,A2M,ENSG00000175899,-0.259,0.685,0.944,alpha-2-macroglobulin
4,A2M-AS1,ENSG00000245105,-1.151,0.0847,0.48,A2M antisense RNA 1 (head to head)


In [25]:
# Keep the symbol plus the three statistics under short column names, turn
# blank cells into NaN, drop rows without a q-value and index by symbol.
diff_transcripts = (
    diff_transcripts
    .loc[:, ['Symbol', 'Exp Log Ratio', 'Exp p-value', 'Exp False Discovery Rate (q-value)']]
    .rename(columns={
        'Exp Log Ratio': 'logFC',
        'Exp p-value': 'p',
        'Exp False Discovery Rate (q-value)': 'q',
    })
    .replace(' ', np.nan)
    .dropna(subset=['q'], how='all')
    .set_index('Symbol')
)
# When a symbol occurs more than once, only its first row is retained.
is_repeat_symbol = diff_transcripts.index.duplicated(keep='first')
diff_transcripts = diff_transcripts[~is_repeat_symbol]

# Make sure all three statistics are numeric before doing arithmetic on them.
diff_transcripts = diff_transcripts.assign(
    logFC=pd.to_numeric(diff_transcripts['logFC']),
    p=pd.to_numeric(diff_transcripts['p']),
    q=pd.to_numeric(diff_transcripts['q']),
)

# Replace q by -log10(q). A q of exactly 0 becomes +inf, so the values are
# clipped to the largest finite value (and to 0 from below) before rounding.
neg_log10_q = -np.log10(diff_transcripts['q'])
human_max_finite_log_qVal = np.max(neg_log10_q[np.isfinite(neg_log10_q)])
diff_transcripts['q'] = np.around(
    np.clip(neg_log10_q, 0, human_max_finite_log_qVal),
    decimals=2,
)
print("max q-val: " + str(human_max_finite_log_qVal))

# Round every column to two decimals to keep the export small.
diff_transcripts = diff_transcripts.round(2)
diff_transcripts.head()

max q-val: 30.221848749616356


Unnamed: 0_level_0,logFC,p,q
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A1BG,-0.17,0.61,0.04
A1BG-AS1,0.35,0.26,0.13
A2M,-0.26,0.68,0.03
A2M-AS1,-1.15,0.08,0.32
A2ML1,-1.98,0.01,0.74


In [26]:
# Export only the fold-change and transformed q columns, keyed by symbol.
serialize(df=diff_transcripts.loc[:, ['logFC', 'q']], name='diff_transcripts', how='index')

## Proteomics

In [5]:
# Tab-separated table: a Symbol column followed by one abundance column per sample.
protein_abundance_tsv = './protein_abundances_JL.tsv'
protein = pd.read_csv(protein_abundance_tsv, sep='\t')
protein.head()

Unnamed: 0,Symbol,28iALS_1,28iALS_2,29iALS_1,29iALS_2,30iALS_1,52iALS_1,52iALS_2,00iCTR_1,00iCTR_2,25iCTR_1_A,25iCTR_1_B,25iCTR_2,83iCTR_1,83iCTR_2
0,UBA6,1515.96,1325.49,2046.09,2373.03,3139.98,1888.97,1133.23,2428.8,2369.05,3298.28,3562.29,4558.27,2698.67,2410.73
1,ESYT2,0.0,0.0,166.46,245.38,140.69,0.0,0.0,0.0,0.0,88.14,81.26,0.0,92.94,24.49
2,SHTN1,0.0,0.0,0.0,64.01,206.67,0.0,179.03,93.06,70.56,329.18,274.13,758.07,1469.21,1535.94
3,GXYLT2,45.16,0.0,220.66,135.43,0.0,556.94,390.97,40.96,37.02,22.63,0.0,0.0,0.0,25.72
4,ILVBL,0.0,200.78,670.33,765.68,493.34,426.52,122.09,355.28,155.03,690.57,623.43,1104.38,645.16,752.21


In [6]:
# Show every row whose Symbol appears more than once (all occurrences, not just the repeats).
protein.loc[protein['Symbol'].duplicated(keep=False)]

Unnamed: 0,Symbol,28iALS_1,28iALS_2,29iALS_1,29iALS_2,30iALS_1,52iALS_1,52iALS_2,00iCTR_1,00iCTR_2,25iCTR_1_A,25iCTR_1_B,25iCTR_2,83iCTR_1,83iCTR_2
1142,TMPO,97.56,277.71,421.52,516.68,622.26,915.61,492.38,821.63,743.06,796.69,604.21,646.67,616.47,514.38
1143,TMPO,11056.1,15455.2,19644.7,18770.5,12320.4,22956.8,13455.5,38782.2,31021.3,27139.3,27785.9,26149.8,21507.1,13254.5


In [7]:
# Two rows share the symbol TMPO (row labels 1142 and 1143); discard the
# low-abundance one so the symbol can serve as a unique index afterwards.
# The row is addressed by label rather than by position, and a missing label
# is ignored: re-running this cell is then a no-op, whereas a positional
# lookup (protein.index[[1142]]) would remove the surviving TMPO row on the
# second run.
protein = protein.drop(index=1142, errors='ignore')
protein.loc[1141:1144]

Unnamed: 0,Symbol,28iALS_1,28iALS_2,29iALS_1,29iALS_2,30iALS_1,52iALS_1,52iALS_2,00iCTR_1,00iCTR_2,25iCTR_1_A,25iCTR_1_B,25iCTR_2,83iCTR_1,83iCTR_2
1141,ECI1,417.55,1708.57,4864.76,4880.71,3327.45,2309.01,700.29,3516.64,2464.87,5150.95,5255.14,5286.32,3024.46,3124.55
1143,TMPO,11056.1,15455.2,19644.7,18770.5,12320.4,22956.8,13455.5,38782.2,31021.3,27139.3,27785.9,26149.8,21507.1,13254.5
1144,STAT1,511.31,700.82,2181.72,2205.62,3070.15,1426.33,666.01,800.58,479.76,3673.22,3546.43,4603.48,1832.12,1887.76


In [8]:
# Key the abundance table by gene symbol and round all values to two decimals.
protein = protein.set_index('Symbol').round(2)
protein.head()

Unnamed: 0_level_0,28iALS_1,28iALS_2,29iALS_1,29iALS_2,30iALS_1,52iALS_1,52iALS_2,00iCTR_1,00iCTR_2,25iCTR_1_A,25iCTR_1_B,25iCTR_2,83iCTR_1,83iCTR_2
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
UBA6,1515.96,1325.49,2046.09,2373.03,3139.98,1888.97,1133.23,2428.8,2369.05,3298.28,3562.29,4558.27,2698.67,2410.73
ESYT2,0.0,0.0,166.46,245.38,140.69,0.0,0.0,0.0,0.0,88.14,81.26,0.0,92.94,24.49
SHTN1,0.0,0.0,0.0,64.01,206.67,0.0,179.03,93.06,70.56,329.18,274.13,758.07,1469.21,1535.94
GXYLT2,45.16,0.0,220.66,135.43,0.0,556.94,390.97,40.96,37.02,22.63,0.0,0.0,0.0,25.72
ILVBL,0.0,200.78,670.33,765.68,493.34,426.52,122.09,355.28,155.03,690.57,623.43,1104.38,645.16,752.21


In [9]:
# Export the full protein abundance table in column orientation (one entry per sample).
serialize(df=protein, name='protein', how='columns')

## Transcriptomics

In [10]:
# Tab-separated table: a symbol column followed by one abundance column per sample.
rna_abundance_tsv = './rna_abundances_JL.tsv'
transcripts = pd.read_csv(rna_abundance_tsv, sep='\t')
transcripts.head()

Unnamed: 0,symbol,00iCTR_1,25iCTR_1,83iCTR_1,28iALS_1,29iALS_1,30iALS_1,52iALS_1,00iCTR_2,25iCTR_2,83iCTR_2,28iALS_2,29iALS_2,30iALS_2,52iALS_2
0,TSPAN6,15.0752,42.2122,31.0459,23.4388,18.6743,23.5183,15.9514,18.9935,50.8106,37.2026,19.376,19.7261,24.3771,16.9351
1,TNMD,3.46409,0.180462,0.209293,0.379753,1.38633,0.097987,7.19405,5.3175,0.152618,0.322817,0.603215,1.6997,0.12456,7.99093
2,DPM1,14.4585,15.4738,16.3545,12.3721,24.0165,17.426,22.0234,18.7923,15.5127,17.5219,11.4146,24.0487,15.7579,20.5892
3,SCYL3,3.62369,2.4685,2.89503,3.20292,2.35784,2.19669,2.4542,3.91679,3.17987,2.83527,2.73102,2.44931,2.30431,2.35572
4,C1orf112,4.55001,2.77359,2.88055,1.73893,2.35351,4.30085,2.57059,6.0291,2.75266,3.49348,1.97749,2.68149,4.26484,3.07295


In [11]:
# Preview the first 15 rows whose symbol appears more than once (all occurrences are flagged).
transcripts.loc[transcripts['symbol'].duplicated(keep=False)].head(15)

Unnamed: 0,symbol,00iCTR_1,25iCTR_1,83iCTR_1,28iALS_1,29iALS_1,30iALS_1,52iALS_1,00iCTR_2,25iCTR_2,83iCTR_2,28iALS_2,29iALS_2,30iALS_2,52iALS_2
119,KRT33A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
137,AGK,6.65292,6.78247,6.70549,8.49844,7.17546,7.46348,6.28378,8.40121,7.82607,7.4774,8.24286,7.73161,6.80381,6.30191
728,SYNE2,12.5437,9.21955,20.3214,14.2366,6.34531,9.83282,6.65519,11.6112,10.9741,20.8798,13.1949,5.2726,10.1885,7.36699
729,SYNE2,0.125712,0.0,0.055844,0.075691,0.0,0.1226,0.0,0.0,0.0,0.0602,0.057673,0.05143,0.140259,0.050605
762,KIFC1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
835,CDH3,0.062083,0.10711,0.055234,0.073313,0.102287,0.057206,0.0,0.057683,0.086164,0.0,0.112172,0.142941,0.08677,0.0
836,CDH3,1.89866,1.53707,2.94354,3.18692,4.93068,3.31017,3.41086,2.08919,1.50815,2.44841,2.6813,4.90778,3.46802,3.07017
970,ZNRD1,9.72312,8.06977,9.22141,7.67877,10.0921,9.16019,10.551,12.3982,10.8669,11.3851,6.94699,9.90493,8.11434,12.0816
1046,ATP11A,4.90896,2.76523,3.98457,3.1174,2.58459,2.81019,3.33199,4.27014,3.46618,3.98733,2.56906,2.52062,2.78743,3.94849
1047,ATP11A,0.101993,0.0,0.0,0.0,0.0,0.0,0.103436,0.0,0.0,0.0,0.0,0.0,0.0,0.234438


In [12]:
# Collapse rows that share a symbol by adding their abundances per sample;
# the result is indexed by symbol. GroupBy.sum() takes no `axis` argument
# (passing axis=1 raises TypeError on current pandas), so it is called bare.
transcripts = transcripts.groupby('symbol').sum()

In [13]:
# Two decimals are enough for the exported abundances.
transcripts = transcripts.round(decimals=2)

In [14]:
# Export the full transcript abundance table in column orientation (one entry per sample).
serialize(df=transcripts, name='transcripts', how='columns')

## Metadata

In [15]:
# Sample names present in the transcript table.
set(transcripts.columns.tolist())

{'00iCTR_1',
 '00iCTR_2',
 '25iCTR_1',
 '25iCTR_2',
 '28iALS_1',
 '28iALS_2',
 '29iALS_1',
 '29iALS_2',
 '30iALS_1',
 '30iALS_2',
 '52iALS_1',
 '52iALS_2',
 '83iCTR_1',
 '83iCTR_2'}

In [16]:
# Sample names present in the protein table.
set(protein.columns.tolist())

{'00iCTR_1',
 '00iCTR_2',
 '25iCTR_1_A',
 '25iCTR_1_B',
 '25iCTR_2',
 '28iALS_1',
 '28iALS_2',
 '29iALS_1',
 '29iALS_2',
 '30iALS_1',
 '52iALS_1',
 '52iALS_2',
 '83iCTR_1',
 '83iCTR_2'}

In [17]:
# Samples measured in the transcript table but absent from the protein table.
set(transcripts.columns).difference(protein.columns)

{'25iCTR_1', '30iALS_2'}

In [18]:
# Samples measured in the protein table but absent from the transcript table.
set(protein.columns).difference(transcripts.columns)

{'25iCTR_1_A', '25iCTR_1_B'}

In [27]:
# Sample names look like '<number>i<CONDITION>_<replicate>': take the text
# after the first 'i' and then the part before the first '_' as the condition.
transcript_samples = transcripts.columns
transcript_suffixes = transcript_samples.str.split('i').str.get(1)
transcript_conditions = transcript_suffixes.str.split('_').str.get(0)
transcriptomic_classes = pd.DataFrame(
    transcript_conditions,
    index=transcript_samples,
    columns=['condition'],
)
transcriptomic_classes

Unnamed: 0,condition
00iCTR_1,CTR
25iCTR_1,CTR
83iCTR_1,CTR
28iALS_1,ALS
29iALS_1,ALS
30iALS_1,ALS
52iALS_1,ALS
00iCTR_2,CTR
25iCTR_2,CTR
83iCTR_2,CTR


In [28]:
# Sample names look like '<number>i<CONDITION>_<replicate>': take the text
# after the first 'i' and then the part before the first '_' as the condition.
protein_samples = protein.columns
protein_suffixes = protein_samples.str.split('i').str.get(1)
protein_conditions = protein_suffixes.str.split('_').str.get(0)
proteomic_classes = pd.DataFrame(
    protein_conditions,
    index=protein_samples,
    columns=['condition'],
)
proteomic_classes

Unnamed: 0,condition
28iALS_1,ALS
28iALS_2,ALS
29iALS_1,ALS
29iALS_2,ALS
30iALS_1,ALS
52iALS_1,ALS
52iALS_2,ALS
00iCTR_1,CTR
00iCTR_2,CTR
25iCTR_1_A,CTR


In [30]:
# Export the sample -> condition mapping for the transcript samples, keyed by sample name.
serialize(df=transcriptomic_classes, name='transcriptomic_classes', how='index')

In [31]:
# Export the sample -> condition mapping for the protein samples, keyed by sample name.
serialize(df=proteomic_classes, name='proteomic_classes', how='index')