In [1]:
import glob

In [2]:
import numpy as np
import pandas as pd

In [3]:
from scipy.stats import rankdata

In [4]:
# Program matrix read from disk; the first CSV column becomes the row index
# and the remaining column labels are kept as the gene list.
H_df = pd.read_csv(
    'write/VHL_kidney_GEPs.csv',
    index_col=0,
)
H_genes = [gene for gene in H_df.columns]

In [10]:
# Collect every cleaned TCGA transcriptome table. glob.glob returns matches in
# arbitrary, filesystem-dependent order, so sort the paths to make any
# downstream iteration (and the row order of saved results) reproducible.
tcga_transcriptome_files = sorted(glob.glob('data/tcga_transcriptome/clean_*'))

In [11]:
# Project every TCGA expression table onto the programs stored in H_df and
# collect one table of per-sample program usages per input file.
program_usages = []

# Iterate in sorted order: glob gives no ordering guarantee, and the row order
# of the saved CSV should not change from run to run.
for each_file in sorted(tcga_transcriptome_files):
    trans_df = pd.read_table(each_file, sep='\t', index_col=0)
    # Reassign rather than mutate in place so the step is safe to re-run.
    trans_df = trans_df.drop(columns='Entrez_Gene_Id')

    # log(1 + x), then transpose so that rows are samples and columns are genes.
    log_trans_df = np.log1p(trans_df)
    log_trans_df = log_trans_df.T

    trans_genes = list(log_trans_df.columns)
    # Shared genes, in the column order of H_df. Iterating a set intersection
    # would give an order that depends on string-hash randomisation, making
    # the floating-point result differ slightly between interpreter runs.
    trans_gene_set = set(trans_genes)
    comm_genes = [gene for gene in dict.fromkeys(H_genes) if gene in trans_gene_set]
    if not comm_genes:
        # Without this guard the projection below silently yields all zeros.
        raise ValueError(f'{each_file} shares no genes with the program matrix')

    used_H_df = H_df[comm_genes]
    used_trans_df = log_trans_df[comm_genes]

    H = used_H_df.values
    inv_H = np.linalg.pinv(H)  # Moore-Penrose pseudo-inverse: (programs, genes) -> (genes, programs)

    trans = used_trans_df.values
    # Per-sample normalisation: rank the genes within each row; ties share the
    # lowest rank of their group.
    ranked_trans = rankdata(trans, method='min', axis=1)

    # Usage of each program in each sample.
    W = np.dot(ranked_trans, inv_H)

    W_df = pd.DataFrame(W, index=used_trans_df.index, columns=list(H_df.index))

    program_usages.append(W_df)

if not program_usages:
    # pd.concat would raise a ValueError here anyway, but with a message that
    # does not point at the real cause (no input files matched).
    raise ValueError('No transcriptome files to process: tcga_transcriptome_files is empty')

program_usage_df = pd.concat(program_usages)
program_usage_df.to_csv('write/program_usages.csv')

In [45]:
# Read the KIRC expression table (first column as row index) and discard the
# Entrez ID column so that only expression values remain.
trans_df = (
    pd.read_table(
        'data/tcga_transcriptome/clean_kirc-rsem-fpkm-tcga-t.txt',
        sep='\t',
        index_col=0,
    )
    .drop(columns='Entrez_Gene_Id')
)

In [48]:
# log(1 + x) on every value, then flip the table so that its columns are the
# labels later matched against the program matrix's gene columns.
log_trans_df = np.log1p(trans_df).T

In [49]:
# Reload the program matrix (first CSV column as row labels) and display its
# dimensions as a sanity check.
H_df = pd.read_csv(
    'write/VHL_kidney_GEPs.csv',
    index_col=0,
)
H_df.shape

(11, 768)

In [50]:
# Genes present both in the program matrix and in the expression table.
H_genes = list(H_df.columns)
trans_genes = list(log_trans_df.columns)
# Keep the column order of H_df instead of iterating a set intersection: set
# order depends on string-hash randomisation, so the column order (and the
# low-order bits of the downstream matrix products) would otherwise change
# between interpreter runs. dict.fromkeys de-duplicates while preserving order.
trans_gene_set = set(trans_genes)
comm_genes = [gene for gene in dict.fromkeys(H_genes) if gene in trans_gene_set]
len(comm_genes)

768

In [51]:
# Restrict both tables to the shared genes, in the same column order, so that
# their gene axes line up for the matrix product below.
used_H_df = H_df.loc[:, comm_genes]
used_trans_df = log_trans_df.loc[:, comm_genes]

In [52]:
# Moore-Penrose pseudo-inverse of the program matrix; the displayed shape is
# the transpose of H's shape.
H = used_H_df.to_numpy()
inv_H = np.linalg.pinv(H)
inv_H.shape

(768, 11)

In [53]:
# Per-sample normalisation: replace each value by its rank within its own row;
# tied values all receive the lowest rank of their group (method='min').
trans = used_trans_df.to_numpy()
ranked_trans = rankdata(trans, axis=1, method='min')
ranked_trans.shape

(475, 768)

In [54]:
# Matrix product of the ranked expression matrix and the pseudo-inverse of H:
# one row per sample, one column per program.
W = ranked_trans @ inv_H
W.shape

(475, 11)

In [55]:
# Label the usage matrix: rows take the index of the expression table,
# columns take the row labels of the program matrix.
W_df = pd.DataFrame(
    data=W,
    index=used_trans_df.index,
    columns=[program for program in H_df.index],
)
W_df

Unnamed: 0,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,P11
TCGA_B2_5641_01,12.277100,2.635807,4.756027,1.106056,-1.008482,1.746075,1.324845,2.982919,0.338918,1.275817,1.320590
TCGA_A3_3362_01,12.439110,2.384621,4.896829,1.183359,-1.051265,1.904904,1.526953,2.849425,0.171614,0.856817,1.238161
TCGA_A3_A6NL_01,12.485909,2.221967,4.649102,0.905750,-1.069340,2.160378,1.318062,3.029958,0.222383,1.056029,1.157859
TCGA_BP_5178_01,12.332143,3.008580,4.051656,0.714401,-0.877147,1.941596,1.365741,2.854424,-0.016993,1.068508,1.255925
TCGA_BP_4756_01,12.431744,2.657062,5.080684,1.244671,-1.164986,1.876914,1.226853,2.979888,0.026623,0.845917,1.375037
...,...,...,...,...,...,...,...,...,...,...,...
TCGA_BP_5196_01,12.517496,2.723995,4.255305,0.892271,-1.048513,2.249125,1.302354,2.969942,0.107708,1.482435,1.402295
TCGA_B8_4622_01,12.336851,2.449119,4.984000,1.099463,-1.052179,1.689862,1.245303,2.954300,0.269707,1.383613,1.308764
TCGA_B0_5077_01,12.409614,2.819222,4.707054,0.728188,-1.104287,2.220756,1.245515,3.197033,0.234873,1.129149,1.314706
TCGA_AK_3465_01,12.521595,2.771590,5.389962,1.199140,-1.179051,1.716548,1.294073,2.768207,-0.099936,0.862942,1.524736
