In [1]:
import pandas as pd
import networkx as nx
import pickle
import numpy as np
from scipy.stats import spearmanr
from statsmodels.stats.multitest import multipletests

In [2]:
tissue_samples = pd.read_csv('GTEx_v7_Annotations_SampleAttributesDS.txt', sep = '\t')
tissue_list = tissue_samples.loc[tissue_samples['SMTSD'] == 'Muscle - Skeletal']['SAMPID'].to_numpy()
tissue_list

array(['GTEX-1117F-0426-SM-5EGHI', 'GTEX-111CU-2026-SM-5GZZC',
       'GTEX-111FC-0326-SM-5GZZ1', 'GTEX-111VG-2626-SM-5GZY2',
       'GTEX-111YS-2326-SM-5987L', 'GTEX-1122O-2426-SM-5GIDN',
       'GTEX-1128S-2426-SM-5H11B', 'GTEX-113JC-2726-SM-5EGIS',
       'GTEX-117XS-2526-SM-5H11G', 'GTEX-117YW-2426-SM-5Q5AE',
       'GTEX-117YX-2526-SM-5EQ4Q', 'GTEX-1192X-0426-SM-5GIEE',
       'GTEX-11DXW-0726-SM-5H12J', 'GTEX-11DXX-2726-SM-5PNXO',
       'GTEX-11DXY-2726-SM-5GID2', 'GTEX-11DXZ-2426-SM-5N9DT',
       'GTEX-11DZ1-0926-SM-5EQ5R', 'GTEX-11EI6-0326-SM-5EQ6G',
       'GTEX-11EM3-2126-SM-5H11M', 'GTEX-11EMC-2626-SM-59864',
       'GTEX-11EQ8-0526-SM-5N9BC', 'GTEX-11EQ9-2126-SM-5PNVW',
       'GTEX-11GS4-2526-SM-5A5KT', 'GTEX-11GSO-2526-SM-5PNVX',
       'GTEX-11GSP-2726-SM-5A5LJ', 'GTEX-11H98-0326-SM-5HL4S',
       'GTEX-11I78-2426-SM-5A5K9', 'GTEX-11LCK-1226-SM-5Q5AM',
       'GTEX-11NSD-2026-SM-5HL5U', 'GTEX-11NUK-0226-SM-5A5L4',
       'GTEX-11NV4-0326-SM-5HL58', 'GTEX-11O72-0326-SM-

In [3]:
columns_data = pd.read_csv('GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct', index_col = 'Name', sep = '\t', header = 2 , nrows = 2)
available_cols = columns_data.columns
common_columns = list(set(available_cols).intersection(tissue_list))
#common_columns.append('Name')
print(len(common_columns))

564


In [4]:
variance_dataframe = pd.read_csv('variance_dataset.csv')
variance_dataframe.index.values

array([    0,     1,     2, ..., 56199, 56200, 56201])

In [5]:
all_rows = pd.read_csv('GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct', usecols = ['Name'], sep = '\t', header = 2)
len(all_rows)

56202

In [7]:
rows = variance_dataframe.nlargest(100, 'Variance').index.values
rows = np.append(rows, [0, 1, 2])
print(len(rows))
skip_rows = np.setdiff1d(all_rows.index.values,rows)
skip_rows

103


array([    3,     4,     5, ..., 56198, 56200, 56201])

In [8]:
def logic(index):
    if index in skip_rows:
        return True
    return False

small_dataset = pd.read_csv('GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct', usecols = common_columns, sep = '\t', header = 2 , skiprows= lambda x: logic(x))
small_dataset

Unnamed: 0,GTEX-1117F-0426-SM-5EGHI,GTEX-111CU-2026-SM-5GZZC,GTEX-111FC-0326-SM-5GZZ1,GTEX-111VG-2626-SM-5GZY2,GTEX-111YS-2326-SM-5987L,GTEX-1122O-2426-SM-5GIDN,GTEX-1128S-2426-SM-5H11B,GTEX-113JC-2726-SM-5EGIS,GTEX-117XS-2526-SM-5H11G,GTEX-117YW-2426-SM-5Q5AE,...,GTEX-ZYFC-0526-SM-5GIDF,GTEX-ZYFD-0326-SM-5NQ8I,GTEX-ZYFG-2426-SM-5GIE8,GTEX-ZYT6-1626-SM-5E45R,GTEX-ZYVF-0626-SM-5E43Q,GTEX-ZYW4-0526-SM-5GZZ5,GTEX-ZYY3-0526-SM-5E45G,GTEX-ZZ64-1526-SM-5E43K,GTEX-ZZPT-0626-SM-5GZXT,GTEX-ZZPU-2626-SM-5E45Y
0,0.1388,0.10160,0.18120,0.00000,0.00000,0.00000,0.07281,0.11910,0.00000,0.0000,...,0.00000,0.11890,0.1049,0.08330,0.00000,0.0000,0.00000,0.00000,0.14250,0.1002
1,0.2196,0.10050,0.07167,0.03801,0.09146,0.02708,0.01440,0.07068,0.05287,0.0521,...,0.10920,0.02351,0.1452,0.04942,0.06495,0.0000,0.11100,0.09546,0.08454,0.0991
2,0.0000,0.04244,0.03784,0.12040,0.04829,0.00000,0.03041,0.14930,0.03722,0.1100,...,0.09226,0.00000,0.1752,0.03479,0.00000,0.1393,0.03906,0.08065,0.11900,0.0000
3,72.7800,89.10000,121.10000,82.15000,92.45000,106.90000,277.20000,105.50000,118.00000,113.1000,...,10440.00000,312.20000,69.8200,145.00000,62.23000,79.0300,152.40000,415.30000,93.62000,125.0000
4,0.0000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0000,...,0.00000,0.00000,0.0000,0.00000,0.00000,0.1432,0.00000,0.00000,0.00000,0.1291
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,0.0000,3.46100,0.51440,0.00000,1.31300,1.55500,0.82690,1.35300,1.01200,0.9972,...,0.00000,0.00000,0.5954,2.83800,1.24300,0.0000,1.59300,0.54820,0.80910,2.2760
99,10280.0000,14280.00000,6258.00000,6983.00000,10920.00000,12560.00000,14200.00000,7985.00000,11240.00000,11350.0000,...,10430.00000,7660.00000,18860.0000,6213.00000,4777.00000,11210.0000,6843.00000,13270.00000,4408.00000,14710.0000
100,26010.0000,33250.00000,35060.00000,18360.00000,31320.00000,37060.00000,38790.00000,31520.00000,28310.00000,29780.0000,...,26400.00000,31510.00000,36380.0000,30270.00000,23330.00000,21030.0000,28210.00000,30740.00000,28450.00000,36240.0000
101,6.3410,1.03100,1.83900,0.48780,2.34700,0.69500,2.58700,0.60470,0.00000,2.2290,...,125.60000,4.22300,3.7260,0.42280,0.00000,2.2570,3.32200,2.94000,0.72330,47.3100


In [None]:
#dataset = pd.read_csv('GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct', usecols = common_columns, sep = '\t', header = 2 , nrows = 50)
#dataset = dataset1.drop('Description', axis = 1)
#print(len(dataset))

#for i in dataset.index.values:
#    if np.sum(dataset.loc[i].values) == 0.0:
#        dataset = dataset.drop(i, axis=0)
            
#len(dataset)

In [28]:
dataset = small_dataset
len(dataset)



array([ 83,   1,  71,   4,  24,   0,   5,  97,  40,  98,  32,  25,  68,
       102,  55,  68,  87,  25,   4,  55,  25,   3,  34,  76,  24,   4,
        30, 101,   1,  47,  83,   3,   9,   2,   7,  51,  72,  37,  22,
        16,  52,  21,  99,  46,  40,  90,  72,  54,  81,  91,  70,  30,
        21,  74,  14,  14, 101,  19,  27,  13,  45,  75,  60,  45,  70,
         8,  40,  21,  84,  48,  36,  27,  67,  39,  44,  81,   1,  12,
        14,  62,  51,  24,  16,   8,  91,  70,  72,  32,  15,   7,  11,
        14,  83,  12,  26,   1,  29,  56, 102,  88,  38,  85,   2])

In [25]:
def get_bootstrapped_bitmatrix():
    
    individuals = dataset.index.values
    bootstrapped_individuals = np.random.choice(individuals, len(individuals), replace=True)  #Get bootstrapped samples
    #print("bootstrapped_individuals")
    #print(bootstrapped_individuals)
    p_values = []
    corr_values = []

    count = 0
    for i in bootstrapped_individuals:
        for j in bootstrapped_individuals:
        
            count = count + 1
            #if(count%20000 == 0):
            #    print(count)

            gene1 = dataset.loc[i].values
            gene2 = dataset.loc[j].values

            corr, p = spearmanr(gene1, gene2) #calculated spearman correlation for each pair

            corr_values.append(corr)
            p_values.append(p)

    results = multipletests(p_values, alpha = 0.05, method='fdr_bh')
    reject = results[0]
    p_values_corrected = results[1]
    
    bit_vector = []
    
    for i in range(len(dataset)):
        for j in range(len(dataset)):
            if i!=j:
                if p_values_corrected[i*len(dataset) + j] > 0.1:  #If its greater than cutoff 
                    bit_vector.append(1)
                else:
                    bit_vector.append(0)

    return bit_vector

def get_results(gene1, gene2):
    
    corr_values = []
    p_values = []
    
    corr, p = spearmanr(gene1, gene2) #calculated spearman correlation for each pair

    corr_values.append(corr)
    p_values.append(p)

    results = multipletests(p_values, alpha = 0.05, method='fdr_bh')
    reject = results[0]
    p_values_corrected = results[1]

    return (corr_values,results)

int1 = np.random.randint(0, 99)
int2 = np.random.randint(0, 99)

geneone = dataset.loc[int1].values
genetwo = dataset.loc[int2].values
corr,results = get_results(geneone, genetwo)
results

In [26]:
def get_bit_matrix():
    
    B = 10
    matrix = []
    for i in range(B):
        if(i % 5 == 0):
            print('B' + ' is ' + str(i))       #Get the binary string for B values and make the matrix
        
        #print('len of bit vector' + ' is ' + str(len(bit_vector)))
        matrix.append(get_bootstrapped_bitmatrix())
        
    return matrix

In [27]:
import time
start_time = time.time()
matrix = get_bit_matrix()
end_time = time.time()
print("Time spent " + str(end_time - start_time))

B is 0
B is 5
Time spent 99.37149500846863


In [33]:
matrix.shape

AttributeError: 'list' object has no attribute 'shape'

In [24]:
matrix2 = np.array(matrix)
matrix2.shape

(10, 10506)

In [60]:
#from numpy import asarray
#from numpy import savetxt

#matrix_array = asarray(matrix2)
# save to csv file
#savetxt('bootstrapped_data.csv', matrix_array, delimiter=',')

In [15]:
from numpy import asarray
from numpy import savetxt
from numpy import loadtxt

matrix_array1 = loadtxt('bootstrapped_data.csv', delimiter=',')
matrix_array = np.concatenate((matrix_array1, matrix2))
savetxt('bootstrapped_data.csv', matrix_array, delimiter=',')
matrix_array = loadtxt('bootstrapped_data.csv', delimiter=',')
print(matrix_array.shape)

(6000, 10506)


In [2]:
from numpy import asarray
from numpy import savetxt
from numpy import loadtxt

matrix2 = loadtxt('bootstrapped_data.csv', delimiter=',')
print(matrix2.shape)

(1000, 10506)


In [None]:
B = 500
verify_G = nx.Graph()
conflict_G = nx.Graph()
matrix = matrix2[:500, ]
print(matrix.shape)
for i in range(matrix.shape[1]):
    
    if(i%1000 == 0):
        print(i)
    #temp = []
    for j in range(matrix.shape[1]):
        
        gamets = [0,0,0,0]
        for string in range(matrix.shape[0]):
            
            if(matrix[string][i] == 0 and matrix[string][j] == 1):
                gamets[1] = 1
            if(matrix[string][i] == 1 and matrix[string][j] == 0):
                gamets[2] = 1
            if(matrix[string][i] == 1 and matrix[string][j] == 1):
                gamets[3] = 1
                
            if(np.sum(gamets) == 3):
                conflict_G.add_edge(i, j)
                break
        
        #print(gamets)
        #if(np.sum(gamets)) == 3:
            #temp.append(j)
             #conflict_G.add_edge(i, j)         
    
    #conflict_graph.append(temp)
#conflict_graph

(500, 10506)
0
1000
2000


In [22]:
print(conflict_G.number_of_nodes())
print(conflict_G.number_of_edges())

#print(verify_G.number_of_nodes())
#print(verify_G.number_of_edges())
#conflict_G.edges(0) 

2756
3795012


# Calculating cardinalty cover vertex

In [23]:
graph_nodes = list(conflict_G.nodes())

visited = {}
for node in graph_nodes:
    visited.update({node : False})


for u in graph_nodes:    
    #print(conflict_G.edges(u))
    if visited[u] == False:
        
        for v in conflict_G.edges(u):
            if visited[v[1]] == False:
                #print((u,v[1]), end = " ")
                visited[v[1]] = True
                visited[u] = True
                break
                

cardinality_vertex_cover = []
for node in graph_nodes:
    if visited[node]:
        cardinality_vertex_cover.append(node)
        
print(len(cardinality_vertex_cover))

2756
