In [1]:
import re
import numpy as np
import scipy.signal as signal
import pandas as pd
import seaborn as sns

from multiprocessing import Pool

from matplotlib import pyplot as plt
from scipy import spatial
from scipy.stats import zscore
from scipy.sparse import coo_matrix
from sklearn.cluster import KMeans
from tqdm import tqdm

# ATAC

In [2]:
# Load the ATAC matrix from the gzipped CSV; the first CSV column becomes the row index.
data = pd.read_csv("LungCancer10x/09_bulk/ATAC/computedMatrix/all.csv.gz", index_col=0)

  mask |= (ar1 == a)


In [0]:
# Preview the first rows of the frame currently bound to `data`.
data.head()

In [24]:
from multiprocessing import Queue, Process
import gc
import warnings
warnings.filterwarnings('ignore')


def mean(data):
    """Return the arithmetic mean of ``data``, or 0 when ``data`` is empty.

    ``data`` must support ``len()`` and ``sum()``.
    """
    count = len(data)
    return sum(data) / count if count else 0

def calculate_freq_of_cluster(labels, ident, total):
    """Summarize how sample groups are distributed across cluster labels.

    Each entry of ``ident`` is reduced to its group name by stripping every run
    of digits, and paired with the cluster label at the same position.

    Args:
        labels: iterable of cluster labels (each is converted with ``int``).
        ident: iterable of sample identifier strings, aligned with ``labels``.
        total: mapping from group name to the denominator used for that group.

    Returns:
        dict with
        ``"data"``  -- ``{cluster: {group: count}}`` and
        ``"score"`` -- the mean, over clusters, of the largest
        ``count / total[group]`` ratio found in that cluster.

    Raises:
        KeyError: if a group name is missing from ``total``.
    """
    counts_by_cluster = {}
    for label, sample in zip(labels, ident):
        cluster = int(label)
        group = re.sub(r"\d+", "", sample)
        group_counts = counts_by_cluster.setdefault(cluster, {})
        group_counts[group] = group_counts.get(group, 0) + 1

    # For every cluster keep the best-represented group, as a fraction of that group's total.
    best_fractions = []
    for group_counts in counts_by_cluster.values():
        best_fractions.append(max(count / total[group] for group, count in group_counts.items()))

    return {"score": mean(best_fractions), "data": counts_by_cluster}

def __calculate_atac_score_using_kmeans__(data):
    """Pool worker: cluster each group's rows with k-means and score the clustering.

    Args:
        data: a 6-item sequence
            ``(frame, total, n_cluster, random_state, group_by, columns)`` where
            ``frame`` is a DataFrame with a ``group_by`` column and an ``"ident"``
            column, and ``columns`` are the positional indices of the value columns.

    Returns:
        list of the dicts produced by ``calculate_freq_of_cluster``, each extended
        with an ``"ident"`` key holding the ``group_by`` value it was computed for.
        Groups for which fitting or scoring raises ``ValueError`` are skipped.
    """
    frame, total, n_cluster, random_state, group_by, columns = data
    model = KMeans(n_clusters=n_cluster, random_state=random_state)

    scored = []
    for group_value in frame[group_by].unique():
        group_rows = frame.loc[frame[group_by] == group_value, :]
        value_mtx = group_rows.iloc[:, columns]
        value_mtx.index = group_rows["ident"]
        # Standardize along axis 1, i.e. each row on its own.
        normalized = zscore(value_mtx, axis = 1)
        try:
            model.fit(normalized)
            entry = calculate_freq_of_cluster(model.labels_, group_rows["ident"], total)
        except ValueError:
            # Best-effort: a group that cannot be clustered/scored is left out of the result.
            continue
        entry["ident"] = group_value
        scored.append(entry)
    return scored

def calculate_atac_score_using_kmeans(data, n_jobs, total, columns, n_cluster=3, random_state=0, group_by="trans_id"):
    """Score every ``group_by`` group of ``data`` with k-means, using a process pool.

    The distinct ``group_by`` values are split into contiguous chunks; the rows of
    each chunk are handed to ``__calculate_atac_score_using_kmeans__`` in a worker
    process and the per-chunk result lists are concatenated in chunk order.

    Args:
        data: DataFrame containing the ``group_by`` column (and whatever the worker reads).
        n_jobs: number of worker processes.
        total: forwarded unchanged to the worker.
        columns: positional indices of the value columns, forwarded to the worker.
        n_cluster: number of k-means clusters, forwarded to the worker.
        random_state: k-means seed, forwarded to the worker.
        group_by: name of the column whose distinct values define the groups.

    Returns:
        list with the worker results for all chunks; ``[]`` when ``data`` has no groups.
    """
    gc.collect()
    genes = list(data[group_by].unique())
    if not genes:
        # Nothing to score: do not spawn a pool for zero tasks.
        return []

    # Ceiling division, floored at 1. The previous `len(genes) // n_jobs` became 0
    # whenever there were fewer groups than workers, and range() rejects a zero step.
    bk = max(1, -(-len(genes) // n_jobs))

    tasks = [
        [data.loc[data[group_by].isin(genes[i: i + bk]), :], total, n_cluster, random_state, group_by, columns]
        for i in range(0, len(genes), bk)
    ]

    res = []
    with Pool(n_jobs) as p:
        # imap yields results in task order, so the output order follows `genes`.
        for chunk_res in p.imap(__calculate_atac_score_using_kmeans__, tasks):
            res.extend(chunk_res)
    gc.collect()
    return res

In [None]:
# Count the distinct sample identifiers per group, where the group name is the
# identifier with every run of digits removed.
total = {}
for i in data["ident"].unique():
    i = re.sub(r"\d+", "", i)
    total[i] = total.get(i, 0) + 1

genes = list(data["trans_id"].unique())
# Work through the transcripts in roughly 20 batches; the floor of 1 keeps the
# range step non-zero when there are fewer than 20 transcripts.
bk = max(1, len(genes) // 20)

res = []
for i in tqdm(range(0, len(genes), bk)):
    res += calculate_atac_score_using_kmeans(
        data.loc[data["trans_id"].isin(genes[i: i + bk]), :],
        10,                   # worker processes per batch
        total,
        list(range(1, 201)),  # positional indices 1..200 of the value columns
    )

In [23]:
# Persist the ATAC scores: one "score<TAB>ident" line per entry of `res`.
with open("LungCancer10x/09_bulk/ATAC/computedMatrix/all.res", "w+") as handle:
    handle.writelines(
        "{}\t{}\n".format(entry["score"], entry["ident"]) for entry in res
    )

Unnamed: 0,X,X0,X1,X2,X3,X4,X5,X6,X7,X8,...,X198,X199,ident,chrom,start,end,strand,gene_id,trans_id,gene_name
1,chr1:11868-14409:+\tENST00000456328.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,BSPN1,chr1,11868,14409,+,ENSG00000223972,ENST00000456328,AP001094.3
2,chr1:12009-13670:+\tENST00000450305.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,BSPN1,chr1,12009,13670,+,ENSG00000223972,ENST00000450305,AP001094.3
3,chr1:14403-29570:-\tENST00000488147.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,BSPN1,chr1,14403,29570,-,ENSG00000227232,ENST00000488147,SNX6P1
4,chr1:17368-17436:-\tENST00000619216.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,BSPN1,chr1,17368,17436,-,ENSG00000278267,ENST00000619216,AL161785.1
5,chr1:29553-31097:+\tENST00000473358.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,BSPN1,chr1,29553,31097,+,ENSG00000243485,ENST00000473358,RNU6-815P


# WGS

In [2]:
# Load the WGS matrix; the first CSV column becomes the row index.
# Note: this rebinds `data`, replacing the ATAC frame loaded earlier in the notebook.
data = pd.read_csv("LungCancer10x/09_bulk/DNA/computedMatrix/wgs.csv", index_col=0)

In [7]:
# Preview the first rows of the frame currently bound to `data`.
data.head()

Unnamed: 0,chrom,start,end,strand,gene_name,V6,V7,V8,V9,V10,...,V98,V99,V100,V101,V102,V103,V104,V105,SampleID,SampleType
1,chr1,569076,569756,+,MTATP6P1,56.0,56.0,56.0,82.142857,117.0,...,73.0,73.0,73.0,73.0,73.0,73.0,73.0,68.714286,BSPN1,Tumor
2,chr1,879584,894689,-,NOC2L,22.776316,40.509934,51.84106,38.463576,34.456954,...,43.172185,48.668874,46.649007,48.05298,50.311258,46.072848,42.622517,43.523179,BSPN1,Tumor
3,chr1,934342,935552,-,HES4,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.333333,BSPN1,Tumor
4,chr1,936518,949921,+,ISG15,52.164179,34.164179,30.059701,21.776119,7.58209,...,43.985075,37.074627,37.597015,29.985075,30.447761,31.19403,44.343284,43.77037,BSPN1,Tumor
5,chr1,955503,991498,+,AGRN,4.473538,13.011111,39.822222,40.633333,34.638889,...,36.322222,44.833333,31.177778,35.361111,39.344444,47.038889,45.883333,32.772222,BSPN1,Tumor


In [9]:
# Label every row as "<SampleID with digits removed>-<SampleType>".
# Only two columns are needed, so iterate over them directly instead of using
# iterrows(), which builds a full Series for every row of this wide frame.
data["ident"] = [
    "{}-{}".format(re.sub(r"\d+", "", sample_id), sample_type)
    for sample_id, sample_type in zip(data["SampleID"], data["SampleType"])
]
data.head()

Unnamed: 0,chrom,start,end,strand,gene_name,V6,V7,V8,V9,V10,...,V99,V100,V101,V102,V103,V104,V105,SampleID,SampleType,ident
1,chr1,569076,569756,+,MTATP6P1,56.0,56.0,56.0,82.142857,117.0,...,73.0,73.0,73.0,73.0,73.0,73.0,68.714286,BSPN1,Tumor,BSPN-Tumor
2,chr1,879584,894689,-,NOC2L,22.776316,40.509934,51.84106,38.463576,34.456954,...,48.668874,46.649007,48.05298,50.311258,46.072848,42.622517,43.523179,BSPN1,Tumor,BSPN-Tumor
3,chr1,934342,935552,-,HES4,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,2.0,1.333333,BSPN1,Tumor,BSPN-Tumor
4,chr1,936518,949921,+,ISG15,52.164179,34.164179,30.059701,21.776119,7.58209,...,37.074627,37.597015,29.985075,30.447761,31.19403,44.343284,43.77037,BSPN1,Tumor,BSPN-Tumor
5,chr1,955503,991498,+,AGRN,4.473538,13.011111,39.822222,40.633333,34.638889,...,44.833333,31.177778,35.361111,39.344444,47.038889,45.883333,32.772222,BSPN1,Tumor,BSPN-Tumor


In [35]:
from itertools import combinations
from scipy.stats import kruskal


def pandas_to_list(data, columns):
    """Flatten the selected columns of ``data`` into one list, row by row.

    Args:
        data: DataFrame to flatten.
        columns: positional (``iloc``) indices of the columns to keep.

    Returns:
        list with the values of row 0 (in ``columns`` order), then row 1, and so
        on; ``[]`` when ``data`` has no rows.
    """
    # `columns` used to be ignored, so every column of each row -- including the
    # non-numeric annotation columns -- ended up in the flattened values.
    return data.iloc[:, columns].to_numpy().ravel().tolist()


def calculate_pvalue(data, columns=None):
    """Run pairwise Kruskal-Wallis tests between sample groups for every gene.

    For each distinct ``gene_name`` the rows are split by ``ident``; the selected
    value columns of each group are flattened into one sample, and every
    unordered pair of distinct groups is compared with ``scipy.stats.kruskal``.

    Args:
        data: DataFrame with ``"gene_name"`` and ``"ident"`` columns plus the
            value columns.
        columns: positional (``iloc``) indices of the value columns. Defaults to
            positions 5..104, the range this function previously hard-coded.

    Returns:
        ``{gene_name: {"<identA>|<identB>": pvalue}}`` with groups paired in
        order of first appearance.
    """
    if columns is None:
        columns = list(range(5, 105))

    res = {}
    for gene in data["gene_name"].unique():
        gene_rows = data.loc[data["gene_name"] == gene, :]

        # Extract each group's values once, restricted to the value columns, so
        # annotation columns never reach kruskal and nothing is recomputed per pair.
        values_by_ident = {}
        for ident in gene_rows["ident"].unique():
            ident_rows = gene_rows.loc[gene_rows["ident"] == ident, :]
            values_by_ident[ident] = ident_rows.iloc[:, columns].to_numpy().ravel()

        # Pair the distinct groups, not the rows: pairing rows produced
        # self-comparisons ("A|A") and mirrored duplicates ("B|A").
        gene_res = {}
        for first, second in combinations(values_by_ident, 2):
            gene_res["{}|{}".format(first, second)] = kruskal(
                values_by_ident[first], values_by_ident[second]
            ).pvalue
        res[gene] = gene_res
    return res


In [19]:
# NOTE(review): `n_jobs` was not assigned at module level anywhere in the visible
# notebook, so this cell depended on leftover kernel state. 10 mirrors the worker
# count passed to the ATAC scoring call; it only controls parallelism.
n_jobs = 10

genes = list(data["gene_name"].unique())
# Floor of 1 keeps the range step non-zero when there are fewer genes than workers.
bk = max(1, len(genes) // n_jobs)


# One task per contiguous chunk of genes, holding all rows of those genes.
tasks = []
for i in tqdm(range(0, len(genes), bk)):
    tasks.append(data.loc[data["gene_name"].isin(genes[i: i + bk]), :])


# Merge the per-chunk {gene_name: {...}} dicts into a single mapping.
res = {}
with Pool(n_jobs) as p:
    for i in p.imap(calculate_pvalue, tasks):
        res.update(i)

8352

In [0]:
# Print the first item yielded by iterating `res`; prints nothing when `res` is empty.
for first_item in list(res)[:1]:
    print(first_item)

In [26]:
# In this section `res` is built as {gene_name: {"<identA>|<identB>": pvalue}},
# so iterating it yields gene-name strings and the former i["score"] / i["ident"]
# lookups (copied from the ATAC cell) cannot work. Write one tab-separated row
# per comparison instead: gene, compared pair, p-value.
# NOTE(review): the intended column layout is an assumption -- confirm with the author.
with open("LungCancer10x/09_bulk/DNA/computedMatrix/wgs.res", "w") as w:
    for gene, pvalues in res.items():
        for pair, pvalue in pvalues.items():
            w.write("{}\t{}\t{}\n".format(gene, pair, pvalue))

In [28]:
import json

# Store `res` as JSON, indented by 4 spaces.
with open("LungCancer10x/09_bulk/DNA/computedMatrix/wgs.json", "w+") as handle:
    handle.write(json.dumps(res, indent = 4))

In [None]:
# NOTE(review): scratch cell -- `stats`, `x` and `y` are not defined anywhere in the
# visible notebook (only `kruskal`/`zscore` are imported from scipy.stats), so this
# would raise NameError on a fresh kernel. Remove it or replace it with a concrete example.
stats.kruskal(x, y)