In [None]:
# Mount Google Drive at /content/drive so files stored on Drive can be read
# and written through the normal filesystem. The google.colab package is only
# available inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Imports

In [None]:
import gzip
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
import shutil

from copy import deepcopy
from statistics import stdev

# Global Variables

In [None]:
# Root of the project dataset on the mounted Drive (trailing slash included,
# because the paths below are built by plain string concatenation).
data_dir = "/content/drive/MyDrive/Classes/CSCE 6810 Advanced Topics in Computational Life Science/Group Project/Dataset/"

# Preprocessed inputs.
prep_dir = f"{data_dir}Preprocessed/"
multi_dir = f"{prep_dir}Multi Cell/data/"
# Alternative location that was used at some point:
# multi_dir = "/content/drive/MyDrive/data/"
single_dir = f"{prep_dir}Single Cell/"

# Model-ready outputs on Drive.
_model_ready_dir = f"{data_dir}Model Ready/Multi Cell/"
unbatched_dir = f"{_model_ready_dir}Unbatched/"
batched_dir = f"{_model_ready_dir}Batched/"

In [None]:
# Write intermediate output to directories relative to the current working
# directory. These two assignments deliberately rebind unbatched_dir and
# batched_dir, so every later cell uses the local folders.
unbatched_dir = "unbatched/"
batched_dir = "batched/"

# Create the folders on first use (same order as before: batched, unbatched).
for d in (batched_dir, unbatched_dir):
    already_there = os.path.exists(d)
    if not already_there:
        os.makedirs(d)

# Functions

In [None]:
def format_df(df):
    """Transpose ``df`` and drop the last five columns of the result.

    After the transpose the original row labels become the columns, so this
    removes the five trailing rows of the input table.
    NOTE(review): presumably those rows are non-gene summary entries at the
    end of the source file -- confirm against the raw data.

    Args:
        df: pandas DataFrame to reshape.

    Returns:
        The transposed frame restricted to all but its last five columns
        (no columns at all if the transposed frame has five or fewer).
    """
    by_sample = df.T
    kept = list(by_sample.columns)[:-5]
    return by_sample[kept]

In [None]:
def get_seqs(df, min_len=8000, seq_len=200):
    """Cut every row of ``df`` into fixed-width sequences, grouped by health.

    For each row:
      1. keep only the strictly positive values (zeros, negatives and NaN are
         dropped because ``x > 0`` is false for them);
      2. while fewer than ``min_len`` values remain, extend the list to
         ``vals + [0.0] + vals`` (the row repeated with a single 0.0 between
         the two copies);
      3. split the result into consecutive windows of exactly ``seq_len``
         values; a trailing remainder shorter than ``seq_len`` is discarded.

    Rows whose name contains the substring "11A" are stored under "healthy",
    all others under "unhealthy".
    NOTE(review): this is a plain substring test on the row name; confirm the
    naming scheme cannot contain "11A" in some other position.

    Args:
        df: pandas DataFrame with one sample per row; row names must be
            strings.
        min_len: minimum number of values a row is padded to before it is
            split (default 8000, the previous hard-coded value).
        seq_len: width of each emitted sequence (default 200, the previous
            hard-coded value).

    Returns:
        ``{"healthy": {row_name: [seq, ...]}, "unhealthy": {...}}`` where
        every ``seq`` is a list of ``seq_len`` values.

    Raises:
        ValueError: if ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError("seq_len must be a positive integer")

    seqs = {"healthy": {}, "unhealthy": {}}
    for _, row in df.iterrows():
        vals = [x for x in row.values if x > 0]

        # Grows as n -> 2n + 1, so this terminates even when vals starts empty.
        while len(vals) < min_len:
            vals = vals + [0.0] + vals

        # Emit every complete window. The previous `while len(vals) > 200`
        # stopped one window early when exactly 200 values were left, dropping
        # a full-width sequence.
        n_full = len(vals) // seq_len
        chunks = [vals[k * seq_len:(k + 1) * seq_len] for k in range(n_full)]

        name = row.name
        key = "healthy" if "11A" in name else "unhealthy"
        seqs[key][name] = chunks

    return seqs

In [None]:
def format_h_uh(df, n):
    """Flatten each group of ``df`` into one list of rows tagged with ``n``.

    ``df`` maps a group name to a mapping of ``sample -> list of rows``. For
    every group the per-sample lists are concatenated in iteration order and
    ``n`` is appended as the last element of each row (a new list is built per
    row; the original row lists are not modified).

    The mapping is updated in place: each group's value is replaced by the
    flattened list, and the same ``df`` object is returned.

    Args:
        df: dict of ``group -> {sample: [row, ...]}`` where each row is a list.
        n: value appended to the end of every row.

    Returns:
        ``df`` itself, now ``group -> [row + [n], ...]``.
    """
    for group in df:
        df[group] = [
            seq + [n]
            for sample in df[group]
            for seq in df[group][sample]
        ]
    return df

# Get Sequenced Files

In [None]:
# Convert every ".tsv" table in multi_dir into fixed-width sequence rows.
#   * unhealthy rows of each input file -> unbatched_dir/convert_<stem>.csv
#   * healthy rows of all files, pooled -> unbatched_dir/healthy_<k>.csv,
#     at most 300 rows per file
# The last column of every written table is named "label".
data = {"healthy": [], "unhealthy": []}
healthy = []

# sorted(): os.listdir returns names in arbitrary order; a fixed order keeps
# the contents of the pooled healthy_<k>.csv chunks reproducible across runs.
for i, f in enumerate(sorted(os.listdir(multi_dir))):
    if i % 5 == 0:
        print(i)  # coarse progress indicator
    if ".tsv" not in f:
        continue

    # Last dash-separated token of the file stem; format_h_uh appends it to
    # every row, where it becomes the "label" column.
    n = f.split(".")[0].split("-")[-1]

    expr = pd.read_csv(multi_dir + f, sep="\t", index_col="Ensembl_ID")
    seqs = format_h_uh(get_seqs(format_df(expr)), n)

    # extend() instead of `healthy = healthy + ...`, which re-copied the whole
    # accumulated list for every input file.
    healthy.extend(seqs["healthy"])

    if not seqs["unhealthy"]:
        # An empty DataFrame has no columns, so renaming the last one would
        # raise IndexError; report and skip the file instead.
        print("\t", "no unhealthy rows, skipped:", f)
        continue

    df = pd.DataFrame(seqs["unhealthy"])
    cols = list(df.columns)
    cols[-1] = "label"
    df.columns = cols

    print("\t", df.shape)
    fn = "convert_" + ".".join(f.split(".")[:-1]) + ".csv"
    df.to_csv(unbatched_dir + fn, index=False)

print()

# Write the pooled healthy rows in chunks of up to 300. The previous
# `while len(healthy) > 300` loop never wrote the final chunk, silently
# discarding up to 300 healthy rows.
healthy_chunk_rows = 300
c = 0
for start in range(0, len(healthy), healthy_chunk_rows):
    df = pd.DataFrame(healthy[start:start + healthy_chunk_rows])
    cols = list(df.columns)
    cols[-1] = "label"
    df.columns = cols
    df.to_csv(unbatched_dir + "healthy_%d.csv" % c, index=False)
    c += 1

0
	 (5524, 201)
	 (28661, 201)
	 (86346, 201)
	 (83517, 201)
	 (52752, 201)
5
	 (67415, 201)
	 (69753, 201)
	 (63800, 201)
	 (180675, 201)
	 (10835, 201)
10
	 (47946, 201)
	 (7343, 201)
	 (78177, 201)
	 (9896, 201)
	 (45280, 201)
15
	 (90951, 201)
	 (88092, 201)
	 (28512, 201)
	 (13428, 201)
	 (28083, 201)
20
	 (24682, 201)
	 (79843, 201)
	 (28635, 201)
	 (39961, 201)
	 (26190, 201)
25
	 (72299, 201)
	 (72617, 201)
	 (9107, 201)
	 (81203, 201)
	 (18987, 201)
30
	 (11499, 201)



# Get Batched Files

In [None]:
# Scratch estimate. 5524 is the smallest row count among the per-file shapes
# printed by the previous cell and 31 is the number of shapes printed there.
# NOTE(review): 64 is presumably a candidate batch size -- confirm.
(5524 * 31) / 64

2675.6875

In [None]:
# Scratch estimate: x rows from each of 31 files, divided by 128.
# NOTE(review): x = 300 matches the healthy chunk size used when writing
# healthy_<k>.csv, and 128 is presumably a candidate batch size -- confirm.
x = 300
(x * 31) / 128

72.65625

In [None]:
# Scratch estimate: (9746 * 2) spread over 128 and then over 31.
# NOTE(review): the origin of 9746 is not shown in this notebook -- document
# where it comes from or remove this cell.
(9746*2)/128/31

4.912298387096774

In [None]:
# Scratch arithmetic: twice 9746 (same undocumented constant as the cell above).
9746*2

19492

## Unhealthy

In [None]:
# Build mixed-label batches from the per-file "convert_*.csv" tables.
#
# Every source file gets an integer label `c` (its position in the sorted
# listing) and contributes `rows_per_file` randomly sampled rows to each of
# the `n_batches` files batched_dir/batch_<i>.csv.
#
# Changes from the previous version of this cell:
#   * Rows are appended to the batch files instead of re-reading,
#     concatenating, reshuffling and rewriting a whole CSV on each of the
#     n_batches * n_files steps.
#   * The first source file opens every batch file in "w" mode, so re-running
#     the cell rebuilds the batches instead of appending to leftovers from an
#     earlier run. Batch files with an index >= n_batches left by an older,
#     larger run are not touched.
#   * Sampling uses a seeded RandomState and the listing is sorted, so both
#     the sampled rows and the file -> label mapping are reproducible.
#   * batched_dir is used instead of a hard-coded "batched/".
n_batches = 10000   # number of batch_<i>.csv files to produce
rows_per_file = 4   # rows each source file contributes to every batch
sampler = np.random.RandomState(42)

c = 0
expected_cols = None
for f in sorted(os.listdir(unbatched_dir)):
    if "convert_" in f:
        print(c)
        df = pd.read_csv(unbatched_dir + f)
        df = df.assign(label=c)

        # Appending without a header is only valid when every file has the
        # same columns in the same order; fail loudly otherwise.
        if expected_cols is None:
            expected_cols = list(df.columns)
        elif list(df.columns) != expected_cols:
            raise ValueError("column layout of %s differs from the first file" % f)

        first_file = c == 0
        for i in range(n_batches):
            tmp = df.sample(rows_per_file, replace=False, axis=0, random_state=sampler)
            tmp.to_csv(
                batched_dir + "batch_%d.csv" % i,
                mode="w" if first_file else "a",
                header=first_file,
                index=False,
            )
        c += 1

# Shuffle each finished batch once so rows are not grouped by source file.
if c > 0:
    for i in range(n_batches):
        batch_path = batched_dir + "batch_%d.csv" % i
        batch = pd.read_csv(batch_path)
        batch = batch.sample(frac=1, random_state=42)
        batch.to_csv(batch_path, index=False)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30


In [None]:
# Debug preview of `data`: print up to the first ten keys and, indented under
# each key, up to the first ten entries stored for it.
for _, group in zip(range(10), data):
    print(group)
    for _, entry in zip(range(10), data[group]):
        print("\t", entry)

healthy
unhealthy


In [None]:
# Copy the locally generated batch files to the "Batched Long" folder under
# data_dir. shutil.copytree returns the destination path, which the notebook
# displays as the cell output.
# NOTE(review): with these arguments copytree raises FileExistsError when the
# destination folder already exists, so this cell cannot be re-run as is.
shutil.copytree("batched/", data_dir + "Model Ready/Multi Cell/Batched Long/")

'/content/drive/MyDrive/Classes/CSCE 6810 Advanced Topics in Computational Life Science/Group Project/Dataset/Model Ready/Multi Cell/Batched Long/'