## Making the EBcache
+ should work similarly to EBscore, with these differences:
    * no tumor bam is needed

In [None]:
# get the code
import sys
import os
sys.path.append('../code')

# set the paths
# Home directories of the machines this notebook is run on, most preferred
# first. The first one that exists on the current machine is used; if none of
# them exists, the first entry is kept so the derived paths stay unchanged.
home_candidates = ('/Users/mahtin', '/Users/martinscience')
home = next(
    (candidate for candidate in home_candidates if os.path.isdir(candidate)),
    home_candidates[0],
)

# project folders (all located below the Dropbox work directory)
somvar_path = os.path.join(home, "Dropbox/Icke/Work/somVar")
testdata = os.path.join(somvar_path, "testdata")
ebdata = os.path.join(somvar_path, "tooldata/EBdata")
pon_path = os.path.join(testdata, "PON")

# static reference data (bed files, genome splits)
static = os.path.join(home, "Dropbox/Icke/Work/static")

## make EBcache matrix
+ to be used by bam2matrix in cache-mode

In [None]:
from file2matrix import PON2matrix

# Settings handed to PON2matrix: working folders, the target bed file,
# the read-quality cut-offs (MAPQ / Q), the fit penalty and the cache switch.
EBconfig = dict(
    temp_dir=os.path.join(ebdata, "temp"),
    pon_path=pon_path,
    zero_path=os.path.join(pon_path, "zero"),
    bed_file=os.path.join(static, "bed_files/SureSelect/hg38/SS_HAEv7_hg38_Padded.bed"),
    mawk_path="../shell",
    genome_split=f"{static}/genome/gatk/hg38/split",
    MAPQ=20,
    Q=25,
    fit_pen=0.5,
    use_cache=True,
)

# build the matrix for a single chromosome from its PON list
chrom = "chr19"
pon_list = "PON_chr19.txt"

PON2matrix(pon_list, chrom, EBconfig)

## computation has to be done per PON string
+ allows use of same function for both PONcache and matrix2AB
+ allows optimal caching of the results

In [None]:
from ebcache import PONmatrix2AB_multi

# _ = collapse_zeros(os.path.join(pon_path, "zero"), ponsize=5)

import pandas as pd  # read_csv is used below; pandas is not imported in the cells shown above

# Settings handed to PONmatrix2AB_multi.
PONAB_config = {
    "fit_pen": 0.5,
    "threads": 8,
    "pon_path": pon_path,
    "zero_path": os.path.join(pon_path, "zero"),
    "chunksize": 50000,
    "ZDfactor": 13,  # how much complexity remains after flattening the tumor-zero lines
    "min_zt": 2000,  # min number of zeroT lines to bother zerocache
}

# Read the chr19 PON matrix lazily in chunks (limited to the first 6000 rows).
# The chunk size comes from PONAB_config; the previous reference to a bare
# `config` name was undefined in this notebook and raised a NameError.
chr19_gen = pd.read_csv(
    os.path.join(pon_path, "matrix/chr19.pon.gz"),
    compression="gzip",
    sep="\t",
    chunksize=PONAB_config["chunksize"],
    nrows=6000,
)

# NOTE(review): the result holds chr19 data; the chr7_AB_df name is kept only
# because the following cells refer to it.
chr7_AB_df = PONmatrix2AB_multi(chr19_gen, config=PONAB_config)
chr7_AB_df

In [None]:
# Replace the existing index by a fresh 0..n-1 range (the old index is dropped).
df = chr7_AB_df.reset_index(drop=True)
# The .name of a row taken with iloc is that row's index label.
df.iloc[10].name

In [None]:
# The AB table was computed from matrix/chr19.pon.gz, so it is stored under the
# chr19 name; writing it to chr7.AB would put chr19 values into the chr7 cache.
chr7_AB_df.to_csv(os.path.join(pon_path, "ABcache/chr19.AB"), sep="\t", index=False)

In [None]:
from zerocache import collapse_zeros
# Call the zerocache implementation on <pon_path>/zero with ponsize=5 and
# ZDfactor=13, then display whatever it returns.
zdf = collapse_zeros(os.path.join(pon_path, "zero"), ponsize=5, ZDfactor=13)
zdf

### test collapse_zeros

In [None]:
from zerocache import flatten_zeros
from script_utils import show_output

def collapse_zeros(zero_path, ZDfactor=13):
    """
    Merge all zero files found in zero_path into a single zero.0.csv.

    Every regular file in zero_path whose name contains "zero" is read as a
    tab-separated table. The tables are concatenated one by one, de-duplicated
    on column "D" and column "D" is passed through flatten_zeros. The merged
    table is sorted by "D" and written to <zero_path>/zero.0.csv. Only after
    that file has been written are the other source files removed.

    Args:
        zero_path: directory containing the zero files.
        ZDfactor: factor forwarded to flatten_zeros.

    Returns:
        The merged DataFrame, or None if no zero file exists or none of the
        files could be loaded (in the latter case nothing is deleted).
    """
    import pandas as pd  # local import: pandas is not imported in the cells shown above

    collapsed_file = os.path.join(zero_path, "zero.0.csv")

    # get all the zero files
    show_output(f"Collapsing all zero files in {zero_path} into zero.0.csv")
    zero_files = [
        os.path.join(zero_path, file)
        for file in os.listdir(zero_path)
        if os.path.isfile(os.path.join(zero_path, file)) and "zero" in file
    ]
    if not zero_files:
        show_output(f"No zerofiles found in {zero_path}!")
        return None

    zero_df = None
    for file in zero_files:
        show_output(f"Cleaning up zero file {file}", time=False)
        try:
            zdf = pd.read_csv(file, sep="\t")
            # add to the running table and drop duplicates
            merged = zdf if zero_df is None else pd.concat([zero_df, zdf])
            merged = merged.drop_duplicates("D")
            # reduce complexity using condense factor
            merged.loc[:, "D"] = flatten_zeros(merged["D"], ZDfactor=ZDfactor)
        except Exception as err:
            # best effort per file: report the problem and go on with the rest
            show_output(
                f"{file} could not be loaded ({err})", color="warning", time=False
            )
            continue
        # only keep the result once the file went through all steps
        zero_df = merged

    if zero_df is None:
        # nothing usable was read; keep the files instead of deleting them
        show_output(
            f"None of the zero files in {zero_path} could be loaded!",
            color="warning",
        )
        return None

    # flatten_zeros runs after the in-loop de-duplication and may map several
    # D values onto the same one, so make D unique once more before sorting
    zero_df = zero_df.drop_duplicates("D").sort_values("D")

    zero_df.to_csv(collapsed_file, sep="\t", index=False)

    # remove the sources only now that the collapsed file is on disk;
    # an already existing zero.0.csv has just been overwritten and must stay
    for file in zero_files:
        if file != collapsed_file:
            os.remove(file)

    show_output(f"Written collapsed zerofile ({len(zero_df.index)} lines) to {zero_path}/zero.0.csv")
    return zero_df

In [None]:
# With the cells run in order, this calls the collapse_zeros defined in this
# notebook (it rebinds the name imported from zerocache earlier).
zdf = collapse_zeros(os.path.join(pon_path, "zero"), ZDfactor=13)