In [14]:
import pandas as pd
from rbm_torch.utils import data_prep as dp
from rbm_torch.utils.utils import fasta_read
import rbm_torch.analysis.analysis_methods as am
from rbm_torch.utils.seq_utils import prune_similar_sequences

In [None]:
# Build the copy-number table for rounds r2-r5 and cache it as CSV; a later cell
# reloads "./exo/exo_ct.csv" instead of recomputing it.
# NOTE(review): this cell reads the rounds from "./exo/" while the following cell
# reads them from "./exo/raw_rounds/" -- confirm which directory holds the round files.
exo_df = am.fetch_data(["r2", "r3", "r4", "r5"], dir="./exo/", threads=6, molecule="dna")
# presumably one row per sequence with one count column per round -- verify against
# dp.copynum_topology_faster
exo_ct = dp.copynum_topology_faster(exo_df, ["r2", "r3", "r4", "r5"])
exo_ct.to_csv("./exo/exo_ct.csv", index=False)

In [4]:
# Load rounds r2-r5 from the raw round files. The normalization cell below relies on
# the "round" and "copy_num" columns of the returned frame.
exo_df = am.fetch_data(["r2", "r3", "r4", "r5"], dir="./exo/raw_rounds/", threads=6, molecule="dna")

Process Time 0.5521154403686523
Process Time 0.2996940612792969
Process Time 0.17485761642456055
Process Time 0.12303280830383301


In [6]:
# Per-round scaling factors for reads-per-million normalization: the total copy
# number observed in each round, divided by one million.
# (fasta_read has since gained an option to do this normalization automatically.)
rpm_dict = {
    label: exo_df.loc[exo_df["round"] == label, "copy_num"].sum() / 1000000
    for label in ["r2", "r3", "r4", "r5"]
}

In [7]:
# Inspect the per-round scaling factors (total copy number / 1e6).
rpm_dict

{'r2': 9.232675, 'r3': 10.478473, 'r4': 10.173198, 'r5': 8.49302}

In [10]:
import pandas as pd
import numpy as np
# Reload the cached copy-number table from disk, so re-running this cell always
# starts from the raw counts and never normalizes twice.
exo_pd = pd.read_csv("./exo/exo_ct.csv")

# Columns holding the per-round counts; each must also be a key of rpm_dict.
round_cols = ["r2", "r3", "r4", "r5"]

# Normalize raw counts to reads per million using the per-round totals in rpm_dict.
for r in round_cols:
    exo_pd[r] = exo_pd[r].div(rpm_dict[r])

# Highest normalized count seen in any round. DataFrame.max skips NaN by default,
# which matches np.nanmax applied row by row but runs as one vectorized pass.
# (A mean across rounds can be added the same way: exo_pd[round_cols].mean(axis=1).)
exo_pd["max"] = exo_pd[round_cols].max(axis=1)


def fold(dataframe, cols):
    """Add pairwise fold-change columns for every ordered pair of ``cols``.

    For each pair in which ``col`` comes before ``dol`` in ``cols``, a column named
    ``"{dol}/{col}_fold"`` is written, holding ``dataframe[dol] / dataframe[col]``.

    Args:
        dataframe: pandas DataFrame containing every column named in ``cols``.
            It is modified in place.
        cols: ordered column names; earlier entries act as denominators for
            later ones.

    Returns:
        The same ``dataframe`` object, with the fold columns added.

    Notes:
        The division is vectorized. A NaN in either operand yields NaN, which is
        what the previous per-row ``!= np.nan`` guard was meant to ensure (that
        comparison is always True, so it never filtered anything). A zero
        denominator yields ``inf`` (or NaN for 0/0) instead of possibly raising.
    """
    cols = list(cols)  # allow any iterable and make slicing below safe
    for cid, col in enumerate(cols):
        # Only pair each column with the ones that come after it.
        for dol in cols[cid + 1:]:
            dataframe[f"{dol}/{col}_fold"] = dataframe[dol] / dataframe[col]

    return dataframe


# Add the six later-round / earlier-round ratio columns (e.g. "r5/r4_fold").
# fold() writes into exo_pd itself and returns the same object.
exo_pd = fold(exo_pd, ["r2", "r3", "r4", "r5"])

In [11]:
# Preview the normalized counts together with the "max" and fold columns.
exo_pd.head()

Unnamed: 0,sequence,r2,r3,r4,r5,max,r3/r2_fold,r4/r2_fold,r5/r2_fold,r4/r3_fold,r5/r3_fold,r5/r4_fold
0,TGCGGGGCAATTTGAACACACCCGCAATCCCAGTTTGA,3.465951,2.958446,,,3.465951,0.853574,,,,,
1,ATGGATCACAAGGTGTTTCTGTTTTTTTTGGGGTAA--,0.108311,0.095434,0.098298,0.235487,0.235487,0.881109,0.907549,2.17418,1.030008,2.467549,2.395661
2,TTTGAACGTCCGCAGCTGCAATCGGGCGCTTAGCCA--,2.057908,1.240639,,,2.057908,0.602864,,,,,
3,TGACGTAGTGACTGGATCTACACATTTTTCTTACT---,1.191421,0.477169,0.098298,0.117744,1.191421,0.400504,0.082504,0.098826,0.206002,0.246755,1.19783
4,ATTAAGTTGGTAGCCGCCACCATGTTTGTCAGATC---,0.974799,0.381735,4.71828,,4.71828,0.391604,4.840261,,12.360093,,


In [22]:
from copy import copy
# Keep sequences enriched in the final round: more than 2x from r4 to r5, or more
# than 3x from r3 to r5. Comparisons against NaN are False, so a row with a missing
# ratio can only qualify through the other ratio.
is_enriched = (exo_pd["r5/r4_fold"] > 2) | (exo_pd["r5/r3_fold"] > 3)
# Explicit copy so that adding the "fold" column never writes into exo_pd.
enriched = exo_pd[is_enriched].copy()

# Enrichment score per sequence: the larger of the two ratios. DataFrame.max skips
# NaN by default, matching np.nanmax without a Python call per row.
enriched["fold"] = enriched[["r5/r4_fold", "r5/r3_fold"]].max(axis=1)
print(len(enriched))

23382


In [23]:
# Order the enriched sequences by fold, strongest first, before pruning.
# presumably the pruning keeps the first sequence it meets within each group of
# near-duplicates, so the sort decides which one survives -- confirm in
# prune_similar_sequences
enriched.sort_values(by="fold", ascending=False, inplace=True)
enriched_trimmed = prune_similar_sequences(
    enriched,
    hamming_threshold=4,
    molecule="dna",
)

Kept 20919 of 23382


In [24]:
# Write the pruned, enriched sequences to FASTA.
# presumably count_key="fold" stores each sequence's fold value in the slot normally
# used for its copy number -- confirm in dp.dataframe_to_fasta
dp.dataframe_to_fasta(enriched_trimmed, "./exo/enriched.fasta", count_key="fold")

In [25]:
# Read the file written above back in; `folds` is what gets standardized in the next
# step. NOTE(review): the meaning of `chars` and `q` is not visible here -- see
# fasta_read for the exact return contract.
seqs, folds, chars, q = fasta_read("./exo/enriched.fasta", "dna", threads=6)

Process Time 0.024022579193115234


In [36]:
# Turn the fold values into standardized weights; the helper is also given
# "./exo/enriched" as its out_plots argument.
# NOTE(review): dividers=[10] with divider_type="percentile" and
# target_scaling=[2.49] are hand-picked values -- see dp.standardize_affinities for
# their exact meaning before changing them.
std_folds = dp.standardize_affinities(
    folds,
    out_plots="./exo/enriched",
    scale="log",
    dividers=[10],
    target_scaling=[2.49],
    divider_type="percentile",
)

In [38]:
# Persist the standardized fold weights under the "./exo/en_fold_st" name.
# NOTE(review): any file extension is chosen inside dp.make_weight_file -- confirm
# the resulting path before pointing training configs at it.
dp.make_weight_file("./exo/en_fold_st", std_folds)