In [1]:
import pandas as pd
from rbm_torch.utils import data_prep as dp
from rbm_torch.utils.utils import fasta_read
import rbm_torch.analysis.analysis_methods as am
from rbm_torch.utils.seq_utils import prune_similar_sequences

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
exo_df = am.fetch_data(["r2", "r3", "r4", "r5"], dir="./exo/", threads=6, molecule="dna")
exo_ct = dp.copynum_topology_faster(exo_df, ["r2", "r3", "r4", "r5"])
exo_ct.to_csv("./exo/exo_ct.csv", index=False)

In [2]:
exo_df = am.fetch_data(["r2", "r3", "r4", "r5"], dir="./exo/raw_rounds/", threads=6, molecule="dna")

Process Time 0.5721981525421143
Process Time 0.295612096786499
Process Time 0.1601245403289795
Process Time 0.09906506538391113


In [3]:
rpm_dict = {}  # normalize counts to reads per million (added option in fasta_read to do this automatically)
for r in ["r2", "r3", "r4", "r5"]:
    round_data = exo_df[exo_df["round"] == r]
    rpm_dict[r] = round_data["copy_num"].sum()/1000000

In [7]:
rpm_dict

{'r2': 9.232675, 'r3': 10.478473, 'r4': 10.173198, 'r5': 8.49302}

In [5]:
import pandas as pd
import numpy as np
exo_pd = pd.read_csv("./exo/exo_ct.csv")

# normalize counts
for r in ["r2", "r3", "r4", "r5"]:
    exo_pd[r] = exo_pd[r].div(rpm_dict[r])

# exo_pd["mean"] = exo_pd.apply(lambda row : np.nanmean(np.asarray([row[x] for x in ["r2", "r3", "r4", "r5"]])), axis=1)
exo_pd["max"] = exo_pd.apply(lambda row : np.nanmax(np.asarray([row[x] for x in ["r2", "r3", "r4", "r5"]])), axis=1)


# def fold(dataframe, cols):
#     """ create fold column as col2/col1 for all columns"""
#     for cid, col in enumerate(cols):
#         for did, dol in enumerate(cols):
#             if cid >= did:
#                 continue
#             else:
#                 dataframe[f"{dol}/{col}_fold"] = dataframe.apply(lambda row: row[dol]/row[col] if row[col] != np.nan and row[dol] != np.nan else np.nan, axis=1)
#
#     return dataframe

# exo_pd = fold(exo_pd, ["r2", "r3", "r4", "r5"])

def enrichment_averge(df, round_names, min_diff=1, max_diff=None, diff_weights=None, round_weights=None):
    round_number = len(round_names)

    if max_diff is None:
        max_diff = round_number-1

    if diff_weights is None:
        diff_weights = [1. for x in range(min_diff, max_diff+1)]

    if round_weights is None:
        round_weights = [1. for x in range(len(round_names))]

    # first let's remove all the nan values in the dataframe, set nan values as the minimum normalized count for each round
    for r in round_names:
        df[r] = df[r].fillna(df[r].min())

    # Get fold value for round differences
    fold_keys = {diff: [] for diff in range(min_diff, max_diff+1)}
    for i in range(round_number):
        for j in range(round_number):
            if i >= j or j - i < min_diff or j - i > max_diff:
                continue
            fold_column_name = f"fold_{round_names[j]}v{round_names[i]}"
            fold_keys[j-i].append(fold_column_name)
            # fold_diffs.append(j-i)
            df[fold_column_name] = df[round_names[j]]/df[round_names[i]] * (round_weights[j] + round_weights[i])

    diff_keys = []
    for i in range(min_diff, max_diff+1):
        diff_avg_key = f"fold_diff{i}_avg"
        df[diff_avg_key] = df[fold_keys[i]].sum(axis=1).div(len(fold_keys[i])).mul(diff_weights[i-1])
        diff_keys.append(diff_avg_key)

    df["Final_Fold_Avg"] = df[diff_keys].sum(axis=1).div(len(diff_keys))

    return df

exo_pd = enrichment_averge(exo_pd, ["r2", "r3", "r4", "r5"], min_diff=1, max_diff=None, diff_weights=[1, 1.2, 1.3], round_weights=[1, 1.1,  1.15, 1.2])


In [9]:
exo_pd["Fitness_Value"] = dp.scale_values_np(dp.log_scale(exo_pd["Final_Fold_Avg"].tolist(), base=1.0), min=0.01, max=1.0)

exo_pd.sort_values("Fitness_Value", ascending=False, inplace=True)
dp.dataframe_to_fasta(exo_pd, "./exo/fold_avg_all.fasta", count_key="Fitness_Value")

In [7]:
exo_pd.index.__len__()

242496

In [6]:
exo_pd.head()

Unnamed: 0,sequence,r2,r3,r4,r5,max,fold_r3vr2,fold_r4vr2,fold_r5vr2,fold_r4vr3,fold_r5vr3,fold_r5vr4,fold_diff1_avg,fold_diff2_avg,fold_diff3_avg,Final_Fold_Avg
0,TGCGGGGCAATTTGAACACACCCGCAATCCCAGTTTGA,3.465951,2.958446,0.098298,0.117744,3.465951,1.792506,0.060976,0.074737,0.074759,0.091538,2.814902,1.560722,0.091508,0.097159,0.58313
1,ATGGATCACAAGGTGTTTCTGTTTTTTTTGGGGTAA--,0.108311,0.095434,0.098298,0.235487,0.235487,1.850329,1.95123,4.783195,2.317517,5.675364,5.629803,3.265883,4.575956,6.218153,4.686664
2,TTTGAACGTCCGCAGCTGCAATCGGGCGCTTAGCCA--,2.057908,1.240639,0.098298,0.117744,2.057908,1.266014,0.102696,0.125874,0.178271,0.218283,2.814902,1.419729,0.192588,0.163636,0.591984
3,TGACGTAGTGACTGGATCTACACATTTTTCTTACT---,1.191421,0.477169,0.098298,0.117744,1.191421,0.841058,0.177385,0.217418,0.463503,0.567536,2.814902,1.373154,0.446953,0.282643,0.700917
4,ATTAAGTTGGTAGCCGCCACCATGTTTGTCAGATC---,0.974799,0.381735,4.71828,0.117744,4.71828,0.822368,10.406561,0.265733,27.81021,0.70942,0.058644,9.563741,6.669589,0.345453,5.526261


In [22]:
from copy import copy
enriched = copy(exo_pd[(exo_pd["r5/r4_fold"] > 2) | (exo_pd["r5/r3_fold"] > 3)])
enriched["fold"] = enriched.apply(lambda row: np.nanmax(np.asarray([row["r5/r4_fold"], row["r5/r3_fold"]])), axis=1)
print(enriched.index.__len__())

23382


In [23]:
enriched.sort_values("fold", ascending=False, inplace=True)
enriched_trimmed = prune_similar_sequences(enriched, hamming_threshold=4, molecule="dna")

Kept 20919 of 23382


In [24]:
dp.dataframe_to_fasta(enriched_trimmed, "./exo/enriched.fasta", count_key="fold")

In [25]:
seqs, folds, chars, q = fasta_read("./exo/enriched.fasta", "dna", threads=6)

Process Time 0.024022579193115234


In [36]:
std_folds = dp.standardize_affinities(folds, out_plots="./exo/enriched", scale="log", dividers=[10], target_scaling=[2.49], divider_type="percentile")

In [38]:
dp.make_weight_file("./exo/en_fold_st", std_folds)