In [189]:
import pandas as pd
import numpy as np
import os
import glob
import re

In [190]:
# Path to the directory that contains the output files from the cpp_code.
# Only files whose names end with ".out" are picked up by the glob in the loading cell.
# NOTE(review): the name `dir` shadows Python's builtin dir() for the rest of the
# notebook; consider renaming it together with its use in the loading cell.
dir = 'out'

In [191]:
# Collect every *.out file of the configured directory. glob returns paths in an
# order that depends on the file system, so sort them to make the row order of the
# combined frame reproducible between runs and machines.
all_files = sorted(glob.glob(os.path.join(dir, "*.out")))
if not all_files:
    # Fail with a readable message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(f"no '*.out' files found in directory {dir!r}")

# The files have no header row; stack them into one frame with a fresh 0..n-1 index.
df = pd.concat((pd.read_csv(f, header=None) for f in all_files), ignore_index=True)

# seq_name: from the fasta or fastq file
# frame: [0, ..., k]
# repeat_representation: for example AG(GAC)_4 TGT
# score_type: run c++ main -h to see the available score types
# score: the score of the repeat_representation
# was_too_long: 1 if the input seq was longer than the configured max length, see c++ main -h
df.columns = ['seq_name', 'frame', 'repeat_representation', 'score_type', 'score', 'was_too_long']

In [192]:
# Each raw column still holds a labelled string; pull the payload out with a regex
# and convert it to its final dtype.
frame_digits = df['frame'].str.extract(r'frame[:\s]+(\d+)', expand=False)
df['frame'] = frame_digits.astype(int)

score_type_name = df["score_type"].str.extract(r'score_type[:\s]+(\w+)', expand=False)
df['score_type'] = score_type_name

score_digits = df['score'].str.extract(r'score[:\s]+(\d+)', expand=False)
df['score'] = score_digits.astype(int)

too_long_flag = df['was_too_long'].str.extract(r'seqlen too long[:\s]+(\w+)', expand=False)
df['was_too_long'] = too_long_flag.astype(int).astype(bool)


# creating new columns
def _strip_flanks(representation):
    """Cut the representation down to the part from the first "(" up to and including the last space."""
    first_paren = representation.index("(")
    last_space = representation.rfind(" ")
    # TODO: maybe dont use the +1 and adapt the pattern in seq_conforms_with_category accordingly
    return representation[first_paren:last_space + 1]


df["no_flanks"] = df["repeat_representation"].apply(_strip_flanks)


In [193]:
# these are currently not used
def color_negative_red(val):
    """Build a CSS declaration for a cell value.

    Values below 1 get white text on a red background; every other value gets
    black text on a white background.
    """
    font_color, background_color = ('white', 'red') if val < 1 else ('black', 'white')
    return 'color: %s; background-color: %s' % (font_color, background_color)


def highlight_chars_with_A(text):
    """Return ``text`` with every upper-case ``A`` wrapped in a red-background HTML span.

    All other characters are copied through unchanged.
    """
    highlighted = '<span style="color: white; background-color: red;">%s</span>'
    return ''.join(highlighted % ch if ch == 'A' else ch for ch in text)
# df.style.applymap(color_negative_red, subset=["frame"])
# df['repeat_representation'] = df['repeat_representation'].apply(highlight_chars_with_A)


In [209]:
# Each category is a (CAG) repeat, one interrupting triplet, and another (CAG) repeat.
# also maybe with numbers as minimum requirements
categories = [["(CAG)", interruption, "(CAG)"] for interruption in ("TAG", "CAA", "CCG")]

In [210]:
def seq_conforms_with_category(row, categories, max_interruption_len):
    """Check which categories occur in a repeat representation and whether they explain all of it.

    Parameters
    ----------
    row : mapping
        Anything indexable with ``"no_flanks"`` (a DataFrame row or a dict); the
        value is the repeat representation string, e.g. ``"(CAG)_2 TAG(CAG)_12 "``.
    categories : list of list of str
        Each category is a sequence of parts. A part starting with ``"("`` stands
        for a repeat unit and is matched as ``<unit>_<count><whitespace>``; any
        other part is inserted into the regular expression as written.
    max_interruption_len : int
        Currently unused; accepted so existing callers keep working.

    Returns
    -------
    pandas.Series
        One 0/1 flag per category (1 if the category matched at least once),
        followed by a bool that is True only if at least one category matched and
        every parenthesis in the string lies inside a matched span, i.e. no repeat
        unit is left unexplained before, after, or between the matches.
    """
    seq = row["no_flanks"]
    found_categories = [0] * len(categories)
    covered_spans = []  # (start, end) of every category hit, end exclusive

    for i, category in enumerate(categories):
        # Raw strings keep the regex escapes from being read as (invalid) string escapes.
        parts = [
            p.replace("(", r"\(").replace(")", r"\)") + r"_\d+\s" if p.startswith("(") else p
            for p in category
        ]
        # The lookahead makes hits overlappable: the closing repeat of one hit can
        # be the opening repeat of the next one.
        pattern = "(?=(" + "".join(parts) + "))"
        for hit in re.finditer(pattern, seq):
            found_categories[i] = 1
            # The match itself is zero-width; the real extent is that of group 1.
            covered_spans.append(hit.span(1))

    if not covered_spans:
        completely_defined = False
    else:
        # A parenthesis outside every matched span belongs to a repeat unit that no
        # category accounts for, wherever in the string it sits.
        completely_defined = all(
            any(start <= pos < end for start, end in covered_spans)
            for pos, char in enumerate(seq)
            if char in "()"
        )

    return pd.Series(found_categories + [completely_defined])

# Quick manual check of the categorisation on a few hand-written representations;
# one result list is printed per example.
example_representations = (
    "CGA(CAG)_2 TAG(CAG)_12 TG",
    "CGA(CAG)_2 TAG(CAG)_12 CAA(CAG)_2 TG",
    "CGA(CAG)_2 TAG(CAG)_12 CAA(CAG)_2 TG(CAG)_2 ",
    "(CAG)_3 TAG(CAG)_4 TAG(CAG)_4 CAA(CAG)_5 CAA(CAG)_3 ",
)
for example in example_representations:
    print(list(seq_conforms_with_category({"no_flanks": example}, categories, 6)))


[1, 0, 0, True]
[1, 1, 0, True]
[1, 1, 0, False]
[1, 1, 0, True]


In [211]:
# One result column per category, named after the concatenated parts of the category.
categories_as_col_names = ["".join(c) for c in categories]
max_interruption_len = 6

# how to recognize spurious repeats? that are small and far away from the repeat of interest?

# Categorise every row; the extra arguments are forwarded to the function after the row.
new_cols = df.apply(seq_conforms_with_category, axis=1, args=(categories, max_interruption_len))
new_cols.columns = categories_as_col_names + ["completely_defined"]

# Columns that already exist in df (e.g. from an earlier run of this cell) are
# overwritten in place; only the remaining ones are appended on the right.
already_present = [name for name in new_cols.columns if name in df.columns]
for name in already_present:
    df[name] = new_cols[name]
new_cols = new_cols.drop(columns=already_present)

df = pd.concat([df, new_cols], axis=1)


In [212]:
# Do not truncate long cell contents when the frame is rendered.
pd.set_option('display.max_colwidth', None)
# df[["no_flanks", "completely_defined", *categories_as_col_names]][(df["(CAG)TAG(CAG)"]==1) & (df["(CAG)CAA(CAG)"]==1)]
# Show the rows in which both the TAG and the CAA category were found.
has_tag_interruption = df["(CAG)TAG(CAG)"] == 1
has_caa_interruption = df["(CAG)CAA(CAG)"] == 1
df[has_tag_interruption & has_caa_interruption]

Unnamed: 0,seq_name,frame,repeat_representation,score_type,score,was_too_long,no_flanks,(CAG)TAG(CAG),(CAG)CAA(CAG),completely_defined,(CAG)CCG(CAG)
13,@SRR23922262.828599.1 828599 length=151,1,G(GCT)_2 CCTGGGTGTAGTGAGATGTCTCCAGCCAGGGCCAAG(CAG)_3 TAG(CAG)_4 TAG(CAG)_4 CAA(CAG)_5 CAA(CAG)_3 CAAAGGGTCTGTGTTGCTAAGAGGCTTTTGGTTTCTTTC,CAG,15,False,(GCT)_2 CCTGGGTGTAGTGAGATGTCTCCAGCCAGGGCCAAG(CAG)_3 TAG(CAG)_4 TAG(CAG)_4 CAA(CAG)_5 CAA(CAG)_3,1,1,False,0
235,@SRR23922262.15732653.2 15732653 length=151,0,CCTGGGTGTAGTGAGATGTCTCCAGCCAGGGCCAAG(CAG)_3 TAG(CAG)_4 TAG(CAG)_4 CAA(CAG)_5 CAA(CAG)_3 CAAAGGGTCTGTGTTGCTAAGAGGCTTTTGGTTTCTTTCCCTCCAC,CAG,15,False,(CAG)_3 TAG(CAG)_4 TAG(CAG)_4 CAA(CAG)_5 CAA(CAG)_3,1,1,True,0


In [227]:
# Report how many of the rows have completely_defined equal to 1.
n_samples = len(df)
n_completely_defined = int((df['completely_defined'] == 1).sum())
print(f"from all samples ({n_samples}) {n_completely_defined} were categorizes completely")

from all samples (240) 23 were categorizes completely
