In [71]:
import pandas as pd
import numpy as np
import os
import glob
import re

In [72]:
# Directory that holds the output files written by the cpp_code.
# Only files whose names end with ".out" are expected to be picked up from it.
# NOTE: this name shadows the builtin dir(); it is kept because other cells refer to it.
dir = "out"

In [73]:
# glob returns its matches in arbitrary (filesystem dependent) order; sorting keeps the
# row order, and therefore the integer index of the concatenated frame, reproducible
all_files = sorted(glob.glob(os.path.join(dir, "*.out")))
if not all_files:
    # without this guard pd.concat fails with the unhelpful "No objects to concatenate"
    raise FileNotFoundError("no '*.out' files found in directory %r" % dir)
df = pd.concat((pd.read_csv(f, header=None) for f in all_files), ignore_index=True)
# seq_name: from the fasta or fastq file
# frame: [0, ..., k]
# repeat_representation: for example AG(GAC)_4 TGT
# score_type: run c++ main -h to see the available score types
# score: the score of the repeat_representation
# was_too_long: 1 if the input seq was longer than the configured max length, see c++ main -h
df.columns = ['seq_name', 'frame', 'repeat_representation', 'score_type', 'score', 'was_too_long']

In [74]:
def _strip_flanks(representation):
    """Cut the flanking sequence off a repeat representation.

    For 'AG(GAC)_4 TGT' this returns '(GAC)_4': everything from the first "("
    up to (not including) the last space.

    Edge cases:
    - no "(" at all (no tandem repeat in the representation) -> '' instead of
      the ValueError that str.index would raise
    - no space after the first "(" -> everything from the "(" to the end,
      instead of silently dropping the last character via a -1 slice bound
    """
    start = representation.find("(")
    if start == -1:
        return ""
    end = representation.rfind(" ")
    if end < start:
        return representation[start:]
    return representation[start:end]


# extract the values from the strings
# (the raw fields are presumably of the form "frame: 3", "score: 12", ... - confirm against the c++ output)
df['frame'] = df['frame'].str.extract(r'frame[:\s]+(\d+)', expand=False).astype(int)
df['score_type'] = df["score_type"].str.extract(r'score_type[:\s]+(\w+)', expand=False)
df['score'] = df['score'].str.extract(r'score[:\s]+(\d+)', expand=False).astype(int)
# the flag is captured as text, converted to int and then to bool
df['was_too_long'] = df['was_too_long'].str.extract(
    r'seqlen too long[:\s]+(\w+)', expand=False).astype(int).astype(bool)
# creating new columns
# no_flanks: the repeat representation without the sequence before the first
# repeat and after the last space
df["no_flanks"] = df["repeat_representation"].apply(_strip_flanks)


In [75]:
# these are currently not used
def color_negative_red(val):
    """Build a CSS style string for a single cell value.

    Values below 1 get white text on a red background, every other value
    gets black text on a white background.
    """
    font_color, background_color = ('white', 'red') if val < 1 else ('black', 'white')
    return 'color: %s; background-color: %s' % (font_color, background_color)


def highlight_chars_with_A(text):
    """Wrap every upper-case 'A' of text in a red-background HTML span.

    All other characters are copied unchanged; the result is one string.
    """
    return ''.join(
        '<span style="color: white; background-color: red;">%s</span>' % char
        if char == 'A' else char
        for char in text
    )
# df.style.applymap(color_negative_red, subset=["frame"])
# df['repeat_representation'] = df['repeat_representation'].apply(highlight_chars_with_A)


In [160]:
# Repeat structures to look for, one list of elements per category.
# TODO: maybe also allow numbers as minimum requirements
categories = [
    ["(CAG)", "TAG", "(CAG)"],
    ["(CAG)", "CAA", "(CAG)"],
]

In [171]:
def seq_conforms_with_category(row, categories, max_interruption_len):
    """Check which categories occur in a repeat representation.

    Parameters
    ----------
    row : mapping (dict or pd.Series)
        Must provide the key "no_flanks" holding a repeat representation
        such as "(CAG)_3 TAG(CAG)_4".
    categories : list of list of str
        Each category is a sequence of elements. An element starting with "("
        is a tandem repeat unit and matches "<unit>_<count>" followed by
        whitespace or the end of the string; every other element is matched
        literally.
    max_interruption_len : int
        Currently unused; kept so existing calls stay valid.

    Returns
    -------
    pd.Series
        One 0/1 flag per category (1 if the category occurs at least once),
        followed by a bool that is True only if at least one category was
        found and every tandem repeat of the sequence lies inside a hit.
    """
    seq = row["no_flanks"]
    found_categories = [0] * len(categories)
    covered_spans = []  # (start, end) of every hit, over all categories

    for i, category in enumerate(categories):
        parts = []
        for element in category:
            if element.startswith("("):
                # the last repeat of the sequence has no trailing whitespace
                # (the flank, including the separating space, was cut off),
                # so the end of the string must be accepted as well
                parts.append(re.escape(element) + r"_\d+(?:\s|\Z)")
            else:
                parts.append(re.escape(element))
        # the lookahead makes finditer report overlapping hits, e.g. two hits
        # sharing the middle repeat in "(CAG)_3 TAG(CAG)_4 TAG(CAG)_4"
        pattern = "(?=(" + "".join(parts) + "))"
        for hit in re.finditer(pattern, seq):
            found_categories[i] = 1
            covered_spans.append(hit.span(1))

    # every "(" opens a tandem repeat; each of them has to lie inside a hit,
    # including repeats located between two hits
    repeat_starts = [pos for pos, char in enumerate(seq) if char == "("]
    completely_defined = bool(covered_spans) and all(
        any(start <= pos < end for start, end in covered_spans)
        for pos in repeat_starts
    )
    return pd.Series(found_categories + [completely_defined])

# quick check: "CAG" contains no "(" and therefore no tandem repeat, so no category can match
list(seq_conforms_with_category({"no_flanks":"CAG"}, categories, 6))


[0, 0, False]

In [173]:
max_interruption_len = 6
# one column per category, named after the concatenated category elements
categories_as_col_names = ["".join(category) for category in categories]

# TODO: how to recognize spurious repeats, i.e. small ones far away from the repeat of interest?

# one row of flags per input row: a flag per category plus the "completly_defined" flag
new_cols = df.apply(seq_conforms_with_category, axis=1, args=(categories, max_interruption_len))
new_cols.columns = categories_as_col_names + ["completly_defined"]

# columns that already exist in df (cell was run before) are overwritten in place,
# only the remaining ones are appended
rerun_columns = [name for name in new_cols.columns if name in df.columns]
for name in rerun_columns:
    df[name] = new_cols[name]
new_cols = new_cols.drop(columns=rerun_columns)

df = pd.concat([df, new_cols], axis=1)


In [163]:
# rows in which both categories were found
both_categories_found = (df["(CAG)TAG(CAG)"] == 1) & (df["(CAG)CAA(CAG)"] == 1)
df.loc[both_categories_found, ["no_flanks", "completly_defined", *categories_as_col_names]]

Unnamed: 0,no_flanks,completly_defined,(CAG)TAG(CAG),(CAG)CAA(CAG)
13,(GCT)_2 CCTGGGTGTAGTGAGATGTCTCCAGCCAGGGCCAAG(C...,False,1,1
235,(CAG)_3 TAG(CAG)_4 TAG(CAG)_4 CAA(CAG)_5 CAA(C...,False,1,1
