In [7]:
import pandas as pd
import numpy as np
import os
import glob
from eval_methods import *

In [8]:
# Path to the directory that contains the output files from the cpp_code
# (filenames must end with .out). The loading cell joins it with the "*.out"
# glob pattern to find the files.
# NOTE(review): the name `dir` shadows Python's builtin dir(); it is kept here
# because the loading cell reads it under this name.
dir = '../out'

In [9]:
# Collect every "*.out" file in the configured directory. glob returns paths in
# arbitrary (filesystem-dependent) order, so sort them to make the row order --
# and therefore the integer index of the concatenated frame -- reproducible.
all_files = sorted(glob.glob(os.path.join(dir, "*.out")))
if not all_files:
    # pd.concat would otherwise fail with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"no '*.out' files found in {dir!r}")

# The files are read without a header row and stacked under one fresh 0..n-1 index.
df = pd.concat((pd.read_csv(f, header=None) for f in all_files), ignore_index=True)

# Column meaning:
# seq_name: from the fasta or fastq file
# frame: [0, ..., k]
# repeat_representation: for example AG(GAC)_4 TGT
# score_type: run c++ main -h to see the available score types
# score: the score of the repeat_representation
# was_too_long: 1 if the input seq was longer than the configured max length, see c++ main -h
df.columns = ['seq_name', 'frame', 'repeat_representation', 'score_type', 'score', 'was_too_long']

In [10]:
# Extract the values from the strings: each regex captures the value that
# follows the field label, and the capture is then cast to its final dtype.
frame_text = df["frame"].str.extract(r'frame[:\s]+(\d+)', expand=False)
df["frame"] = frame_text.astype(int)

df["score_type"] = df["score_type"].str.extract(r'score_type[:\s]+(\w+)', expand=False)

score_text = df["score"].str.extract(r'score[:\s]+(\d+)', expand=False)
df["score"] = score_text.astype(int)

too_long_text = df["was_too_long"].str.extract(r'seqlen too long[:\s]+(\w+)', expand=False)
# Cast via int first so the captured text becomes a proper boolean flag.
df["was_too_long"] = too_long_text.astype(int).astype(bool)


def _strip_flanks(representation):
    """Return the slice from the first "(" up to and including the last space."""
    first_paren = representation.index("(")
    # TODO: maybe dont use the +1 and adapt the pattern in seq_conforms_with_category accordingly
    past_last_space = representation.rfind(" ") + 1
    return representation[first_paren:past_last_space]


# Creating new columns: the representation trimmed to start at its first "("
# and to end right after its last space.
df["no_flanks"] = df["repeat_representation"].apply(_strip_flanks)


In [11]:
# Each category is a three-part pattern: a (CAG) repeat, an interrupting motif,
# and another (CAG) repeat.
# Idea: also maybe with numbers as minimum requirements.
categories = [["(CAG)", motif, "(CAG)"]
              for motif in ("TAG", "CAA", "CCG", "CATCAGCAT")]

In [12]:
# One result column per category, named by concatenating the category's parts.
categories_as_col_names = ["".join(parts) for parts in categories]

# TODO how to find seqs that only have a (CAG)_n without interruptions (other than spurious repeats)

# how to recognize spurious repeats? that are small and far away from the repeat of interest?
max_repeat_len = 3
min_distance = 6


def f(x):
    """Evaluate one row of df against `categories`.

    The spurious-repeat predicate handed to seq_conforms_with_category applies
    the module-level max_repeat_len / min_distance thresholds.
    """
    def is_spurious(seq, neighbour):
        return is_spurious_by_max_repeat_len_and_min_distance(
            seq, max_repeat_len, min_distance, neighbour)

    return seq_conforms_with_category(x, categories, is_spurious)


# Row-wise evaluation; the result gets one column per category plus a final
# "completely_defined" column.
new_cols = df.apply(f, axis=1)
new_cols.columns = categories_as_col_names + ["completely_defined"]

# Columns that already exist in df (e.g. when this cell is run again) are
# overwritten in place; only the remaining ones are appended below.
already_in_df = [name for name in new_cols.columns if name in df.columns]
for name in already_in_df:
    df[name] = new_cols[name]
new_cols = new_cols.drop(columns=already_in_df)

df = pd.concat([df, new_cols], axis=1)


In [16]:
# Do not truncate long cell contents when a frame is displayed.
pd.set_option('display.max_colwidth', None)
# Rows whose TAG and CAA category columns both equal 1.
# (To narrow the view, select ["no_flanks", "completely_defined", *categories_as_col_names] first.)
has_tag_and_caa = (df["(CAG)TAG(CAG)"] == 1) & (df["(CAG)CAA(CAG)"] == 1)
df[has_tag_and_caa]

Unnamed: 0,seq_name,frame,repeat_representation,score_type,score,was_too_long,no_flanks,(CAG)TAG(CAG),(CAG)CAA(CAG),(CAG)CCG(CAG),(CAG)CATCAGCAT(CAG),completely_defined
13,@SRR23922262.828599.1 828599 length=151,1,G(GCT)_2 CCTGGGTGTAGTGAGATGTCTCCAGCCAGGGCCAAG(CAG)_3 TAG(CAG)_4 TAG(CAG)_4 CAA(CAG)_5 CAA(CAG)_3 CAAAGGGTCTGTGTTGCTAAGAGGCTTTTGGTTTCTTTC,CAG,15,False,(GCT)_2 CCTGGGTGTAGTGAGATGTCTCCAGCCAGGGCCAAG(CAG)_3 TAG(CAG)_4 TAG(CAG)_4 CAA(CAG)_5 CAA(CAG)_3,1,1,0,0,True
235,@SRR23922262.15732653.2 15732653 length=151,0,CCTGGGTGTAGTGAGATGTCTCCAGCCAGGGCCAAG(CAG)_3 TAG(CAG)_4 TAG(CAG)_4 CAA(CAG)_5 CAA(CAG)_3 CAAAGGGTCTGTGTTGCTAAGAGGCTTTTGGTTTCTTTCCCTCCAC,CAG,15,False,(CAG)_3 TAG(CAG)_4 TAG(CAG)_4 CAA(CAG)_5 CAA(CAG)_3,1,1,0,0,True


In [17]:
# Report how many rows have completely_defined equal to 1, out of all rows.
n_samples = len(df)
n_complete = len(df[df['completely_defined'] == 1])
print(f"from all {n_samples} samples {n_complete} were categorizes completely")

from all 240 samples 55 were categorizes completely


In [18]:
# First 20 no_flanks entries (by position) among rows whose completely_defined is 0.
not_defined = df["completely_defined"] == 0
df.loc[not_defined, "no_flanks"].iloc[0:20]

0                                                      (CAG)_2 CCA(CAG)_4 CAA(CAG)_3 CCG(CAG)_4 CCA(CAG)_3 CCG(CAG)_4 CCA(CAG)_3 
1                                                                     (AAG)_2 CAGCTTGAG(CAG)_5 CAA(CAG)_3 CAA(CAG)_10 ACA(GAA)_2 
7                                                                     (AAG)_2 CAGCTTGAG(CAG)_5 CAA(CAG)_3 CAA(CAG)_11 ACA(GAA)_2 
8                                                                                                   (CAC)_2 CAGCAACAGCAA(CAG)_19 
10                                             (CAG)_19 CATCACGGAAACTCTGGGCCC(CCT)_3 GGAGCATTTCCCCACCCACTGGAGGGCGGTAGCTCC(CAC)_2 
11                                                  (CAG)_5 CAA(CAG)_3 CAA(CAG)_10 ACA(GAA)_2 TGGACAGAAGATCACTCAGCCCTTGTG(CCT)_2 
12                                                                     (CAC)_2 CAGCAACAGCAA(CAG)_19 CATCACGGAAACTCTGGGCCC(CCT)_3 
14                           (CAG)_2 (CAA)_2 (CAG)_14 CCCATGCCCCGC(AGC)_2 CAG(GAG)_2 AAAGA

In [19]:
# no_flanks of rows where both completely_defined and the CATCAGCAT category column equal 1.
defined_with_catcagcat = (df["completely_defined"] == 1) & (df["(CAG)CATCAGCAT(CAG)"] == 1)
df.loc[defined_with_catcagcat, "no_flanks"]

2                                                                                (CAG)_12 CATCAGCAT(CAG)_14 
24                                                                               (CAG)_12 CATCAGCAT(CAG)_14 
89     (GCT)_3 CAGCCTTGTGTCCCGGCGTCTGGCTCAGACTGCCCATGTTGGCCAGCAGAGTGGAATAGGCCTGAG(CAG)_12 CATCAGCAT(CAG)_10 
134                         (CTG)_2 GCCAACATGGGCAGTCTGAGCCAGACGCCGGGACACAAGGCTGAG(CAG)_12 CATCAGCAT(CAG)_15 
185                         (CTG)_2 GCCAACATGGGCAGTCTGAGCCAGACGCCGGGACACAAGGCTGAG(CAG)_12 CATCAGCAT(CAG)_11 
187                                                                              (CAG)_12 CATCAGCAT(CAG)_14 
197                                                                              (CAG)_12 CATCAGCAT(CAG)_14 
232                                                 (CAG)_12 CATCAGCAT(CAG)_14 CACCTCAGCAGGGCTCCGGGG(CTC)_2 
Name: no_flanks, dtype: object