# Evaluating the consecutive k-mers c++ program output

In [2]:
import pandas as pd
import numpy as np
import os
import glob
from eval_methods import *

## Reading the Data
Reading the data from the c++ program output into a pandas data frame.

In [3]:
# path to the dir that contains the output files from the cpp_code (filenames must end with .out)
# NOTE(review): the name `dir` shadows Python's built-in dir() for the rest of the notebook. The cell
# that reads the files uses this name, so both places have to be changed together if it is renamed.
dir = '../out'

In [4]:
# glob returns matches in arbitrary (filesystem dependent) order. Sorting makes the row order and
# therefore the row indices of the concatenated frame reproducible between runs and machines.
all_files = sorted(glob.glob(os.path.join(dir, "*.out")))
if not all_files:
    # Without this check pd.concat fails with the unspecific "No objects to concatenate".
    raise FileNotFoundError(
        f"No '*.out' files found in {os.path.abspath(dir)!r}. Run the c++ program first or adjust `dir`.")
# The output files have no header row, so every file is read with header=None and the
# column names are assigned once after concatenating.
df = pd.concat((pd.read_csv(path, header=None) for path in all_files), ignore_index=True)
# seq. name: from the fasta or fastq file
# frame: [0, ..., k]
# repeat_representation: for example AG(GAC)_4 TGT
# score_type: run c++ main -h to see the available score types
# score: the score of the repeat_representation
# was_too_long: 1 if the input seq was longer than the configured max length, see c++ main -h
df.columns = ['seq_name', 'frame', 'repeat_representation', 'score_type', 'score', 'was_too_long']

In [5]:
def _extract_value(column, pattern):
    """Return the first capture group of `pattern` for every entry of df[column]."""
    return df[column].str.extract(pattern, expand=False)


# The raw columns still carry their textual label; keep only the value captured by each pattern
# and convert it to the matching dtype.
df['frame'] = _extract_value('frame', r'frame[:\s]+(\d+)').astype(int)
df['score_type'] = _extract_value('score_type', r'score_type[:\s]+(\w+)')
df['score'] = _extract_value('score', r'score[:\s]+(\d+)').astype(int)
df['was_too_long'] = _extract_value('was_too_long', r'seqlen too long[:\s]+(\w+)').astype(int).astype(bool)


def _strip_flanks(representation):
    """Cut a repeat representation down to the part from the first "(" up to and including the last blank."""
    first_repeat_start = representation.index("(")
    last_blank = representation.rfind(" ")
    return representation[first_repeat_start:last_blank + 1]


# derived column: the repeat representation without its flanking sequences
# TODO: maybe dont use the +1 and adapt the pattern in seq_conforms_with_category accordingly
df["no_flanks"] = df["repeat_representation"].apply(_strip_flanks)


## Grouping repeats into categories
Since many repeats are detected, we want to group them into categories. To do so, add categories to the list below, which
is a two-dimensional list: every repeat category is a list of strings. For example (quotation marks omitted), the category
`[(CAG), TAG, (CAG)]` matches every sequence that contains a substring of the form `(CAG)_n TAG(CAG)_m`. When calling the
method `seq_conforms_with_category` to calculate the categories, one can also pass a function that filters spurious
repeats. For example, in the sequence `AAA(TTT)_2 ACGTAACCGGTT(GAC)_12 ` the `(TTT)_2` repeat is probably
spurious and should be ignored. Here the method `is_spurious_by_max_repeat_len_and_min_distance` is applied.

In [6]:
# One entry per repeat category. Every entry lists the parts that have to follow each other:
# parts in parentheses are repeat units, the other parts are the interruptions between them.
categories = [
    ["(CAG)", "TAG", "(CAG)"],
    ["(CAG)", "CAA", "(CAG)"],
    ["(CAG)", "CCG", "(CAG)"],
    ["(CAG)", "CATCAGCAT", "(CAG)"],
]  # also maybe with numbers as minimum requirements

In [7]:
# TODO how to find seqs that only have a (CAG)_n without interruptions (other than spurious repeats)

# thresholds handed to is_spurious_by_max_repeat_len_and_min_distance
max_repeat_len = 3
min_distance = 6


def f(x):
    """Categorize one data frame row `x` against the global `categories`.

    The spurious-repeat filter passed to seq_conforms_with_category is
    is_spurious_by_max_repeat_len_and_min_distance with the global thresholds
    `max_repeat_len` and `min_distance` filled in.
    """
    def _is_spurious(seq, neighbour):
        return is_spurious_by_max_repeat_len_and_min_distance(seq, max_repeat_len, min_distance, neighbour)

    return seq_conforms_with_category(x, categories, _is_spurious)


# Evaluate every row with f. The result is labelled with one column per category (the joined
# category parts) plus a final "completely_defined" column.
# NOTE(review): this assumes f returns len(categories) + 1 values per row - confirm in eval_methods.
new_cols = df.apply(f, axis=1)
new_cols.columns = ["".join(parts) for parts in categories] + ["completely_defined"]

# Columns that already exist in df (e.g. from a previous run of this cell) are overwritten in place
# and removed from new_cols, so that the concatenation below does not create duplicate columns.
stale_columns = [name for name in new_cols.columns if name in df.columns]
for name in stale_columns:
    df[name] = new_cols[name]
new_cols = new_cols.drop(columns=stale_columns)

df = pd.concat([df, new_cols], axis=1)


## Viewing the data
Here you can have a look at the categorized data. If you still see some sequences for which the column
`completely_defined` is `False` consider adding new categories.

In [12]:
# Do not truncate the cell contents when displaying the frame.
pd.set_option('display.max_colwidth', None)
completely_categorized = df[df['completely_defined'] == 1]
print(f"From all {len(df)} samples {len(completely_categorized)} were categorizes completely")
df


From all 240 samples 55 were categorizes completely


Unnamed: 0,seq_name,frame,repeat_representation,score_type,score,was_too_long,no_flanks,(CAG)TAG(CAG),(CAG)CAA(CAG),(CAG)CCG(CAG),(CAG)CATCAGCAT(CAG),completely_defined
0,@SRR23922262.25917.2 25917 length=151,2,CCAAGCGAGCTACAGCCAACAAT(CAG)_2 CCA(CAG)_4 CAA(CAG)_3 CCG(CAG)_4 CCA(CAG)_3 CCG(CAG)_4 CCA(CAG)_3 CCACAGGCCTTGCCTCGGTATCCTCGTGAAGTACCTCCACG,CAG,17,False,(CAG)_2 CCA(CAG)_4 CAA(CAG)_3 CCG(CAG)_4 CCA(CAG)_3 CCG(CAG)_4 CCA(CAG)_3,0,1,1,0,False
1,@SRR23922262.71172.1 71172 length=151,0,TATGTAGACCACATCAACAGGAAGACACAATATGAGGACCCGGTTCTAGAAGCCAAACGG(AAG)_2 CAGCTTGAG(CAG)_5 CAA(CAG)_3 CAA(CAG)_10 ACA(GAA)_2 TGGACAG,CAG,16,False,(AAG)_2 CAGCTTGAG(CAG)_5 CAA(CAG)_3 CAA(CAG)_10 ACA(GAA)_2,0,1,0,0,False
2,@SRR23922262.90539.2 90539 length=151,0,AGTCTGAGCCAGACGCCGGGACACAAGGCTGAG(CAG)_12 CATCAGCAT(CAG)_14 CACCTCAGCAGGGCTCCGGGGCTCATCACCC,CAG,23,False,(CAG)_12 CATCAGCAT(CAG)_14,0,0,0,1,True
3,@SRR23922262.93952.1 93952 length=151,1,GTATTCATAC(CAG)_8 TAG(CAG)_7 TAG(CAG)_10 TAG(CAG)_3 TAG(CAG)_3 ATCGGAAGAGCACACGTCTGAACTCCAGTCACGCTA,CAG,27,False,(CAG)_8 TAG(CAG)_7 TAG(CAG)_10 TAG(CAG)_3 TAG(CAG)_3,1,0,0,0,True
4,@SRR23922262.311393.2 311393 length=151,1,T(CAA)_2 CCCCACAGAGGACTTTAGGAGAAAGTTGCTGAAGGC(AGG)_2 GGACCCTAATAGGAGTATTCATAC(CAG)_8 TAG(CAG)_11 TAG(CAG)_5,CAG,22,False,(CAA)_2 CCCCACAGAGGACTTTAGGAGAAAGTTGCTGAAGGC(AGG)_2 GGACCCTAATAGGAGTATTCATAC(CAG)_8 TAG(CAG)_11 TAG(CAG)_5,1,0,0,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...
235,@SRR23922262.15732653.2 15732653 length=151,0,CCTGGGTGTAGTGAGATGTCTCCAGCCAGGGCCAAG(CAG)_3 TAG(CAG)_4 TAG(CAG)_4 CAA(CAG)_5 CAA(CAG)_3 CAAAGGGTCTGTGTTGCTAAGAGGCTTTTGGTTTCTTTCCCTCCAC,CAG,15,False,(CAG)_3 TAG(CAG)_4 TAG(CAG)_4 CAA(CAG)_5 CAA(CAG)_3,1,1,0,0,True
236,@SRR23922262.15758323.2 15758323 length=151,0,CAT(CAC)_2 CAGCAACAGCAA(CAG)_19 CATCACGGAAACTCGGGGCCC(CCT)_3 GGAGCATTACCCCACCCACAGGCGGGCGGTCGCTCC(CAC)_2 C,CAG,19,False,(CAC)_2 CAGCAACAGCAA(CAG)_19 CATCACGGAAACTCGGGGCCC(CCT)_3 GGAGCATTACCCCACCCACAGGCGGGCGGTCGCTCC(CAC)_2,0,0,0,0,False
237,@SRR23922262.15838799.1 15838799 length=151,2,GGGAAATGCTCCAGTCCACCGCCCAC(CCA)_2 GTCTCAACACATCACCAT(CAC)_2 CAGCAACAGCAA(CAG)_15 CATCACGGAAACTCTGGGCCC(CCT)_3 GGAGCATT,CAG,15,False,(CCA)_2 GTCTCAACACATCACCAT(CAC)_2 CAGCAACAGCAA(CAG)_15 CATCACGGAAACTCTGGGCCC(CCT)_3,0,0,0,0,False
238,@SRR23922262.15877253.2 15877253 length=151,0,GGACCCTAATAGGAGTATTCATAC(CAG)_8 TAG(CAG)_10 TAG(CAG)_10 TAG(CAG)_3 TAG(CAG)_6 TAGT,CAG,33,False,(CAG)_8 TAG(CAG)_10 TAG(CAG)_10 TAG(CAG)_3 TAG(CAG)_6,1,0,0,0,True


Look at sequences which are not yet completely categorized.

In [15]:
# first 20 flank-free representations of the rows that no combination of categories explains completely
df.loc[df["completely_defined"] == False, "no_flanks"].iloc[:20]

0                                                      (CAG)_2 CCA(CAG)_4 CAA(CAG)_3 CCG(CAG)_4 CCA(CAG)_3 CCG(CAG)_4 CCA(CAG)_3 
1                                                                     (AAG)_2 CAGCTTGAG(CAG)_5 CAA(CAG)_3 CAA(CAG)_10 ACA(GAA)_2 
7                                                                     (AAG)_2 CAGCTTGAG(CAG)_5 CAA(CAG)_3 CAA(CAG)_11 ACA(GAA)_2 
8                                                                                                   (CAC)_2 CAGCAACAGCAA(CAG)_19 
10                                             (CAG)_19 CATCACGGAAACTCTGGGCCC(CCT)_3 GGAGCATTTCCCCACCCACTGGAGGGCGGTAGCTCC(CAC)_2 
11                                                  (CAG)_5 CAA(CAG)_3 CAA(CAG)_10 ACA(GAA)_2 TGGACAGAAGATCACTCAGCCCTTGTG(CCT)_2 
12                                                                     (CAC)_2 CAGCAACAGCAA(CAG)_19 CATCACGGAAACTCTGGGCCC(CCT)_3 
14                           (CAG)_2 (CAA)_2 (CAG)_14 CCCATGCCCCGC(AGC)_2 CAG(GAG)_2 AAAGA

Inspect all sequences for a category:

In [16]:
# name of the category column to inspect (the joined parts of one entry of `categories`)
category = "(CAG)CATCAGCAT(CAG)"
in_category_and_defined = (df["completely_defined"] == 1) & (df[category] == 1)
df.loc[in_category_and_defined, "no_flanks"]

2                                                                                (CAG)_12 CATCAGCAT(CAG)_14 
24                                                                               (CAG)_12 CATCAGCAT(CAG)_14 
89     (GCT)_3 CAGCCTTGTGTCCCGGCGTCTGGCTCAGACTGCCCATGTTGGCCAGCAGAGTGGAATAGGCCTGAG(CAG)_12 CATCAGCAT(CAG)_10 
134                         (CTG)_2 GCCAACATGGGCAGTCTGAGCCAGACGCCGGGACACAAGGCTGAG(CAG)_12 CATCAGCAT(CAG)_15 
185                         (CTG)_2 GCCAACATGGGCAGTCTGAGCCAGACGCCGGGACACAAGGCTGAG(CAG)_12 CATCAGCAT(CAG)_11 
187                                                                              (CAG)_12 CATCAGCAT(CAG)_14 
197                                                                              (CAG)_12 CATCAGCAT(CAG)_14 
232                                                 (CAG)_12 CATCAGCAT(CAG)_14 CACCTCAGCAGGGCTCCGGGG(CTC)_2 
Name: no_flanks, dtype: object