# Evaluating the consecutive k-mers c++ program output

In [10]:
import pandas as pd
import numpy as np
import os
import glob
from eval_methods import *


## Reading the Data
Reading the data from the c++ program output into a pandas data frame.

In [11]:
# Path to the dir that contains the output files from the cpp_code (filenames must end with .out),
# for example from the run: ./main -t 5 -i SRR23922262.fastq -s CAG
# NOTE: the name `dir` shadows the Python builtin of the same name; it is kept because the
# following cells refer to it.
# The machine-specific location is preferred when it exists; on any other machine the notebook
# falls back to the repository-relative default instead of silently finding no files.
dir = '/home/mattes/Seafile/Meine_Bibliothek/SickKids/repeat_util/consecutive_kmers/cpp_code/'
if not os.path.isdir(dir):
    dir = '../out'


In [12]:
# Collect every "*.out" file in `dir`. glob returns matches in arbitrary order, so the list is
# sorted to make the row order (and therefore the regenerated index) reproducible between runs.
all_files = sorted(glob.glob(os.path.join(dir, "*.out")))
if not all_files:
    # Without this guard pd.concat fails with the unhelpful "No objects to concatenate".
    raise FileNotFoundError(f"No '*.out' files found in {dir!r}")
# The files are read without a header row and stacked into one frame with a fresh 0..n-1 index.
df = pd.concat((pd.read_csv(f, header=None) for f in all_files), ignore_index=True)
# seq_name: from the fasta or fastq file
# frame: [0, ..., k]
# repeat_representation: for example AG(GAC)_4 TGT
# score_type: run c++ main -h to see the available score types
# score: the score of the repeat_representation
# was_too_long: 1 if the input seq was longer than the configured max length, see c++ main -h
df.columns = ['seq_name', 'frame', 'repeat_representation', 'score_type', 'score', 'was_too_long']


In [13]:
# Each raw column still holds its textual label next to the value; keep only the captured value.
frame_numbers = df['frame'].str.extract(r'frame[:\s]+(\d+)', expand=False)
df['frame'] = frame_numbers.astype(int)

df['score_type'] = df["score_type"].str.extract(r'score_type[:\s]+(\w+)', expand=False)

score_digits = df['score'].str.extract(r'score[:\s]+(\d+)', expand=False)
df['score'] = score_digits.astype(int)

# The captured flag is converted to int first and then to bool.
too_long_flags = df['was_too_long'].str.extract(r'seqlen too long[:\s]+(\w+)', expand=False)
df['was_too_long'] = too_long_flags.astype(int).astype(bool)


def drop_flanks(representation):
    """Return the part of `representation` from its first "(" up to and including its last space.

    Raises ValueError (from str.index) when the string contains no "(".
    """
    first_repeat = representation.index("(")
    # rfind yields the position of the last space; +1 keeps that space in the slice
    after_last_space = representation.rfind(" ") + 1
    return representation[first_repeat:after_last_space]


# creating new columns
# TODO: maybe dont use the +1 and adapt the pattern in seq_conforms_with_category accordingly
df["no_flanks"] = df["repeat_representation"].apply(drop_flanks)


## Grouping repeats into categories
Since many repeats are detected, we want to categorize them. For this you can add categories to the list below. This
is a two-dimensional list: every repeat category is a list of strings. For example (dropping the quotation marks `"`): `[(CAG), TAG, (CAG)]`
will add every sequence to this category if it contains a substring of the form `(CAG)_n TAG(CAG)_m`. When calling the
method `seq_conforms_with_category` to calculate the categories, one can also pass a function that filters spurious
repeats. For example, there might be the sequence `AAA(TTT)_2 ACGTAACCGGTT(GAC)_12 `, where the `(TTT)_2` repeat is probably
spurious and should be ignored. Here the method `is_spurious_by_max_repeat_len_and_min_distance` is applied. If all
non-spurious repeats of a sequence are categorized and not in close proximity (separated by some minimum distance and therefore regarded
as independent), the sequence is flagged as `completely_defined`.

In [14]:
# One inner list per repeat category; the parts are joined (without separator) to form the
# category's column name further below.
categories = [
    ["(CAG)", "TAG", "(CAG)"],        # (CAG)_n TAG(CAG)_m
    ["(CAG)", "CAA", "(CAG)"],        # (CAG)_n CAA(CAG)_m
    ["(CAG)", "CCG", "(CAG)"],        # (CAG)_n CCG(CAG)_m
    ["(CAG)", "CATCAGCAT", "(CAG)"],  # (CAG)_n CATCAGCAT(CAG)_m
    ["(CAG)"],                        # a (CAG) repeat on its own
    ["(CAG)", "(CAA)"],               # a (CAG) repeat followed by a (CAA) repeat
]
# TODO also maybe with numbers as minimum requirements


In [15]:
# Thresholds handed to is_spurious_by_max_repeat_len_and_min_distance
max_repeat_len = 3
min_distance = 6


def f(x):
    """Categorize one row `x` of df against `categories`.

    Delegates to seq_conforms_with_category and supplies a spurious-repeat filter that is
    parameterized with the module-level `max_repeat_len` and `min_distance`.
    """
    def spurious_filter(seq, neighbour):
        return is_spurious_by_max_repeat_len_and_min_distance(seq, max_repeat_len, min_distance, neighbour)

    return seq_conforms_with_category(x, categories, spurious_filter)


new_cols = df.apply(f, axis=1)
# one column per category (named by its joined parts) plus the overall flag
category_names = ["".join(parts) for parts in categories]
new_cols.columns = category_names + ["completely_defined"]

# Columns that already exist in df (e.g. when this cell is re-run) are overwritten in place and
# removed from new_cols, so the concatenation below cannot create duplicate columns.
rerun_columns = [name for name in new_cols.columns if name in df.columns]
for name in rerun_columns:
    df[name] = new_cols[name]
new_cols = new_cols.drop(columns=rerun_columns)

df = pd.concat([df, new_cols], axis=1)


## Viewing the data
Here you can have a look at the categorized data. If you still see some sequences for which the column
`completely_defined` is `False` consider adding new categories.

In [16]:
# Show cell contents in full instead of truncating long repeat representations.
pd.set_option('display.max_colwidth', None)
# Count the rows whose `completely_defined` entry compares equal to 1.
n_completely_defined = int((df['completely_defined'] == 1).sum())
print(f"From all {len(df)} samples {n_completely_defined} were categorizes completely")
df.head(5)


From all 23 samples 11 were categorizes completely


Unnamed: 0,seq_name,frame,repeat_representation,score_type,score,was_too_long,no_flanks,(CAG)TAG(CAG),(CAG)CAA(CAG),(CAG)CCG(CAG),(CAG)CATCAGCAT(CAG),(CAG),(CAG)(CAA),completely_defined
0,@M03851:785:000000000-DCFPT:1:1102:21402:2730 1:N:0:GAGATTCC+CAGGACGT,1,CTGCGACCCTGGAAAAGCTGATGAGCCGTGTAGATCTCGGTGGTCGCCGTATCATTGCAGGCACAGCCGCTGCTGATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGATTCCATCTCGTATGCCGTCTTCTGCTTG(AAA)_4 (GAC)_2 TCG,GAC,2,True,(AAA)_4 (GAC)_2,0,0,0,0,0,0,False
1,@M03851:785:000000000-DCFPT:1:1102:12966:5040 1:N:0:GAGATTCC+CAGGACGT,0,CTGCGACCCTGGAAAAGC(TGA)_2 GCCGTGTAGATCTCGGTGGTCGCCGTATCATTGCAGGCACAGCCG(CTG)_2 ATCGGAAGAGACACGTCTGAACTCCAGTCACGAGATTCCATCTCGTATGCCGTCTTCTGCTTG(AAA)_4 TCC(GAC)_2 G,GAC,2,True,(TGA)_2 GCCGTGTAGATCTCGGTGGTCGCCGTATCATTGCAGGCACAGCCG(CTG)_2 ATCGGAAGAGACACGTCTGAACTCCAGTCACGAGATTCCATCTCGTATGCCGTCTTCTGCTTG(AAA)_4 TCC(GAC)_2,0,0,0,0,0,0,False
2,@M03851:785:000000000-DCFPT:1:1102:24952:6651 1:N:0:GAGATTCC+CAGGACGT,2,CTGCGACCCTGGAAAAGCTGATGAGAGCCGTGTAGATCTCGGTGGTCGCCGTATCATTGCAGGCACAGCCG(CTG)_2 ATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGATTCCATCTCGTATGCCGTCTTCTGCTTGAA(AAA)_3 (GAC)_2 GA,GAC,2,True,(CTG)_2 ATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGATTCCATCTCGTATGCCGTCTTCTGCTTGAA(AAA)_3 (GAC)_2,0,0,0,0,0,0,True
3,@M03851:785:000000000-DCFPT:1:1102:13973:13718 1:N:0:GAGATTCC+CAGGACGG,1,CTGCGACCCTGGAAAAGCTGATGAGCCGTGTAGATCTCGGTGGTCGCCGTATCATTGGCACAGCCGCTGCTGATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGATTCCATCTCGTATGCCGTCTTCTGCTTG(AAA)_4 (GAC)_2 ACTTGG,GAC,2,True,(AAA)_4 (GAC)_2,0,0,0,0,0,0,False
4,@M03851:785:000000000-DCFPT:1:1102:8419:14652 1:N:0:GAGATTCC+CAGGACGT,2,CTGCGACCCTGGAAAAGCTGCCCATGGCGTGCGAGTTGGTCAGTCTGGTGGCAGCG(GAC)_2 ACGTGCACCAGGCCT(GCG)_2 GCCGGCACAGAGCTGAACAGCGACTGCAGCACG(GAG)_2 CCGGCCACGGAGCCGAGGTTGGCCTGCAGGGACATGGG,GAC,2,True,(GAC)_2 ACGTGCACCAGGCCT(GCG)_2 GCCGGCACAGAGCTGAACAGCGACTGCAGCACG(GAG)_2,0,0,0,0,0,0,True


Look at the `m` highest-scoring sequences which are not yet completely categorized.

In [19]:
# Rows are sorted ascending by score, so the slice `[-m:]` below keeps the m highest-scoring
# sequences for a positive m. m = 0 keeps every sequence; a negative m skips the |m|
# lowest-scoring ones (so -1 shows all but one).
m = -1
sorted_by_score = df.sort_values(["score", "repeat_representation"])
# `== False` is kept deliberately instead of `~`: the dtype of this column is not visible
# here and `~` is only safe on a boolean column -- TODO confirm the dtype before simplifying.
sorted_by_score = sorted_by_score[sorted_by_score["completely_defined"] == False]["repeat_representation"][-m:]
color = Color_print_triplets()
for seq in sorted_by_score:
    print(color.color_triplets(seq, expand=True))
    # Blank separator line. The previous "\\n" argument printed a literal backslash-n instead.
    print()


 CGATGCGACCCTGGAAAAGCTGATGAGAGCCGTGTAGATCTCGGTGGTCGCCGTATCATTGCAGGCACAGCCGCTGCTTAGATCGGAAGAGCACACGTCTGAACTCCAGTCAACTGAAGCTATCTCGTATGCCGTCTTCTGCTTG[1;31;43mAAAAAAAAA[0m[1;30;41mGACGAC[0m \n
 CTGCGACCCTGGAAAAGC[1;32;47mTGATGA[0mGCCGTGTAGATCTCGGTGGTCGCCGTATCATTGCAGGCACAGCCG[1;33;45mCTGCTG[0mATCGGAAGAGACACGTCTGAACTCCAGTCACGAGATTCCATCTCGTATGCCGTCTTCTGCTTG[1;31;43mAAAAAAAAAAAA[0mTCC[1;30;41mGACGAC[0mG \n
 CTGCGACCCTGGAAAAGCTCAAGCA[1;30;41mGACGAC[0mGGCACCCCC[1;35;41mGCTGCT[0mGATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGGTTCCATCTCGTATGCCG[1;35;45mTCTTCT[0mGCTTGAACC[1;31;42mGCCGCCGCCGCC[0mACC[1;31;42mGCCGCC[0mTCCTCAGCTTCCTCA[1;31;42mGCCGCCGCC[0m \n
 CTGCGACCCTGGAAAAGCTCAAGCA[1;30;41mGACGAC[0mGGCACCTCC[1;35;41mGCTGCT[0mGATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGATTCCATCTCGTATGCCG[1;35;45mTCTTCT[0mGCTTGA[1;31;43mAAAAAAAAAAAAAAA[0mATATAAAGAAACAATTTTAACAAGATTCTCACA \n
 CTGCGACCCTGGAAAAGCTGATGAAGGCCTTCGAGTCCCTCAAGTCCTTCCA[1;32;41mGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCA[0mA

Inspect all sequences for a category:

In [18]:
# Name of the category column to inspect (the joined parts of one entry of `categories`).
category = "(CAG)TAG(CAG)"

# Keep the rows that are completely defined AND belong to the chosen category,
# showing only the flank-free representation.
is_completely_defined = df["completely_defined"] == 1
is_in_category = df[category] == 1
df_of_category = df.loc[is_completely_defined & is_in_category, ["no_flanks"]]
print(f"{len(df_of_category)} sequences were categorized as {category}:")
print(df_of_category)


0 sequences were categorized as (CAG)TAG(CAG):
Empty DataFrame
Columns: [no_flanks]
Index: []
