In [127]:
import os
import sys
import json
import pandas as pd
from composition_stats import closure
from composition_stats import clr

In [99]:
def get_STRS_samplelist(df):
    # Get STR and sample list
    STRS = list(set(df["STR_mark"]))
    sample_list = list(set(df.index))
    
    return STRS, sample_list

def encode_sequences(df):
    traduction_dict = dict()
    count_dict = dict()
    
    for row in range(df.shape[0]):

        STR = df.iloc[row]["STR_mark"]
        seq = df.iloc[row]["Sequence"]

        if STR not in traduction_dict.keys():
            traduction_dict[STR] = {}
            count_dict[STR] = 0 

        if seq not in traduction_dict[STR].keys():
            new_seq = f"{STR}_allele_{count_dict[STR]}"
            traduction_dict[STR][seq] = new_seq
            # Sequence = col 5
            df.iloc[row,5] = new_seq

            count_dict[STR]+=1

        else:

            df.iloc[row,5] = traduction_dict[STR][seq]
        
    return df, traduction_dict
    

def most_frequent_alleles(df):

    STRS, sample_list = get_STRS_samplelist(df)
    
    # alfreq_dict, relfreqs of alleles, starts at 0
    # seq_dict, seq of alleles, starts at None
    alfreq_dict = dict()
    seq_dict = dict()

    # get the most frequent allele and its relfreq
    for STR in STRS:
        alfreq_dict[STR] = dict()
        seq_dict[STR] = dict()
        for sample in sample_list:
            alfreq_dict[STR][sample] = 0
            seq_dict[STR][sample] = "None"
    
    for index, row in df.iterrows():
        STR = row["STR_mark"]
        al_freq = row["AlleleFrequency"]
        sample = index

        if alfreq_dict[STR][sample] < al_freq:
            seq = row["Sequence"]

            alfreq_dict[STR][sample] = al_freq
            seq_dict[STR][sample] = seq
    
    # seq of the most freq allele per str and sample
    df_mostfreqalleles_seqs = pd.DataFrame.from_dict(seq_dict)
    
    # freq of the most freq allele per str and sample
    df_mostfreqalleles_freqs = pd.DataFrame.from_dict(alfreq_dict)

            
    return df_mostfreqalleles_seqs, df_mostfreqalleles_freqs
    
def all_alleles(df, column):
    STRS, sample_list = get_STRS_samplelist(df)
    
    # rename alleles
    raw_seq_renamed_df = raw_df.copy()
    
    # list with all different alleles
    all_alleles = list(set(raw_seq_renamed_df["Sequence"]))
    
    # dict for freq of every allele
    alleles_dict_freq = { item: { allele:0 for allele in all_alleles } for item in sample_list }
    alleles_dict_binary = { item: { allele:0 for allele in all_alleles } for item in sample_list }

    # dict for presence of every allele
    
    for index, row in raw_seq_renamed_df.iterrows():
        sample = index
        allele = row["Sequence"]
        freq = row[column]

        alleles_dict_freq[sample][allele] = float(freq)
        alleles_dict_binary[sample][allele] = 1
    
        # df for presence
    df_alleles_binary = pd.DataFrame.from_dict(alleles_dict_binary)
    
    # df for freq
    df_alleles_freq = pd.DataFrame.from_dict(alleles_dict_freq)
        
    return df_alleles_binary, df_alleles_freq

In [131]:
# Import data
raw_df = pd.read_excel("Input/full_long_table.xlsx", index_col = "Sample_name")

# Encode data
renamed_df, traduction_dict = encode_sequences(raw_df)

df_mostfreqalleles_seqs, df_mostfreqalleles_freqs = most_frequent_alleles(renamed_df)
df_alleles_binary, df_alleles_freq = all_alleles(renamed_df, "AlleleFrequency")
df_alleles_reads = all_alleles(renamed_df, "Supporting_reads")[1]




In [151]:
df_alleles_reads

Unnamed: 0,395,399,403,353_R,407,411,415,416,417,418,...,345_R,457,345,347,349,350,351,352,353,347_R
STR7_allele_64,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
STR8_allele_22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
STR4_allele_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
STR2_allele_17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
STR5_allele_1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
STR9_allele_9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
STR9_allele_102,0.0,1.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
STR8_allele_42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
STR8_allele_66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [158]:
# samples will be the rows
rows = list(df_alleles_reads.columns)
# alleles will be the cols
cols = list(df_alleles_reads.index)


clr_df = pd.DataFrame(clr(closure(df_alleles_reads.transpose()+0.000001)),columns=cols, index=rows)
clr_df

395            1.0
399            1.0
403            1.0
353_R          1.0
407            1.0
411            1.0
415            1.0
416            1.0
417            1.0
418            1.0
419            1.0
420            1.0
431            1.0
435            1.0
352_R          1.0
438            1.0
441            1.0
353_R_miseq    1.0
350_R          1.0
349_R          1.0
351_R          1.0
453            1.0
345_R          1.0
457            1.0
345            1.0
347            1.0
349            1.0
350            1.0
351            1.0
352            1.0
353            1.0
347_R          1.0
dtype: float64

In [116]:
df_alleles_binary.head()

Unnamed: 0,395,399,403,353_R,407,411,415,416,417,418,...,345_R,457,345,347,349,350,351,352,353,347_R
STR7_allele_64,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
STR8_allele_22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
STR4_allele_0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
STR2_allele_17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
STR5_allele_1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [107]:
raw_df.head()

Unnamed: 0_level_0,STR_mark,STR_structure,Numer_reps,Supporting_reads,AlleleFrequency,Sequence
Sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
353_R_miseq,STR5,[T]26,26.0,1,0.125,STR5_allele_0
353_R_miseq,STR5,[T]19,19.0,1,0.125,STR5_allele_1
353_R_miseq,STR5,[T]22,22.0,1,0.125,STR5_allele_2
431,STR8,ATATTTTATTTTATATTTTATATTTTATGTTTT [ATTTT]1 [AT...,11.3,1,0.125,STR8_allele_0
353_R_miseq,STR5,[T]24,24.0,3,0.375,STR5_allele_3


In [103]:
df_alleles_freq.head()

Unnamed: 0,395,399,403,353_R,407,411,415,416,417,418,...,345_R,457,345,347,349,350,351,352,353,347_R
STR7_allele_64,0.0,0.0,0.0,0.000958,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
STR8_allele_22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
STR4_allele_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
STR2_allele_17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
STR5_allele_1,0.0,0.0,0.0,0.012346,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [104]:
df_mostfreqalleles_seqs.head()

Unnamed: 0,STR1,STR8,STR7,STR2,STR9,STR3,STR5,STR4
395,,STR8_allele_1,STR7_allele_96,STR2_allele_37,,,,
399,,STR8_allele_1,STR7_allele_96,STR2_allele_37,STR9_allele_108,,,
403,,STR8_allele_1,STR7_allele_96,,STR9_allele_108,,,
353_R,,STR8_allele_1,STR7_allele_96,STR2_allele_38,STR9_allele_108,STR3_allele_0,STR5_allele_3,
407,,,STR7_allele_96,,,,,


In [106]:
df_mostfreqalleles_freqs.head()

Unnamed: 0,STR1,STR8,STR7,STR2,STR9,STR3,STR5,STR4
395,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
399,0.0,1.0,0.934783,0.5,0.961538,0.0,0.0,0.0
403,0.0,1.0,0.857143,0.0,1.0,0.0,0.0,0.0
353_R,0.0,0.987554,0.922893,0.278689,0.954944,0.5,0.320988,0.0
407,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0


In [81]:
traduction_dict

{'STR5': {'CTAGGACGGCGATTCTTTTTTTTTTTTTTTTTTTTTTTTTTCGAATTCACGGGGTA': 'STR5_allele_0',
  'CTAGGACGGCGATTCTTTTTTTTTTTTTTTTTTTCGAATTCACGGGGTA': 'STR5_allele_1',
  'CTAGGACGGCGATTCTTTTTTTTTTTTTTTTTTTTTTCGAATTCACGGGGTA': 'STR5_allele_2',
  'CTAGGACGGCGATTCTTTTTTTTTTTTTTTTTTTTTTTTCGAATTCACGGGGTA': 'STR5_allele_3',
  'CTAGGACGGCGATTCTTTTTTTTTTTTTATTTTTTTTTCGAATTCACGGGGTA': 'STR5_allele_4',
  'CTAGGACGGCGATTCTTTTTTTTTTTTTTTTTTTTTCGAATTCACGGGGTA': 'STR5_allele_5',
  'CTAGGACGGCGATTCTTTTTTTTTTTTTTTTCTTTTTTTCGAATTCACGGGGTA': 'STR5_allele_6',
  'CTAGGACGGCGATTCTTTTTTTTTTTTTTTCGAATTCACGGGGTA': 'STR5_allele_7',
  'CTAGGACGGCGATTCTTTTTTTTTTTTTTTTTTCTTTTTTTTCGAATTCACGGGGTA': 'STR5_allele_8',
  'CTAGGACGGCGATTCTTTTTTTTTTTTTTTTTTTTTTTCGAATTCACGGGGTA': 'STR5_allele_9',
  'CTAGGACGGCGATTCTTTTTTTTTTTTTTTTTTTTCGAATTCACGGGGTA': 'STR5_allele_10',
  'CTAGGACGGCGATTCTTTTTTTTTTTTTTTTTTTTTTTTTTTCGAATTCACGGGGTA': 'STR5_allele_11',
  'CTAGGACGGCGATTCTTTTTTTTTTTTTTTTTTCGAATTCACGGGGTA': 'STR5_allele_12',
  'CTAGGACG

In [39]:
STRS = get_STRS_samplelist(raw_df)[0]

for STR in STRS:
    print(STR)
    df_per_str = raw_df.loc[raw_df["Sequence"].str.contains(STR)] 

STR1


AttributeError: Can only use .str accessor with string values!