# Testing functions in telo_tools for handling individual telomere length data

In [1]:
# enables access to directories/files
import os

# for handling data
import numpy as np
from numpy import array
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile

# graphing
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
import seaborn as sns

# for loading telo data column containing individual telomere length values
from ast import literal_eval

# statistics
from scipy import stats

# accessing telo_tools module, which is in the parent directory, for testing
import sys 
sys.path.append('..')
import telo_tools

In [2]:
# snippet for re-reading a saved dataframe: parses each 'telo data' string back into a numpy array
# test_df['telo data'] = test_df['telo data'].apply(lambda row: np.array(literal_eval(row)))

## Test compiling individual telomere length data and generating data into a dataframe

In [3]:
# compiling test data from excel files into a dict; '.' (the current directory) is passed
# as the location to search, and telo_tools prints one progress line per file it reads
test_dict = telo_tools.generate_dictionary_for_telomere_length_data('.')

xyz9999_FD45.xlsx telomere data acquisition in progress..
xyz9999_L-180.xlsx telomere data acquisition in progress..
xyz9999_L-60.xlsx telomere data acquisition in progress..
Done collecting all telomere length excel files


In [63]:
# generating a pandas dataframe from the dict, then previewing it
# (head(4) shows at most four rows)
test_df = telo_tools.make_dataframe_from_telomere_data_dict(test_dict)
test_df.head(4)

Unnamed: 0,sample id,timepoint,telo data,telo means,Q1,Q2-3,Q4
0,9999,FD45,0 8695.0 1 9606.0 2 9669....,9261.618841,telos quartile 1 <0.25,telos quartile 2-3 >0.25 & <0.75,telos quartile 4 >0.75
1,9999,L-180,0 13637.0 1 8115.0 2 13243....,10317.742572,telos quartile 1 <0.25,telos quartile 2-3 >0.25 & <0.75,telos quartile 4 >0.75
2,9999,L-60,0 12514.0 1 13037.0 2 15085....,10801.377174,telos quartile 1 <0.25,telos quartile 2-3 >0.25 & <0.75,telos quartile 4 >0.75


## Test counting short/medium/long individual telomeres

In [64]:
# declaring the ordered list of timepoint labels that is passed to telo_tools below
timepoint_list = ['L-180', 'L-60', 'FD45', 'FD90', 'FD140', 'FD260', 'R+5', 'R+7', 'R+60', 'R+105', 'R+180', 'R+270']

# NOTE(review): the result is displayed but not assigned to a name; later cells use
# test_df, so they only see these quartile counts if the function also modifies
# test_df in place -- confirm which behavior is intended
telo_tools.calculate_apply_teloQuartiles_dataframe(test_df, timepoint_list)

Unnamed: 0,sample id,timepoint,telo data,telo means,Q1,Q2-3,Q4
0,9999,L-180,0 13637.0 1 8115.0 2 13243....,10317.742572,[1380],[2760],[1380]
1,9999,L-60,0 12514.0 1 13037.0 2 15085....,10801.377174,[1328],[2460],[1732]
2,9999,FD45,0 8695.0 1 9606.0 2 9669....,9261.618841,[1970],[2769],[781]


## Test exploding individual telomere length measurements into the dataframe as rows

In [65]:
# expanding the telomere measurements stored per sample/timepoint into dataframe rows
# (presumably one row per individual telomere -- verify against telo_tools)
test_df2 = telo_tools.explode_individual_telos(test_df)

In [66]:
# sanity check: dimensions of the exploded dataframe, followed by a preview of its first rows
print(test_df2.shape)
test_df2.head(3)

(16560, 7)


Unnamed: 0,sample id,timepoint,telo means,Q1,Q2-3,Q4,individual telos
0,9999,FD45,9261.618841,telos quartile 1 <0.25,telos quartile 2-3 >0.25 & <0.75,telos quartile 4 >0.75,8695
1,9999,L-180,10317.742572,telos quartile 1 <0.25,telos quartile 2-3 >0.25 & <0.75,telos quartile 4 >0.75,13637
2,9999,L-60,10801.377174,telos quartile 1 <0.25,telos quartile 2-3 >0.25 & <0.75,telos quartile 4 >0.75,12514


In [122]:
import importlib
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Testing graphing by histogram

In [130]:
def histogram_plot_groups(x=None, df=None, sample_id_col=None, groupby=None,
                          num_samps_per_group=None, ordered_timepoint_list=None):
    """Collect, per sample, the values of column `x` at each timepoint.

    Work in progress: the values are gathered for plotting but nothing is
    drawn or returned yet.

    Parameters
    ----------
    x : str
        Name of the column whose values are collected per timepoint.
    df : pandas.DataFrame
        Must contain `sample_id_col`, `groupby`, `x` and a 'timepoint' column.
    sample_id_col : str
        Column whose unique values are iterated over.
    groupby : str
        Column the frame is grouped on. Groups are looked up with values taken
        from `sample_id_col`, so both are expected to name the same column.
    num_samps_per_group : int
        Number of timepoints expected per sample. Only 3 is handled so far;
        any other value leaves the collected values unused.
    ordered_timepoint_list : list
        Timepoint labels in the order used to sort each sample's rows.

    Raises
    ------
    ValueError
        If `num_samps_per_group` is 3 but a sample does not have exactly
        three distinct timepoints.
    """
    all_samples = list(df[sample_id_col].unique())
    group_df = df.groupby(groupby)

    for sample in all_samples:
        # work on a copy: the ordering helper assigns to the 'timepoint' column
        # and must not write into a slice of the caller's dataframe
        plot_df = group_df.get_group(sample).copy()
        plot_df = order_timepoint_col(plot_df, ordered_timepoint_list)

        sample_unique_timepoints = list(plot_df['timepoint'].unique())

        timepoint_telo_values_dict = {
            timepoint: plot_df.loc[plot_df['timepoint'] == timepoint, x]
            for timepoint in sample_unique_timepoints
        }

        # the branch unpacks exactly three series, so it is gated on 3
        # (it was gated on 4, which could never match a three-way unpack
        # of four timepoints)
        if num_samps_per_group == 3:
            if len(sample_unique_timepoints) != 3:
                raise ValueError(
                    f"sample {sample!r} has {len(sample_unique_timepoints)} "
                    "distinct timepoints, expected 3"
                )
            initial_timept, second_timept, third_timept = map(
                timepoint_telo_values_dict.get, sample_unique_timepoints
            )

        # TODO: handle other values of num_samps_per_group and draw the histograms


In [131]:
def order_timepoint_col(df, ordered_timepoint_list):
    """Return a copy of `df` sorted by sample id, then by timepoint in the given order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'sample id' and 'timepoint' columns. It is not modified.
    ordered_timepoint_list : list
        Timepoint labels in the desired sort order. Labels present in `df`
        but missing from this list become NaN and sort to the end.

    Returns
    -------
    pandas.DataFrame
        New frame whose 'timepoint' column is categorical with
        `ordered_timepoint_list` as its categories, sorted by
        ['sample id', 'timepoint'] and re-indexed from 0.
    """
    # set_categories(..., inplace=True) was removed in pandas 2.0, so the
    # returned Series is used instead
    timepoint = df['timepoint'].astype('category').cat.set_categories(ordered_timepoint_list)

    # assign() builds a new frame: the caller's df (often a groupby slice) is
    # left untouched and no SettingWithCopyWarning is raised
    return (
        df.assign(timepoint=timepoint)
        .sort_values(['sample id', 'timepoint'])
        .reset_index(drop=True)
    )

In [132]:
# ordered timepoint labels, used to sort each sample's rows by timepoint
timepoint_list = ['L-180', 'L-60', 'FD45', 'FD90', 'FD140', 'FD260', 'R+5', 'R+7', 'R+60', 'R+105', 'R+180', 'R+270']

# x names the column whose values are collected per timepoint; sample ids are both
# read from and grouped on the 'sample id' column
histogram_plot_groups(x='individual telos', df=test_df2, 
                      sample_id_col='sample id', groupby='sample id', num_samps_per_group=3, ordered_timepoint_list=timepoint_list)