In [1]:
# general purpose modules for handling data
import numpy as np
from numpy import array
import pandas as pd

# for loading telo data column containing individual
# telomere length values
from ast import literal_eval

# custom module for handling telomere length data
import telomere_methods_astros as telo_ma

In [2]:
import importlib
# Load the autoreload extension and switch it to mode 2 so that edits to
# imported modules (e.g. telomere_methods_astros) are picked up live.
%load_ext autoreload
%autoreload 2
# NOTE(review): the extension is already loaded two lines above, so this
# reload looks redundant — confirm before removing.
%reload_ext autoreload

---
&nbsp; 

The dataframe containing our telomere length data already contains the means. However, we're interested in analyzing (graphing, stats) the individual telomere length measurements. To do so, we must extract the values from the list while keeping each value associated with its timepoint/individual. We'll achieve this by 'exploding' the cell containing the individual telomere length measurements (stored as a list) into one row per measurement (5520 rows per sample: 184 telomeres per metaphase × 30 metaphases per sample).

As well, we're interested in defining and examining the prevalence of short/long telomeres; this is achieved by finding the values that divide the data into bottom 25% (short telos), middle 50%, and top 25% (long telos). More on that soon.
 
&nbsp;
___

# Transforming telomere length data (teloFISH)

In [3]:
# read the compiled per-sample telomere length table
astro_df = pd.read_csv('../data/compiled data/All_astronauts_telomere_length_dataframe.csv')

# the 'telo data' column arrives as text; literal_eval turns each entry back
# into a Python list, which is then wrapped in a numpy array
astro_df['telo data'] = astro_df['telo data'].map(
    lambda cell: np.array(literal_eval(cell)))

In [4]:
# preview the first four samples
astro_df.head(4)

Unnamed: 0,astro number,astro id,timepoint,flight status,telo data,telo means,Q1,Q2-3,Q4
0,1,5163,L-270,Pre-Flight,"[132.79318409622454, 73.62178416304711, 63.314...",87.67212,telos preF Q1 <0.25,telos preF Q2-3 >0.25 & <0.75,telos preF Q4 >0.75
1,1,5163,L-180,Pre-Flight,"[72.03474774473773, 85.83361176077514, 65.6364...",101.077756,telos preF Q1 <0.25,telos preF Q2-3 >0.25 & <0.75,telos preF Q4 >0.75
2,1,5163,L-60,Pre-Flight,"[89.55897093217507, 95.80688272636151, 79.8362...",128.599235,telos preF Q1 <0.25,telos preF Q2-3 >0.25 & <0.75,telos preF Q4 >0.75
3,1,5163,FD90,Mid-Flight,"[77.53090544604076, 71.38322753090544, 83.7788...",101.183129,telos preF Q1 <0.25,telos preF Q2-3 >0.25 & <0.75,telos preF Q4 >0.75


## Exploding list containing individual telomere length measurements into rows

This action explodes the list into columns, one per datapoint.
Picture the list, which contains the individual values, expanding to the
right, up to 5520 measurements.

Importantly, the index values per row still refer to the timepoint/sample,
so we can merge these columns back onto the original astro_df.
Then we'll melt the columns into one, resulting in many rows,
each holding one individual telomere length measurement.

In [5]:
# Expand each sample's array of measurements into one column per measurement.
# The row index is preserved, which is what lets the result be merged back
# onto astro_df afterwards.
# NOTE(review): .apply(pd.Series) constructs one Series per row and may be
# slow on larger inputs.
explode_telos_raw = astro_df['telo data'].apply(pd.Series)

In [6]:
# preview the wide layout: one row per sample, one column per measurement
explode_telos_raw.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5510,5511,5512,5513,5514,5515,5516,5517,5518,5519
0,132.793184,73.621784,63.3144,83.929168,139.776144,89.475443,81.740728,94.303375,74.5573,76.729034,...,87.253592,82.642833,67.423989,102.656198,122.819913,105.67992,103.575008,109.856331,105.095222,127.313732
1,72.034748,85.833612,65.636485,81.724023,185.64985,144.019379,138.706983,90.56131,41.162713,117.858336,...,71.817574,126.762446,78.115603,68.359506,75.008353,106.932843,130.771801,116.739058,102.405613,58.703642


In [7]:
# columns that stay constant for every individual telomere of a sample
telo_id_vars = ['astro number', 'astro id', 'timepoint', 'flight status', 'telo means']

# attach the measurement columns to the sample metadata, aligned on the row index
telos_wide = pd.merge(explode_telos_raw, astro_df, left_index=True, right_index=True)

# the list column and the quartile columns are not needed in the long table
telos_wide = telos_wide.drop(columns=['telo data', 'Q1', 'Q2-3', 'Q4'])

# one row per individual measurement; discard the melt helper column and empty slots
exploded_telos_astro_df = (
    telos_wide
    .melt(id_vars=telo_id_vars, value_name="telo data exploded")
    .drop(columns="variable")
    .dropna()
)

exploded_telos_astro_df.shape

(436080, 6)

In [8]:
# Take a real copy: plain assignment would only bind a second name to the same
# DataFrame, so the "copy" would not be independent of exploded_telos_astro_df.
copy_exploded_telos_astros_df = exploded_telos_astro_df.copy()

# persist the long-format table (row index omitted)
copy_exploded_telos_astros_df.to_csv('../data/compiled and processed data/exploded_telos_astros_df.csv', index = False)

## Loading astro telomere length data per cell, exploding cell data, saving to file

In [9]:
# read the compiled per-cell telomere length table for the astronauts
astro_cells_df = pd.read_csv('../data/compiled data/All_astronauts_telomere_length_per_cell_dataframe.csv')

# each entry is stored as text; parse it back into a numpy array
astro_cells_df['telo data per cell'] = astro_cells_df['telo data per cell'].map(
    lambda cell: np.array(literal_eval(cell)))

# sanity check: how many per-cell values the first sample holds
astro_cells_df.loc[0, 'telo data per cell'].shape

(30,)

In [10]:
# Expand each sample's array of per-cell values into one column per value,
# keeping the row index so the result can be merged back onto astro_cells_df.
# NOTE(review): .apply(pd.Series) constructs one Series per row and may be
# slow on larger inputs.
explode_cells_raw = astro_cells_df['telo data per cell'].apply(pd.Series)

In [11]:
# sample-level columns repeated on every exploded row
cell_id_vars = ['astro number', 'astro id', 'timepoint', 'flight status', 'telo means']

# attach the per-cell value columns to the sample metadata, aligned on the row index
cells_wide = pd.merge(explode_cells_raw, astro_cells_df, left_index=True, right_index=True)
cells_wide = cells_wide.drop(columns=['telo data per cell'])

# one row per cell value; discard the melt helper column and empty slots
exploded_cells_astro_df = (
    cells_wide
    .melt(id_vars=cell_id_vars, value_name="telo data per cell")
    .drop(columns="variable")
    .dropna()
)

# sanity check: row count for a single astronaut/timepoint
is_5163_L270 = (
    (exploded_cells_astro_df['astro id'] == 5163)
    & (exploded_cells_astro_df['timepoint'] == 'L-270')
)
exploded_cells_astro_df[is_5163_L270].shape

(30, 6)

In [12]:
# persist the long-format per-cell astronaut table (row index omitted)
exploded_cells_astro_df.to_csv('../data/compiled and processed data/exploded_cells_astros_df.csv', index = False)

## Loading astro controls telomere length data per cell, exploding cell data, saving to file

In [13]:
# read the compiled per-cell telomere length table for the controls
ctrl_cells_df = pd.read_csv('../data/compiled data/All_controls_telomere_length_per_cell_dataframe.csv')

# each entry is stored as text; parse it back into a numpy array
ctrl_cells_df['telo data per cell'] = ctrl_cells_df['telo data per cell'].map(
    lambda cell: np.array(literal_eval(cell)))

# sanity check: how many per-cell values the first sample holds
ctrl_cells_df.loc[0, 'telo data per cell'].shape

(30,)

In [14]:
# Expand each control sample's array of per-cell values into one column per
# value, keeping the row index so the result can be merged back onto ctrl_cells_df.
# NOTE(review): .apply(pd.Series) constructs one Series per row and may be
# slow on larger inputs.
explode_cells_ctrl_raw = ctrl_cells_df['telo data per cell'].apply(pd.Series)
# preview the source table (not the exploded one)
ctrl_cells_df.head(4)

Unnamed: 0,control id,timepoint,flight status controls,telo data per cell,telo means
0,100,L-270,Pre-Flight,"[0.8477957561098561, 1.0075334075822138, 0.858...",1.013536
1,100,L-180,Pre-Flight,"[1.3042379742777852, 1.2996318440034278, 1.366...",1.268621
2,100,L-60,Pre-Flight,"[1.1800698226000297, 1.0767199987117924, 1.087...",0.909948
3,100,R+60,Post-Flight,"[1.5899987895105754, 1.0877994067201529, 1.071...",1.321577


In [15]:
# sample-level columns repeated on every exploded row
ctrl_id_vars = ['control id', 'timepoint', 'flight status controls', 'telo means']

# attach the per-cell value columns to the control metadata, aligned on the row index
ctrl_cells_wide = pd.merge(explode_cells_ctrl_raw, ctrl_cells_df, left_index=True, right_index=True)
ctrl_cells_wide = ctrl_cells_wide.drop(columns=['telo data per cell'])

# one row per cell value; discard the melt helper column and empty slots
exploded_cells_ctrl_df = (
    ctrl_cells_wide
    .melt(id_vars=ctrl_id_vars, value_name="telo data per cell")
    .drop(columns="variable")
    .dropna()
)

exploded_cells_ctrl_df

Unnamed: 0,control id,timepoint,flight status controls,telo means,telo data per cell
0,100,L-270,Pre-Flight,1.013536,0.847796
1,100,L-180,Pre-Flight,1.268621,1.304238
2,100,L-60,Pre-Flight,0.909948,1.180070
3,100,R+60,Post-Flight,1.321577,1.589999
4,397,L-270,Pre-Flight,0.932568,0.752803
5,397,L-180,Pre-Flight,0.754504,0.440275
6,397,L-60,Pre-Flight,1.202129,0.909556
7,397,R+60,Post-Flight,0.832499,1.017753
8,397,R+180,Post-Flight,0.964484,1.144038
9,397,R+270,Post-Flight,1.016254,0.954731


In [16]:
# label every control row with a constant 'flight status' value
exploded_cells_ctrl_df.loc[:, 'flight status'] = 'Controls'

In [17]:
# persist the long-format per-cell controls table (row index omitted)
exploded_cells_ctrl_df.to_csv('../data/compiled and processed data/exploded_cells_ctrl_df.csv', index = False)

## Defining short/mid/long telomeres w/ quartiles and counting per timepoint per astro

This function identifies the timepoint per astronaut most distal to spaceflight (L-270 or L-180) and identifies the individual telomere length values which divide the data into the bottom 25% (short telos), mid 50%, and top 25% (long telos). The function then counts how many telos reside in each quartile. Now we have a way to define short/long telomeres.

The function then applies those quartile cutoff values to subsequent datapoints and counts how many telomeres reside within each quartile. In doing so, we count the number of telomeres moving into/out of the quartiles, per timepoint per astronaut, and can quantitatively discuss the number of short/long telos.

In [6]:
# NOTE(review): this cell's execution count is out of sequence, and its saved
# output shows Q1/Q2-3/Q4 already holding counts — it looks like it was last
# run after make_quartiles_columns(astro_df). Confirm with Restart Kernel ->
# Run All.
astro_df.head()

Unnamed: 0,astro number,astro id,timepoint,flight status,telo data,telo means,Q1,Q2-3,Q4
0,1,5163,L-270,Pre-Flight,"[132.79318409622454, 73.62178416304711, 63.314...",87.67212,[1384],[2751],[1385]
1,1,5163,L-180,Pre-Flight,"[72.03474774473773, 85.83361176077514, 65.6364...",101.077756,[848],[2142],[2530]
2,1,5163,L-60,Pre-Flight,"[89.55897093217507, 95.80688272636151, 79.8362...",128.599235,[259],[1094],[4167]
3,1,5163,FD90,Mid-Flight,"[77.53090544604076, 71.38322753090544, 83.7788...",101.183129,[472],[2516],[2532]
4,1,5163,FD140,Mid-Flight,"[145.8904109589041, 126.2278650183762, 225.208...",129.85197,[203],[974],[4343]


In [5]:
# Build the quartile count columns (Q1, Q2-3, Q4) with the project helper.
# NOTE(review): the saved astro_df.head() output in the preceding cell suggests
# the helper may also modify astro_df itself — confirm in
# telomere_methods_astros before relying on astro_df being unchanged.
quartiles_astro_df = telo_ma.make_quartiles_columns(astro_df)

In [19]:
# cast each quartile count column to int64
quartile_cols = ('Q1', 'Q2-3', 'Q4')
for quartile_col in quartile_cols:
    quartile_counts = quartiles_astro_df[quartile_col]
    quartiles_astro_df[quartile_col] = quartile_counts.astype('int64')

# inspect a single astronaut
is_astro_2171 = quartiles_astro_df['astro id'] == 2171
quartiles_astro_df[is_astro_2171]

Unnamed: 0,astro number,astro id,timepoint,flight status,telo data,telo means,Q1,Q2-3,Q4
30,5,2171,L-180,Pre-Flight,"[111.3913043478261, 102.83229813664596, 52.484...",127.863014,1381,2759,1380
31,5,2171,L-60,Pre-Flight,"[80.64596273291926, 84.75776397515529, 119.378...",113.106425,1614,3235,671
32,5,2171,FD45,Mid-Flight,"[145.19254658385094, 180.3975155279503, 182.95...",135.290127,1413,2112,1995
33,5,2171,FD260,Mid-Flight,"[156.24844720496895, 98.54658385093168, 81.763...",134.60034,1230,2663,1627
34,5,2171,R+105,Post-Flight,"[83.44099378881988, 104.63354037267081, 76.559...",120.721879,1793,2600,1127
35,5,2171,R+180,Post-Flight,"[74.2360248447205, 128.472049689441, 193.16770...",112.335962,1558,3379,583
36,5,2171,R+270,Post-Flight,"[89.22981366459628, 34.298136645962735, 73.639...",96.448582,2498,2837,185


In [20]:
# every column except the three quartile count columns identifies the sample
quartile_value_cols = ['Q1', 'Q2-3', 'Q4']
sample_id_cols = [name for name in quartiles_astro_df.columns
                  if name not in quartile_value_cols]

# long format: one row per (sample, quartile) with its count
melted_quartiles_astro_df = quartiles_astro_df.melt(
    id_vars=sample_id_cols,
    var_name='relative Q',
    value_name='Q freq counts',
)

# counts as integers, astronaut ids as strings
melted_quartiles_astro_df = melted_quartiles_astro_df.astype(
    {'Q freq counts': 'int64', 'astro id': 'str'})

In [21]:
# preview one astronaut; 'astro id' is compared as a string because the column
# was cast to str in the previous cell
melted_quartiles_astro_df[melted_quartiles_astro_df['astro id'] == '5163'].head(8)

Unnamed: 0,astro number,astro id,timepoint,flight status,telo data,telo means,relative Q,Q freq counts
0,1,5163,L-270,Pre-Flight,"[132.79318409622454, 73.62178416304711, 63.314...",87.67212,Q1,1384
1,1,5163,L-180,Pre-Flight,"[72.03474774473773, 85.83361176077514, 65.6364...",101.077756,Q1,848
2,1,5163,L-60,Pre-Flight,"[89.55897093217507, 95.80688272636151, 79.8362...",128.599235,Q1,259
3,1,5163,FD90,Mid-Flight,"[77.53090544604076, 71.38322753090544, 83.7788...",101.183129,Q1,472
4,1,5163,FD140,Mid-Flight,"[145.8904109589041, 126.2278650183762, 225.208...",129.85197,Q1,203
5,1,5163,R+7,Post-Flight,"[80.4209822920147, 102.90678249248246, 70.8987...",82.169298,Q1,1876
6,1,5163,R+60,Post-Flight,"[138.84062813230872, 76.2278650183762, 138.807...",100.116703,Q1,566
7,1,5163,R+180,Post-Flight,"[148.59672569328433, 182.6094219846308, 97.594...",115.40319,Q1,936


In [22]:
# Take a real copy: plain assignment only aliases the frame, so converting the
# arrays below would also rewrite melted_quartiles_astro_df, and re-running
# this cell would then fail because lists have no .tolist().
copy_melted_quartiles_astro_df = melted_quartiles_astro_df.copy()

# Store each array as a plain list before writing — presumably so the CSV
# holds the full comma-separated list that literal_eval can parse on reload.
copy_melted_quartiles_astro_df['telo data'] = copy_melted_quartiles_astro_df['telo data'].apply(lambda row: row.tolist())

# save the converted copy (row index omitted)
copy_melted_quartiles_astro_df.to_csv('../data/compiled and processed data/melted_quartiles_astro_df.csv', index = False)

# Cleaning astronaut urine biochem data (n=11)

In [23]:
# load the compiled urine biochemistry table and display it
urine_biochem_data = pd.read_csv('../data/compiled data/urine_biochem_data.csv')
urine_biochem_data

Unnamed: 0,biochemistry,Pre,FD15,FD30,FD60,FD120,FD180,R+0 day 1,R+0 day 2,R+30 day 1,R+30 day 2,sample type
0,"8-OHdG, Urine ug/gCr",2.6 ± 1,3.1 ± 1,3 ± 1,3.6 ± 1.2,3 ± 1,2.5 ± 0.6,2.4 ± 0.8,2.4 ± 1,1.7 ± 0.7,1.8 ± 1,urine
1,"Copper, Urine umol/day",0.28 ± 0.17,0.24 ± 0.04,0.19 ± 0.02,0.24 ± 0.04,0.21 ± 0.05,0.24 ± 0.08,0.58 ± 0.71,0.25 ± 0.07,0.26 ± 0.1,0.34 ± 0.32,urine
2,PGF2-alpha ng/mg Cr,1.84 ± 1.2,2.22 ± 0.94,2.48 ± 1.05,2.92 ± 1.38,2.7 ± 1.26,3.12 ± 1.14,1.53 ± 1.06,1.59 ± 0.91,1.88 ± 1.12,1.72 ± 1.11,urine
3,"Selenium, Urine umol/day",1.02 ± 0.39,1.11 ± 0.35,1.08 ± 0.3,1.26 ± 0.54,1.14 ± 0.42,1.19 ± 0.42,0.88 ± 0.21,0.94 ± 0.27,0.96 ± 0.31,1.03 ± 0.41,urine


In [24]:
# reshape wide -> long: one row per (analyte, timepoint) pair
melt_urine_biochem_data = urine_biochem_data.melt(
    id_vars=['biochemistry', 'sample type'],
    var_name='timepoint',
    value_name='measurement',
)

melt_urine_biochem_data.head(4)

Unnamed: 0,biochemistry,sample type,timepoint,measurement
0,"8-OHdG, Urine ug/gCr",urine,Pre,2.6 ± 1
1,"Copper, Urine umol/day",urine,Pre,0.28 ± 0.17
2,PGF2-alpha ng/mg Cr,urine,Pre,1.84 ± 1.2
3,"Selenium, Urine umol/day",urine,Pre,1.02 ± 0.39


In [27]:
def make_flight_status_via_timepoint(row):
    """Map a timepoint label to its flight status.

    The label is searched for the markers below, in order, and the first
    match wins: 'Pre' or 'L' -> 'Pre-Flight', 'FD' -> 'Mid-Flight',
    'R' -> 'Post-Flight'.

    Parameters
    ----------
    row : str
        A timepoint label such as 'Pre', 'L-270', 'FD90' or 'R+0 day 1'.

    Returns
    -------
    str or None
        The flight status, or None when `row` is not a string (e.g. NaN)
        or contains none of the markers.
    """
    # A non-string label (NaN, None, a number) cannot be searched for a
    # substring; report "no status" instead of raising TypeError.
    if not isinstance(row, str):
        return None

    # Order matters: earlier markers take precedence over later ones.
    marker_to_status = (
        ('Pre', 'Pre-Flight'),
        ('L', 'Pre-Flight'),
        ('FD', 'Mid-Flight'),
        ('R', 'Post-Flight'),
    )
    for marker, status in marker_to_status:
        if marker in row:
            return status

    # no marker found
    return None

In [28]:
def grab_number_remove_plusminus(row):
    """Extract the leading number from a 'value ± spread' measurement.

    Parameters
    ----------
    row : str or number
        A measurement such as '2.6 ± 1'. A bare number ('12' or 12) is
        accepted as well.

    Returns
    -------
    float or None
        The value before the '±' sign (or the bare number) as a float;
        None when `row` holds no number (NaN, None, or a placeholder
        such as '*').

    Raises
    ------
    ValueError
        If `row` contains '±' but the text before it is not a number.
    """
    if isinstance(row, str):
        if '±' in row:
            # keep only the part in front of the sign
            return float(row.split('±')[0].strip())
        candidate = row.strip()
    else:
        candidate = row

    # No '±' present: still accept a bare number instead of discarding it.
    try:
        number = float(candidate)
    except (TypeError, ValueError):
        return None

    # NaN means "missing"; report it as None like any other non-measurement.
    if number != number:
        return None
    return number

In [29]:
# derive the flight status of every row from its timepoint label
urine_timepoints = melt_urine_biochem_data['timepoint']
melt_urine_biochem_data['flight status'] = urine_timepoints.apply(make_flight_status_via_timepoint)

In [30]:
# numeric part of each 'value ± spread' measurement
urine_measurements = melt_urine_biochem_data['measurement']
melt_urine_biochem_data['measured analyte'] = urine_measurements.apply(grab_number_remove_plusminus)

melt_urine_biochem_data.head(4)

Unnamed: 0,biochemistry,sample type,timepoint,measurement,flight status,measured analyte
0,"8-OHdG, Urine ug/gCr",urine,Pre,2.6 ± 1,Pre-Flight,2.6
1,"Copper, Urine umol/day",urine,Pre,0.28 ± 0.17,Pre-Flight,0.28
2,PGF2-alpha ng/mg Cr,urine,Pre,1.84 ± 1.2,Pre-Flight,1.84
3,"Selenium, Urine umol/day",urine,Pre,1.02 ± 0.39,Pre-Flight,1.02


# Reading in astronaut blood biochem data (n=11)

In [31]:
# load the compiled blood biochemistry table and preview the first six analytes
blood_biochem_data = pd.read_csv('../data/compiled data/blood_biochem_data.csv')
blood_biochem_data.head(6)

Unnamed: 0,biochemistry,Pre,FD15,FD30,FD60,FD120,FD180,R+0,R+180,sample type
0,CCL2/MCP-1 pg/ml,83 ± 17,89 ± 21,94 ± 17,96 ± 26,96 ± 25,93 ± 22,182 ± 146,87 ± 23,blood
1,CCL3/MIP-1a pg/ml,438 ± 194,644 ± 214,652 ± 129,642 ± 218,489 ± 89,641 ± 247,310 ± 96,388 ± 105,blood
2,CCL4/MIP1B pg/ml,54 ± 17,75 ± 27,101 ± 34,75 ± 34,75 ± 37,70 ± 28,51 ± 16,48 ± 14,blood
3,CCL5/RANTES pg/ml,6902 ± 4393,15462 ± 2388,17986 ± 601,15022 ± 2726,14227 ± 2755,13636 ± 3964,3890 ± 3563,5569 ± 2539,blood
4,CXCL5/ENA-78 pg/ml,275 ± 549,1233 ± 967,1005 ± 426,1144 ± 823,1009 ± 503,919 ± 501,93 ± 74,134 ± 70,blood
5,CXCL8/IL-8 pg/ml,12 ± 5,23 ± 17,14 ± 7,18 ± 6,12 ± 2,19 ± 9,*,*,blood


In [32]:
# '*' appears in place of some measurements (presumably marking missing
# values); turn it into NaN. np.nan is used because the np.NaN alias was
# removed in NumPy 2.0, and the frame is rebound rather than edited in place.
blood_biochem_data = blood_biochem_data.replace('*', np.nan)

In [33]:
# reshape wide -> long: one row per (analyte, timepoint) pair
melt_blood_biochem_data = blood_biochem_data.melt(
    id_vars=['biochemistry', 'sample type'],
    var_name='timepoint',
    value_name='measurement',
)

In [34]:
# flight status from the timepoint label
blood_timepoints = melt_blood_biochem_data['timepoint']
melt_blood_biochem_data['flight status'] = blood_timepoints.apply(make_flight_status_via_timepoint)

# numeric part of each 'value ± spread' measurement
blood_measurements = melt_blood_biochem_data['measurement']
melt_blood_biochem_data['measured analyte'] = blood_measurements.apply(grab_number_remove_plusminus)

melt_blood_biochem_data.head(4)

Unnamed: 0,biochemistry,sample type,timepoint,measurement,flight status,measured analyte
0,CCL2/MCP-1 pg/ml,blood,Pre,83 ± 17,Pre-Flight,83.0
1,CCL3/MIP-1a pg/ml,blood,Pre,438 ± 194,Pre-Flight,438.0
2,CCL4/MIP1B pg/ml,blood,Pre,54 ± 17,Pre-Flight,54.0
3,CCL5/RANTES pg/ml,blood,Pre,6902 ± 4393,Pre-Flight,6902.0


In [35]:
# give the analyte column the same, more descriptive name in both tables
for biochem_frame in (melt_blood_biochem_data, melt_urine_biochem_data):
    biochem_frame.rename(columns={'biochemistry': 'biochemistry analyte'}, inplace=True)

In [36]:
# persist both long-format biochemistry tables (row index omitted)
melt_urine_biochem_data.to_csv('../data/compiled and processed data/melt_urine_biochem_data.csv', index=False)
melt_blood_biochem_data.to_csv('../data/compiled and processed data/melt_blood_biochem_data.csv', index=False)