In [1]:
# tools for handling files
import sys
import os

# pandas/numpy for handling data
import pandas as pd
import numpy as np
from pandas import ExcelWriter
from pandas import ExcelFile

# for reading individual telomere length data from files
from ast import literal_eval

# for grabbing individual cells
import more_itertools

# my module containing functions for handling/visualizing/analyzing telomere length/chr rearrangement data
import telomere_methods_rad_patient as telo_mrp

# incase reloading modules is required
import importlib
%load_ext autoreload
%autoreload 2

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


---
##### &nbsp; 

...

##### &nbsp; 
---

### Extracting telomere length data from all radiation therapy patients

In [18]:
all_patients_dict = telo_mrp.generate_dictionary_from_TeloLength_and_Chr_aberr_Data('../raw patient teloFISH data/')

SW9A non irrad.xlsx data extraction in progress..
BJ1 for SW9_.xlsx data extraction in progress..
SW11A non irrad.xlsx data extraction in progress..
BJ1 for SW15_.xlsx data extraction in progress..
SW6A non irrad.xlsx data extraction in progress..
SW6A irrad @ 4 Gy.xlsx data extraction in progress..
SW8B.xlsx data extraction in progress..
SW14A irrad @ 4 Gy.xlsx data extraction in progress..
SW8A irrad @ 4 Gy.xlsx data extraction in progress..
SW5A irrad @ 4 Gy.xlsx data extraction in progress..
SW8C.xlsx data extraction in progress..
SW1A non irrad.xlsx data extraction in progress..
BJ1 for SW11_.xlsx data extraction in progress..
SW16A non irrad.xlsx data extraction in progress..
BJ1 for SW13_.xlsx data extraction in progress..
BJ-hTERT for SW9_.xlsx data extraction in progress..
BJ1 for SW14_.xlsx data extraction in progress..
SW9B.xlsx data extraction in progress..
BJ1 for SW8_.xlsx data extraction in progress..
SW_1_ok_3_C_.xlsx data extraction in progress..


  mns = a.mean(axis=axis)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret, rcount, out=ret, casting='unsafe', subok=False)


SW3A irrad @ 4 Gy.xlsx data extraction in progress..
SW11A irrad @ 4 Gy.xlsx data extraction in progress..
BJ1 for SW16_.xlsx data extraction in progress..
BJ1 for SW12_.xlsx data extraction in progress..
SW8A non irrad.xlsx data extraction in progress..
BJ-hTERT for SW8_.xlsx data extraction in progress..
SW10A non irrad.xlsx data extraction in progress..
SW12A irrad @ 4 Gy.xlsx data extraction in progress..
SW9C.xlsx data extraction in progress..
BJ1 for SW10_.xlsx data extraction in progress..
SW7A non irrad.xlsx data extraction in progress..
SW1A irrad @ 4 Gy.xlsx data extraction in progress..
SW13A irrad @ 4 Gy.xlsx data extraction in progress..
SW1B.xlsx data extraction in progress..
BJ-hTERT for SW6_.xlsx data extraction in progress..
SW13B.xlsx data extraction in progress..
BJ1 for SW2_.xlsx data extraction in progress..
SW2A non irrad.xlsx data extraction in progress..
SW5C.xlsx data extraction in progress..
SW15C.xlsx data extraction in progress..
SW7C.xlsx data extraction in

### Making dataframe from dict w/ all patients telomere length data, contains telo means & individual telos as list

In [7]:
# making df w/ all telomere data
all_patients_df = telo_mrp.generate_dataframe_from_dict_and_generate_histograms_stats(all_patients_dict)

# counting telomeres per quartile
all_patients_df = telo_mrp.calculate_apply_teloQuartiles_dataframe(all_patients_df)
all_patients_df['Q1'] = all_patients_df['Q1'].astype('float64')
all_patients_df['Q2-3'] = all_patients_df['Q2-3'].astype('float64')
all_patients_df['Q4'] = all_patients_df['Q4'].astype('float64')

# don't need telo means per cell @ this time
all_patients_df = all_patients_df.drop(['cell data'], axis=1)

print(all_patients_df.shape)
all_patients_df.head(4)

To display graphs pass the value "yes graphs" to the function otherwise default option="no graphs"
(59, 9)


Unnamed: 0,patient id,timepoint,telo data,chr data,status,telo means,Q1,Q2-3,Q4
0,1,1 non irrad,0 78.134078 1 82.357542 2 ...,chr data,IT WORKS PEGGY <333,84.796738,1195.0,2214.0,1191.0
1,1,2 irrad @ 4 Gy,0 137.262570 1 115.089385 2 ...,chr data,IT WORKS PEGGY <333,90.975987,724.0,2343.0,1533.0
2,1,3 B,0 95.027933 1 93.972067 2 ...,chr data,IT WORKS PEGGY <333,116.780229,231.0,1448.0,2921.0
3,1,4 C,0 124.592179 1 134.062806 2 ...,chr data,IT WORKS PEGGY <333,99.346663,372.0,2210.0,2018.0


### Saving all patients telomere length data for later retrieval

In [8]:
# changing telo data to list in prep for saving to csv

all_patients_df['telo data'] = all_patients_df['telo data'].apply(lambda row: row.tolist())
all_patients_df.to_csv('../compiled patient data csv files/all_patients_df.csv', index=False)

### Generating all patients telo df containing telo counts per quartile melted into tidy data format 

In [9]:
melted_all_patients_df = pd.melt(
    all_patients_df,
    id_vars = [col for col in all_patients_df.columns if col != 'Q1' and col != 'Q2-3' and col != 'Q4'],
    var_name='relative Q',
    value_name='Q freq counts')

melted_all_patients_df['Q freq counts'] = melted_all_patients_df['Q freq counts'].astype('float64')
melted_all_patients_df.head(4)

Unnamed: 0,patient id,timepoint,telo data,chr data,status,telo means,relative Q,Q freq counts
0,1,1 non irrad,"[78.13407813845714, 82.35754182161699, 30.6201...",chr data,IT WORKS PEGGY <333,84.796738,Q1,1195.0
1,1,2 irrad @ 4 Gy,"[137.26256970269498, 115.0893853661058, 89.748...",chr data,IT WORKS PEGGY <333,90.975987,Q1,724.0
2,1,3 B,"[95.02793287109652, 93.97206695030657, 185.832...",chr data,IT WORKS PEGGY <333,116.780229,Q1,231.0
3,1,4 C,"[124.59217865321546, 134.06280604091415, 45.40...",chr data,IT WORKS PEGGY <333,99.346663,Q1,372.0


### Saving melted all patients df to csv

In [10]:
melted_all_patients_df.to_csv('../compiled patient data csv files/melted_all_patients_df.csv', index=False)

### Dataframe w/ timepoints as columns, and telomere length means for each patient timepoint in rows

In [11]:
pivot_patients_telo_means_df = all_patients_df.pivot(index='patient id', columns='timepoint', values='telo means')
pivot_patients_telo_means_df = pivot_patients_telo_means_df.drop(13)
pivot_patients_telo_means_df.head(4)

timepoint,1 non irrad,2 irrad @ 4 Gy,3 B,4 C
patient id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,84.796738,90.975987,116.780229,99.346663
2,119.774143,133.199629,159.828115,108.915685
3,83.351204,87.295754,101.433049,95.669819
5,85.506373,113.096219,118.340987,97.832439


### Saving pivoted telo means df to file

In [12]:
pivot_patients_telo_means_df.to_csv('../compiled patient data csv files/pivot_patients_telo_means_df.csv', index=False)

### Exploding individual telomere length measurements from contained list into dataframe (i.e row per individual telomere) while retaining related column info

In [15]:
# can imagine the lists containing the individual telos per patient exploding to the right; maintains the index relationship
explode_telos_raw = all_patients_df['telo data'].apply(pd.Series)

print(explode_telos_raw.shape)
explode_telos_raw.head(4)

(59, 4600)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4590,4591,4592,4593,4594,4595,4596,4597,4598,4599
0,78.134078,82.357542,30.620112,158.379888,68.631285,154.156424,70.743017,85.52514,66.519553,53.849162,...,97.139665,138.318436,70.743017,77.078212,39.067039,87.636871,71.798883,63.351955,120.368715,48.569832
1,137.26257,115.089385,89.748603,90.804469,80.24581,99.251397,80.24581,68.631285,46.458101,104.530726,...,60.184357,116.145251,74.96648,68.631285,96.083799,74.96648,83.413408,73.910614,81.301676,81.301676
2,95.027933,93.972067,185.832402,117.201117,121.424581,211.173184,91.860335,98.195531,117.201117,85.52514,...,116.145251,131.98324,80.24581,122.480447,85.52514,147.821229,133.039106,92.916201,54.905028,114.033519
3,124.592179,134.062806,45.402235,97.139665,70.743017,59.128492,103.47486,74.96648,131.98324,59.128492,...,101.363128,83.413408,134.094972,140.430167,155.21229,62.296089,107.698324,103.47486,179.011213,158.75509


In [16]:
exploded_telos_all_patients_df = (explode_telos_raw
                                  
    # we'll merge the exploded telos df w/ our original all patients df on the index!
    .merge(all_patients_df, right_index = True, left_index = True)
    .drop(['telo data', 'Q1', 'Q2-3', 'Q4'], axis = 1)
    .melt(id_vars = ['patient id', 'timepoint', 'chr data', 'status', 'telo means'], value_name = "telo data exploded") 
    .drop("variable", axis = 1)
    .dropna())

# exploded_telos_all_patients_df['telo data exploded'] = exploded_telos_all_patients_df['telo data exploded'].astype('float64')
exploded_telos_all_patients_df.head(4)

Unnamed: 0,patient id,timepoint,chr data,status,telo means,individual telomeres
0,1,1 non irrad,chr data,IT WORKS PEGGY <333,84.796738,78.134078
1,1,2 irrad @ 4 Gy,chr data,IT WORKS PEGGY <333,90.975987,137.26257
2,1,3 B,chr data,IT WORKS PEGGY <333,116.780229,95.027933
3,1,4 C,chr data,IT WORKS PEGGY <333,99.346663,124.592179


### Saving exploded telomere df for later retrieval

In [17]:
exploded_telos_all_patients_df.to_csv('../compiled patient data csv files/exploded_telos_all_patients_df.csv', index=False)

### Extracting average telomere length data by qPCR data from Aidan & Lynn

In [2]:
raw_IMRT_telo_qPCR = (pd.read_excel(
                    "../qPCR telo data/Tel +Alb (both plates) 2019-08-05 13 Quantification Cq Results.xlsx", 
                    skiprows=[0],))

raw_IMRT_telo_qPCR.columns

Index(['Unnamed: 0', 'Well', 'Content', 'Sample', 'Biological Set Name', 'Cq',
       'Cq Mean', 'Cq Std. Dev', 'Starting Quantity (SQ)',
       'Log Starting Quantity', 'SQ Mean', 'SQ Std. Dev', 'Set Point',
       'Well Note', 'Tel SQ / Alb SQ', 'Normalized T/A SQ',
       'Avg. Normalized  T/A SQ', 'SE', 'Unnamed: 18', 'Sample.1',
       'Avg. Normalized  T/A SQ.1', 'SE.1', 'Unnamed: 22', 'Unnamed: 23',
       'Sample.2', 'Mean  Patient Average Normalized T/A SQ',
       'Mean Patient SE'],
      dtype='object')

In [3]:
# grabbing triplicate mean telomere length from qPCR plate 1
trim_plate_1 = raw_IMRT_telo_qPCR.iloc[10:82, [3, 15]]

# changing errant sample ID name
trim_plate_1['Sample'] = trim_plate_1['Sample'].apply(lambda row: telo_mrp.change_sample_ID(row))

# grabbing triplicate mean telomere length from qPCR plate 2
trim_plate_2 = raw_IMRT_telo_qPCR.iloc[100:163, [3, 15]]

In [4]:
full_IMRT_qPCR_telos = pd.concat([trim_plate_1, trim_plate_2], axis=0).reset_index(drop=True)
# full_IMRT_qPCR_telos

In [5]:
full_IMRT_qPCR_telos['timepoint'] = full_IMRT_qPCR_telos['Sample'].apply(lambda row: telo_mrp.make_timepoint_col(row))
full_IMRT_qPCR_telos['patient id'] = full_IMRT_qPCR_telos['Sample'].apply(lambda row: telo_mrp.make_patient_ID(row))
full_IMRT_qPCR_telos.drop(labels=['Sample'], axis=1, inplace=True)

In [6]:
full_IMRT_qPCR_telos = full_IMRT_qPCR_telos.rename(columns={'Normalized T/A SQ':'mean telomere length by qPCR'})
full_IMRT_qPCR_telos = full_IMRT_qPCR_telos[['patient id', 'timepoint', 'mean telomere length by qPCR']]
full_IMRT_qPCR_telos.shape

(135, 3)

In [8]:
groupby_IMRT_qPCR_telos = full_IMRT_qPCR_telos.groupby(['patient id', 'timepoint']).agg(['mean', 'std'])
# groupby_IMRT_qPCR_telos.columns = groupby_IMRT_qPCR_telos.columns.to_flat_index()
groupby_IMRT_qPCR_telos.reset_index(inplace=True)

In [10]:
groupby_IMRT_qPCR_telos['patient id'] = groupby_IMRT_qPCR_telos['patient id'].astype('int')
groupby_IMRT_qPCR_telos = groupby_IMRT_qPCR_telos.sort_values(by='patient id', axis=0, ascending=True).reset_index(drop=True)
groupby_IMRT_qPCR_telos.columns = ['patient id', 'timepoint', 'mean telomere length by qPCR', 'mean telomere length std dev']

Unnamed: 0_level_0,patient id,timepoint,mean telomere length by qPCR,mean telomere length by qPCR
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std
0,1,1 non irrad,2.054601,0.135746
1,1,3 B,1.849633,0.084712
2,1,4 C,1.647613,0.111219
3,2,4 C,1.56115,0.030569
4,2,3 B,1.676137,0.107934
5,2,1 non irrad,1.571951,0.056717
6,3,4 C,1.461543,0.059352
7,3,3 B,1.466218,0.08409
8,3,1 non irrad,1.275436,0.029833
9,5,1 non irrad,1.35748,0.029001


In [220]:
all_patients_df = pd.read_csv('../compiled patient data csv files/all_patients_df.csv')

In [241]:
all_patients_df_qPCR = all_patients_df.merge(groupby_IMRT_qPCR_telos, on=['patient id', 'timepoint'])
all_patients_df_qPCR.corr()

Unnamed: 0,patient id,telo means,Q1,Q2-3,Q4,mean telomere length by qPCR,mean telomere length std dev
patient id,1.0,-0.482406,0.079048,-0.22165,0.054,-0.581373,-0.195657
telo means,-0.482406,1.0,-0.38244,-0.163581,0.366317,0.306214,0.163859
Q1,0.079048,-0.38244,1.0,0.257396,-0.871351,-0.059355,-0.005582
Q2-3,-0.22165,-0.163581,0.257396,1.0,-0.69841,0.295792,0.103288
Q4,0.054,0.366317,-0.871351,-0.69841,1.0,-0.106232,-0.048312
mean telomere length by qPCR,-0.581373,0.306214,-0.059355,0.295792,-0.106232,1.0,0.390249
mean telomere length std dev,-0.195657,0.163859,-0.005582,0.103288,-0.048312,0.390249,1.0
