In [26]:
# tools for handling files
import sys
import os

# pandas/numpy for handling data
import pandas as pd
import numpy as np
from pandas import ExcelWriter
from pandas import ExcelFile

---
##### &nbsp; 

The chromosome rearrangement data scored by subtelomere Directional Genomic Hybridization (subtelo-dGH) were originally stored in
individual Excel sheets, but have since been combined into a single sheet. The same is true of the telomeric aberrations scored by
TeloFISH - they are in the same file as the subtelo-dGH data. 

subtelo-dGH was performed on pre-/post-flight samples most distal to spaceflight and on midflight samples for astros #1, 2, & 5; as well as distal pre-flight/post-flight samples for all other astros. Telomeric aberrations were scored for astros #1, 2, & 5 at pre-/post-flight most proximal to spaceflight and on midflight samples.

We'll extract all of the data from this excel sheet. We'll generate a dataframe & save it for later use.

##### &nbsp; 
---

### Extracting all telomeric / chr aberr data into one dataframe 

In [27]:
# Load the combined workbook into a single dataframe; the per-timepoint
# tables are carved out of it by position in later cells.

chr_data = '../excel data/Chromosome_Aberrations_telodGH_unrelatedAstros+SK_complete_TeloAberr_astros125___graphs_7_17_19_227pm.xlsx'
nasa_chr_data = pd.read_excel(io=chr_data)

### Parsing pre-flight, midflight1&2, and post-flight telomeric aberration data

In [28]:
# Telomeric aberration scores sit in 8-column-wide blocks of the combined
# sheet. Take the first 90 rows of each block, one block per timepoint
# (pre-flight, mid-flight 1, mid-flight 2, post-flight).
# NOTE(review): columns 171:179 are skipped between mid-flight 2 and
# post-flight — confirm that is intentional.

pre_f_telo_aberr, mid_f1_telo_aberr, mid_f2_telo_aberr, post_f_telo_aberr = (
    nasa_chr_data.iloc[0:90, first_col:first_col + 8]
    for first_col in (147, 155, 163, 179)
)

# column labels exactly as pandas loaded them for the pre-flight block
print(pre_f_telo_aberr.columns)

# preview the first row of every timepoint
display(*[
    timepoint.head(1)
    for timepoint in (pre_f_telo_aberr, mid_f1_telo_aberr,
                      mid_f2_telo_aberr, post_f_telo_aberr)
])

Index(['astro id', 'flight status.3', 'Cell Number.5', 'Image File Number.6',
       '# of Fragile Telos', '# of STL-complete', '#  of STL-hetero',
       '#  of potential threaded chr'],
      dtype='object')


Unnamed: 0,astro id,flight status.3,Cell Number.5,Image File Number.6,# of Fragile Telos,# of STL-complete,# of STL-hetero,# of potential threaded chr
0,2171,pre-flight,1,1,1,10,15,0


Unnamed: 0,astro id.1,flight status.4,Cell Number.6,Image File Number.7,# of Fragile Telos.1,# of STL-complete.1,# of STL-hetero.1,# of potential threaded chr.1
0,2171,mid-flight 1,1,4.0,3.0,18.0,31.0,0.0


Unnamed: 0,astro id.2,flight status.5,Cell Number.7,Image File Number.8,# of Fragile Telos.2,# of STL-complete.2,# of STL-hetero.2,# of potential threaded chr.2
0,2171.0,mid-flight 2,1.0,2.0,1.0,18.0,32.0,0.0


Unnamed: 0,astro id.3,flight status.6,Cell Number.9,Image File Number.10,# of Fragile Telos.4,# of STL-complete.4,# of STL-hetero.4,# of potential threaded chr.4
0,2171.0,post-flight,1.0,1.0,2.0,15.0,10.0,0.0


### Renaming columns to avoid formatting errors

In [29]:
# pandas de-duplicated the repeated headers on load (suffixes such as '.1',
# '.3'), so every timepoint frame gets the same clean set of labels before
# the frames are stacked.
# NOTE(review): the last column is loaded as '#  of potential threaded chr'
# but is relabelled '# of sat associations' here — confirm this is intended.

telo_aberr_cols = [
    'astro id',
    'flight status',
    'Cell Number',
    'Image File Number',
    '# of Fragile Telos',
    '# of STL-complete',
    '# of STL-hetero',
    '# of sat associations',
]

for telo_frame in (pre_f_telo_aberr, mid_f1_telo_aberr,
                   mid_f2_telo_aberr, post_f_telo_aberr):
    telo_frame.columns = telo_aberr_cols

### Combining pre-flight, midflight1&2, post-flight telomeric aberr data into one dataframe

In [30]:
# Stack the four timepoints row-wise with a fresh 0..n-1 index, then drop the
# per-cell bookkeeping columns that are not needed downstream.
# NOTE(review): unlike the chromosome rearrangement data, no dropna() is
# applied here, so any empty rows in the slices are kept — confirm intended.
all_astro_telo_aberr = (
    pd.concat(
        [pre_f_telo_aberr, mid_f1_telo_aberr, mid_f2_telo_aberr, post_f_telo_aberr],
        axis=0,
        ignore_index=True,
    )
    .drop(columns=['Cell Number', 'Image File Number'])
)

print(all_astro_telo_aberr.shape)
all_astro_telo_aberr.head(4)

(360, 6)


Unnamed: 0,astro id,flight status,# of Fragile Telos,# of STL-complete,# of STL-hetero,# of sat associations
0,2171,pre-flight,1,10,15,0
1,2171,pre-flight,1,13,15,0
2,2171,pre-flight,0,13,9,0
3,2171,pre-flight,2,6,12,0


### Saving all telomeric aberration data to a csv file for later retrieval

In [31]:
# write the combined telomeric aberration table to csv, without the row index
all_astro_telo_aberr.to_csv(
    path_or_buf='../excel data/All_astronauts_telomeric_aberrations.csv',
    index=False,
)

### Making a tidy-data formatted dataframe for telomeric aberration data

In [32]:
# Reshape wide -> long: one row per (astro id, flight status, aberration type),
# with the former column name in 'aberration type' and its value in
# 'count per cell'.
melt_all_astro_telo_aberr = all_astro_telo_aberr.melt(
    id_vars=['astro id', 'flight status'],
    var_name='aberration type',
    value_name='count per cell',
)

melt_all_astro_telo_aberr.head(4)

Unnamed: 0,astro id,flight status,aberration type,count per cell
0,2171,pre-flight,# of Fragile Telos,1
1,2171,pre-flight,# of Fragile Telos,1
2,2171,pre-flight,# of Fragile Telos,0
3,2171,pre-flight,# of Fragile Telos,2


### Saving telomeric aberration data in tidy-data format to a csv file for later retrieval

In [33]:
# write the tidy-format telomeric aberration table to csv, without the row index
melt_all_astro_telo_aberr.to_csv(
    path_or_buf='../excel data/melt_all_astro_telo_aberr.csv',
    index=False,
)

### Parsing pre-flight, midflight1&2, and post-flight chromosome rearrangement data

In [34]:
# Chromosome rearrangement scores sit in 14-column-wide blocks of the combined
# sheet; each timepoint has its own row window and starting column.
# NOTE(review): mid-flight 1 starts at row 35 while the other timepoints start
# at row 0 — confirm this window against the workbook layout.

pre_f, mid_f_1, mid_f_2, post_f = (
    nasa_chr_data.iloc[first_row:stop_row, first_col:first_col + 14]
    for first_row, stop_row, first_col in (
        (0, 404, 1),     # pre-flight
        (35, 109, 25),   # mid-flight 1
        (0, 111, 49),    # mid-flight 2
        (0, 400, 96),    # post-flight
    )
)

In [35]:
# Give every timepoint frame identical column labels so that concatenation
# lines the columns up instead of producing mismatched ones.

chr_aberr_cols = [
    'astro id',
    'flight status',
    'cell number',
    'image file number',
    'dicentrics',
    'translocations',
    'inversions',
    'terminal inversions',
    'terminal SCEs paint cis',
    'terminal SCEs dark cis',
    'subtelo SCEs',
    'sister chromatid exchanges',
    'insertions',
    'satellite associations',
]

for chr_frame in (pre_f, mid_f_1, mid_f_2, post_f):
    chr_frame.columns = chr_aberr_cols

### Combining chromosome rearrangement data

In [36]:
# Stack the four timepoints row-wise, discard columns not carried forward,
# remove every row that contains a missing value, and renumber the index.
all_astro_chr_aberr = (
    pd.concat([pre_f, mid_f_1, mid_f_2, post_f], axis=0, ignore_index=True)
    .drop(columns=['cell number', 'image file number', 'insertions'])
    .dropna()
    .reset_index(drop=True)
)
print(all_astro_chr_aberr.columns)
all_astro_chr_aberr.head(4)

Index(['astro id', 'flight status', 'dicentrics', 'translocations',
       'inversions', 'terminal inversions', 'terminal SCEs paint cis',
       'terminal SCEs dark cis', 'subtelo SCEs', 'sister chromatid exchanges',
       'satellite associations'],
      dtype='object')


Unnamed: 0,astro id,flight status,dicentrics,translocations,inversions,terminal inversions,terminal SCEs paint cis,terminal SCEs dark cis,subtelo SCEs,sister chromatid exchanges,satellite associations
0,2171,pre-flight,0,0,0,0,2,0,1,1,0
1,2171,pre-flight,0,0,0,0,0,0,0,0,0
2,2171,pre-flight,0,1,0,0,1,0,0,0,0
3,2171,pre-flight,0,0,0,0,1,0,1,0,0


### Generating total inversions & terminal SCEs columns

In [37]:
# Per-row totals: terminal + non-terminal inversions, and the two terminal SCE
# categories (paint cis + dark cis).
all_astro_chr_aberr['total inversions'] = (
    all_astro_chr_aberr['terminal inversions'].add(all_astro_chr_aberr['inversions'])
)
all_astro_chr_aberr['terminal SCEs'] = (
    all_astro_chr_aberr['terminal SCEs paint cis'].add(all_astro_chr_aberr['terminal SCEs dark cis'])
)

### Saving all astronauts chr aberr to csv for later retrieval 

In [38]:
# write the combined chromosome rearrangement table to csv, without the row index
all_astro_chr_aberr.to_csv(
    path_or_buf='../excel data/all_astro_chr_aberr.csv',
    index=False,
)

### Generating tidy-data formatted dataframe for all astronaut chromosome rearrangement data 

In [39]:
# The component columns are already summed into 'total inversions' and
# 'terminal SCEs', so leave them out of the tidy table.
copy_all_astro_chr_aberr = all_astro_chr_aberr.drop(
    columns=[
        'terminal inversions',
        'inversions',
        'terminal SCEs paint cis',
        'terminal SCEs dark cis',
    ]
)

# Reshape wide -> long: one row per (astro id, flight status, aberration type).
melt_all_astro_chr_aberr = copy_all_astro_chr_aberr.melt(
    id_vars=['astro id', 'flight status'],
    var_name='aberration type',
    value_name='count per cell',
)

melt_all_astro_chr_aberr.head(4)

Unnamed: 0,astro id,flight status,aberration type,count per cell
0,2171,pre-flight,dicentrics,0
1,2171,pre-flight,dicentrics,0
2,2171,pre-flight,dicentrics,0
3,2171,pre-flight,dicentrics,0


### Saving all astronaut chromosome rearrangement data in tidy-data format for later retrieval

In [40]:
# write the tidy-format chromosome rearrangement table to csv, without the row index
melt_all_astro_chr_aberr.to_csv(
    path_or_buf='../excel data/All_astronauts_chromosome_aberration_data_tidy_data.csv',
    index=False,
)