In [2]:
# for grabbing/handling data
import numpy as np
import pandas as pd
import fukushima_telomere_methods as f_tm

# incase needing to reload modules
import importlib
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

---
&nbsp; 

The Fukushima Project seeks to determine whether regions contaminated by radiation from the Fukushima Daiichi Nuclear Disaster can be considered safe for human repopulation. Here, we use free-roaming >200 wild boar trapped from the contaminated regions as mammalian proxies for humans. On the boar we've collected >10 types of data from chemistry, physics (radiation), and molecular biology techniques to ascertain whether boar are negatively impacted by radiation in terms of various biometrics. We have data on snake as well.

The data was generated over >4 years and exists as tabular data in various formats. Our first aim is to read in all of the data, then clean/reformat it to enable combination of the various datasets. We've focused on standardizing the format for boar sample identifications (Sample ID) to enable combination of the data.

&nbsp; 

---

## Aryn's Data (qPCR)
---

### Reading Aryn's snake data to dataframe

In [3]:
aryn_snake_df = pd.read_excel('../excel data/raw/boar snake master template.xlsx', sheet_name=1)
aryn_snake_df.head(4)

Unnamed: 0,Sample ID,Concentration (ng/mcL),260/280,Sex,External Dose (uSv),Internal Dose (uSv),Telomere Length
0,CEC10,19.3,1.96,F,0.1,,0.444298
1,CEC18,63.7,1.83,F,0.16,,0.82619
2,CEC22,45.5,1.88,F,0.17,,2.067214
3,CEC23,32.6,1.83,F,0.2,,1.389182


In [17]:
import re

def natural_key(string_):
    return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_) if s]

In [18]:
list_snake_ID = list(aryn_snake_df['Sample ID'].unique())

In [20]:
# sorted(list_snake_ID, key=natural_key)

In [33]:
# making column for radiation exposure status and encoded dummy variables from the new column

aryn_snake_df = f_tm.readable_snake_df_dummy_variables(aryn_snake_df)

### Saving Aryn's Snake dataframe for later retrieval

In [34]:
aryn_snake_df.to_csv('../excel data/cleaned/aryn_snake_df.csv', index=False)

### Reading Aryn's Boar data to dataframe

In [6]:
aryn_boar_df = pd.read_excel('../excel data/raw/boar snake master template.xlsx', sheet_name=0)
aryn_boar_df.head(4)

Unnamed: 0,Exposure Status,Sample ID,Sex,Age,Age Class,Estimated Age (months),External Dose (uSv),Internal Dose (uSv),qPCR ID,Concentration (ng/mcL),260/280,Normalized T/A Average (Average Telomere Length),Cortisol,Dicentrics (TC-FISH),PH anomalies
0,Control,180526C3,M,,Yearling,15,0.38,,C1,14.5,1.9,0.910559,,,
1,Control,180528C2,F,,Juv,8,-,,C2,17.3,1.73,0.985085,,,
2,Control,180604C2,F,,Yearling,18,0.46,,C3,15.8,1.9,1.11798,,,
3,Control,180612C1,M,,Yearling,15,0.27,,C4,13.1,1.6,1.26004,,,


In [12]:
list_boar_ID = list(aryn_boar_df['Sample ID'].unique())

In [15]:
# sorted(list_boar_ID, key=natural_key)

### Cleaning Aryn's boar dataframe

In [36]:
# changing errant values to np.NaN
aryn_boar_df = aryn_boar_df.replace('-', np.NaN)

# encoding exposure status as dummy values
dummies = pd.get_dummies(aryn_boar_df['Exposure Status'], prefix='Encoded', drop_first=True)
aryn_boar_df['Encoded Exposed'] = dummies['Encoded_Exposed']

# cleaning Sample ID of dashes & spaces, + 'GPS' & 'collar' terms
aryn_boar_df['Sample ID'] = aryn_boar_df['Sample ID'].apply(lambda row: f_tm.remove_dashes_space_sampleIDs(row))

In [37]:
aryn_boar_df.to_csv('../excel data/cleaned/aryn_boar_df.csv', index=False)

## Cortisol Data
---

### Reading the boar cortisol data

In [38]:
boar_cortisol_df = pd.read_excel('../excel data/raw/boar cortisol.xlsx')
boar_cortisol_df.head(4)

Unnamed: 0,Biel #,Date Received,Sample Number (electronic sheet),Sample ID,Hair wt (mg),Cortisol (pg/mg),Re-run,MeOH Color,pH optimal for assay,Hair Type,Hair Length (cm),Color,Comments,"Worked on sample, but not able to process",Hair Took Extra Time,Repeats
0,1,43518,160804-1,160804-1,17.19,22.990111,N,N,normal,"course, short",3.81,black scalp and tip ends with tan middle,0,0,0,0
1,2,43518,161215 T-1,161215 T-1,25.28,29.12,N,N,normal,course with split ends,5.02,black with tan tips and scalp end,0,0,0,0
2,3,43518,161215 T-2,161215 T-2,15.85,27.255521,N,N,normal,course with split ends,5.88,black with brown tips,0,0,0,0
3,4,43518,161130-1 O-210 collar,161130-1 O-210 collar,22.84,17.12,N,N,normal,course with slightly split ends,7.02,black with brown tips,0,0,0,0


### Prep boar cortisol data for merging with other dataframes on Sample ID

In [39]:
# grabbing Sample IDs column in new dataframe, preserving index
prep_boar_cortisol_df = pd.DataFrame(boar_cortisol_df['Sample ID'])

# grabbing the Cortisol measurements & Sample Number (electronic sheet)
prep_boar_cortisol_df['Sample Number (electronic sheet)'] = boar_cortisol_df['Sample Number (electronic sheet)']
prep_boar_cortisol_df['Cortisol (pg/mg)'] = boar_cortisol_df['Cortisol (pg/mg)']

prep_boar_cortisol_df['Sample ID'] = prep_boar_cortisol_df['Sample ID'].astype('str')

# removing dashes/spaces & terms 'GPS' + 'collar' from sample IDs
prep_boar_cortisol_df['Sample ID'] = prep_boar_cortisol_df['Sample ID'].apply(lambda row: f_tm.remove_dashes_space_sampleIDs(row))

prep_boar_cortisol_df.head(4)

Unnamed: 0,Sample ID,Sample Number (electronic sheet),Cortisol (pg/mg)
0,1608041,160804-1,22.990111
1,161215T1,161215 T-1,29.12
2,161215T2,161215 T-2,27.255521
3,1611301O210,161130-1 O-210 collar,17.12


### Saving prepped cortisol data for later retrieval

In [40]:
prep_boar_cortisol_df.to_csv('../excel data/cleaned/prep_boar_cortisol_df.csv', index=False)

## Kelly's Data (teloFISH, dicentrics)
---

### Extracting Kelly's boar telomere FISH data

In [13]:
# boar_teloFISH_list = f_tm.extract_boar_teloFISH_as_list('../excel data/raw/')

### Formatting telomere FISH data

In [35]:
# making a dataframe from the list
kelly_boar_teloFISH_df = pd.DataFrame(boar_teloFISH_list, columns=['Sample ID', 'teloFISH data', 'teloFISH means'])

# turning the teloFISH data, which is a series in each row containing 4800 individial telomere lengths, 
# into a list in each row so that it's saved & loaded properly
kelly_boar_teloFISH_df['teloFISH data'] = kelly_boar_teloFISH_df['teloFISH data'].apply(lambda row: row.tolist())
kelly_boar_teloFISH_df['Sample ID'] = kelly_boar_teloFISH_df['Sample ID'].apply(lambda row: f_tm.remove_dashes_space_sampleIDs(row))

kelly_boar_teloFISH_df.head(4)

NameError: name 'boar_teloFISH_list' is not defined

### Saving kelly boar teloFISH dataframe for later retrieval

In [None]:
kelly_boar_teloFISH_df.to_csv('../excel data/cleaned/kelly_boar_teloFISH_df.csv', index=False)

### Loading Kelly boar dicentric data

In [15]:
kelly_boar_dicentrics = pd.read_excel('../excel data/raw/Fukushima Dicentrics.xlsx')

In [16]:
kelly_boar_dicentrics.rename(columns={'Pig ID': 'Sample ID'}, inplace=True)
kelly_boar_dicentrics['Sample ID'] = kelly_boar_dicentrics['Sample ID'].apply(lambda row: f_tm.remove_dashes_space_sampleIDs(row))
kelly_boar_dicentrics['Average Dicentrics per cell'] = kelly_boar_dicentrics['Dicentrics']/kelly_boar_dicentrics['Total Scored']                                       
kelly_boar_dicentrics.drop(['Dicentric Images', 'Notes', 'Dicentrics', 'Total Scored'], axis=1, inplace=True)

kelly_boar_dicentrics.head(4)

Unnamed: 0,Sample ID,Average Dicentrics per cell
0,1607261,0.01
1,1608012,0.03
2,1606101,0.04
3,1611261O205,0.01087


### Saving kelly boar dicentrics dataframe for later retrieval

In [17]:
kelly_boar_dicentrics.to_csv('../excel data/cleaned/kelly_boar_dicentrics_df.csv', index=False)

### Loading New Radiation Exposure Calculations for Kelly Boar Data 

In [53]:
kelly_boar_teloFISH_df = pd.read_csv('../excel data/cleaned/kelly_boar_teloFISH_df.csv')

In [54]:
kelly_new_exposure_df = pd.read_excel("../excel data/raw/jared_editsKelly's boar_1Dec2018-edited1Aug19 (1).xlsx")

In [55]:
cols_to_merge = ['ID', 'Sex', 'Age (weeks)', 
# 'Place name', 'Area', 'Lat.', 'Lon.',
'INTERNAL uGy/h Cs-134', 'INTERNAL uGy/h Cs-137',
'INTERNAL Life time dose,   mGy 134',
'(from wolfram) Build-up, corrected, Integrated INTERNAL LIFE-TIME dose mGy  Cs134',
'INTERNAL Life time dose, mGy 137',
'INTERNAL Life time dose, mGy 134+137',
'EXTERNAL Dose Reasonable Life Time, mGy (134+137)',
'% of INTERNAL to EXTERNAL dose',
'EXTERNAL Dose Maximum Life Time, mGy (134+137)',
'Reasonable Total Life Time Dose mGy (Int+Ext,134+137)',
'Maximum Total Life Time Dose mGy (Int+Ext,134+137)',
'Dose rate (uGy/h) at time of capture (Int+Ext; 134+137)']

print(len(cols_to_merge) - 3)

12


In [56]:
trim_kelly_new_exposure_df = pd.DataFrame(kelly_new_exposure_df[cols_to_merge])

In [57]:
(trim_kelly_new_exposure_df.rename(columns={
'ID':'Sample ID', 
'(from wolfram) Build-up, corrected, Integrated INTERNAL LIFE-TIME dose mGy  Cs134':'Integrated INTERNAL LIFE-TIME dose mGy  Cs134',
'INTERNAL Life time dose, mGy 134+137': 'INTERNAL Life time dose, mGy Cs 134+137',
'INTERNAL Life time dose,   mGy 134': 'INTERNAL Life time dose, mGy 134',
'EXTERNAL Dose Reasonable Life Time, mGy (134+137)':'EXTERNAL Dose Reasonable Life Time, mGy Cs 134+137',
'Reasonable Total Life Time Dose mGy (Int+Ext,134+137)': 'Reasonable Total Life Time Dose mGy (Int+Ext, Cs 134+137)'}, 
inplace=True))

In [58]:
trim_kelly_new_exposure_df['Sample ID'] = trim_kelly_new_exposure_df['Sample ID'].apply(lambda row: f_tm.remove_dashes_space_sampleIDs(row))
trim_kelly_new_exposure_df.head(4)

Unnamed: 0,Sample ID,Sex,Age (weeks),INTERNAL uGy/h Cs-134,INTERNAL uGy/h Cs-137,"INTERNAL Life time dose, mGy 134",Integrated INTERNAL LIFE-TIME dose mGy Cs134,"INTERNAL Life time dose, mGy 137","INTERNAL Life time dose, mGy Cs 134+137","EXTERNAL Dose Reasonable Life Time, mGy Cs 134+137",% of INTERNAL to EXTERNAL dose,"EXTERNAL Dose Maximum Life Time, mGy (134+137)","Reasonable Total Life Time Dose mGy (Int+Ext, Cs 134+137)","Maximum Total Life Time Dose mGy (Int+Ext,134+137)",Dose rate (uGy/h) at time of capture (Int+Ext; 134+137)
0,1606101,Male,33-39,0.0991863,0.307031,0.599879,0.672739,1.856921,2.52966,57.456,4.40278,57.456,59.98566,59.98566,7.406217
1,1606102,Female,26,0.171091,0.568404,0.747327,0.812612,2.48279,3.295402,61.152,5.38887,82.992,64.447402,86.287402,3.739496
2,1606103,Male,27-31,0.151089,0.494718,0.736103,0.808553,2.410267,3.21882,68.208,4.71912,92.568,71.42682,95.78682,3.645807
3,1607205,Male,6-9,0.00951251,0.028632,0.0119858,0.012258,0.036077,0.048335,0.882,5.48015,1.26,0.930335,1.308335,0.738145


### Saving Kelly boar new exposure data for later retrieval

In [59]:
## cleaning age (weeks) col
trim_kelly_new_exposure_df.to_csv('../excel data/cleaned/trim_kelly_new_exposure_df.csv', index=False)

# trim_kelly_new_exposure_df2 = trim_kelly_new_exposure_df
# trim_kelly_new_exposure_df2.to_excel('../excel data/cleaned/trim_kelly_new_exposure_df.xlsx', index=False)

### Saving Dicentrics + exposure merge

In [46]:
kelly_dicentrics_exposure_df = trim_kelly_new_exposure_df.merge(kelly_boar_dicentrics, on=['Sample ID'])
kelly_dicentrics_exposure_df.to_csv("../excel data/graphing/kelly_dicentrics_exposure_df.csv", index=False)

### Briefly looking at sharing samples between kelly's boar w/ teloFISH data vs. the exposure data & cortisol data

In [48]:
df_list = [trim_kelly_new_exposure_df, prep_boar_cortisol_df]

for df in df_list:
    f_tm.count_shared_sample_IDs(kelly_boar_teloFISH_df, df, print_names='no')

The number of sample IDs in common are: 38
The number of sample IDs in common are: 14


In [49]:
kelly_teloFISH_IDs = list(kelly_boar_teloFISH_df['Sample ID'].unique())
kelly_exposure_IDs = list(trim_kelly_new_exposure_df['Sample ID'].unique())

a = kelly_teloFISH_IDs
b = kelly_exposure_IDs

mismatched = [sample_ID for sample_ID in a if sample_ID not in b]
print(mismatched)

['161209O239', '161209O240', '161209O242']


## Josh's Data (PPHA)

In [114]:
josh_sex_data = pd.read_excel('../excel data/raw/(Sam_Mag) Fukushima 2017 Data (1).xlsx', sheet_name=0)
josh_sex_data = josh_sex_data.iloc[0:39, [0, 12]]

josh_sex_data.rename(columns={'ID NUMBER': 'Sample ID'}, inplace=True)
josh_sex_data['Sample ID'] = josh_sex_data['Sample ID'].astype('str')
josh_sex_data['Sample ID'] = josh_sex_data['Sample ID'].apply(lambda row: f_tm.remove_dashes_space_sampleIDs(row))

In [115]:
josh_ppha_df = pd.read_excel('../excel data/raw/PPHA concentration (1).xlsx')

In [116]:
trimmed_josh_age_data = josh_ppha_df.iloc[0:39, [0, 3]]
trimmed_josh_age_data.rename(columns={'ID NUMBER': 'Sample ID'}, inplace=True)
trimmed_josh_age_data['Sample ID'] = trimmed_josh_age_data['Sample ID'].astype('str')
trimmed_josh_age_data['Sample ID'] = trimmed_josh_age_data['Sample ID'].apply(lambda row: f_tm.remove_dashes_space_sampleIDs(row))

In [117]:
trimmed_josh_ppha_df = josh_ppha_df.iloc[0:38, 9:15]
trimmed_josh_ppha_df.rename(columns={'Boar ID':'Sample ID'}, inplace=True)
trimmed_josh_ppha_df['Sample ID'] = trimmed_josh_ppha_df['Sample ID'].astype('str')
trimmed_josh_ppha_df['Sample ID'] = trimmed_josh_ppha_df['Sample ID'].apply(lambda row: f_tm.remove_dashes_space_sampleIDs(row))

In [118]:
merge_josh_ppha_df = trimmed_josh_age_data.merge(trimmed_josh_ppha_df, on='Sample ID')
merge_josh_ppha_df.loc[1, 'Age (weeks)'] = 220
merge_josh_ppha_df.loc[3, 'Age (weeks)'] = 220
merge_josh_ppha_df.drop(13, axis=0, inplace=True)

In [119]:
merge_josh_ppha_df['Age (weeks)'] = merge_josh_ppha_df['Age (weeks)'].apply(lambda row: f_tm.average_age_weeks(row))

In [121]:
merge_josh_age_sex_ppha_df = merge_josh_ppha_df.merge(josh_sex_data, on='Sample ID')
# merge_josh_age_sex_ppha_df 

In [123]:
merge_josh_ppha_df.to_csv('../excel data/cleaned/merge_josh_ppha_df.csv', index=False)
merge_josh_age_sex_ppha_df.to_csv('../excel data/cleaned/merge_josh_age_sex_ppha_df.csv', index=False)

### Merging with cortisol data

In [33]:
josh_ppha_df_cortisol = trimmed_josh_ppha_df.merge(prep_boar_cortisol_df, on=['Sample ID'])

NameError: name 'prep_boar_cortisol_df' is not defined

### Saving to file

In [54]:
josh_ppha_df_cortisol.to_csv('../excel data/cleaned/josh_ppha_df_cortisol.csv', index=False)