In [187]:
# for grabbing/handling data
import numpy as np
import pandas as pd
import fukushima_telomere_methods as f_tm

# in case modules need to be reloaded
import importlib
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


---
&nbsp; 

The Fukushima Project seeks to determine whether regions contaminated by radiation from the Fukushima Daiichi Nuclear Disaster can be considered safe for human repopulation. Here, we use >200 free-roaming wild boar trapped in the contaminated regions as mammalian proxies for humans. On the boar, we've collected >10 types of data using chemistry, physics (radiation), and molecular biology techniques to ascertain whether boar are negatively impacted by radiation in terms of various biometrics. We have data on snakes as well.

The data was generated over >4 years and exists as tabular data in various formats. Our first aim is to read in all of the data, then clean/reformat it to enable combination of the various datasets. We've focused on standardizing the format for boar sample identifications (Sample ID) to enable combination of the data.

&nbsp; 

---

# Aryn's Data (qPCR)
---

## Reading Aryn's snake data to dataframe

In [233]:
# Read the snake dose sheet (skiprows=4 skips the first four spreadsheet rows)
# and remove the rows labelled 20-23.
# NOTE(review): rows 20-23 are presumably non-sample trailer rows -- confirm against the workbook.
snake_df = (
    pd.read_excel('../excel data/raw/snake dose data-15Nov19 (2).xlsx', skiprows=4)
    .drop(index=[20, 21, 22, 23])
)
snake_df.head()

Unnamed: 0,Sample ID,DNA Concentration (ng/mcL),260/280,Sex,External Dose Rate (uSv/h),Telomere Length (qPCR),Int + Ext dose rate (uGy/h),134Cs,137Cs,134/137,134Cs int dose rate,137Cs int dose rate,Total int dose rate,134Cs ext dose rate,137Cs ext dose rate,Total ext dose rate,Total dose rates μGy h-1
0,CEC10,19.3,1.96,F,0.1,0.444298,0.3,56.0,509.0,0.11002,0.01,0.08,0.09,0.05,0.17,0.22,0.3
1,CEC18,63.7,1.83,F,0.16,0.82619,0.11,17.0,185.0,0.091892,0.0,0.03,0.03,0.02,0.06,0.08,0.11
2,CEC22,45.5,1.88,F,0.17,2.06721,0.03,6.0,55.0,0.109091,0.0,0.01,0.01,0.01,0.02,0.02,0.03
3,CEC23,32.6,1.83,F,0.2,1.38918,0.18,34.0,304.0,0.111842,0.0,0.05,0.05,0.03,0.1,0.13,0.18
4,CEC5,23.8,1.81,F,0.13,0.710565,0.09,17.0,157.0,0.10828,0.0,0.02,0.03,0.02,0.05,0.07,0.09


In [34]:
# import re

# def natural_key(string_):
#     return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_) if s]

# list_snake_ID = list(snake_df['Sample ID'].unique())
# sorted(list_snake_ID, key=natural_key)

# # making column for radiation exposure status and encoded dummy variables from the new column
# aryn_snake_df = f_tm.readable_snake_df_dummy_variables(aryn_snake_df)

## Saving Aryn's Snake dataframe for later retrieval

In [234]:
# Write the snake table to the cleaned-data folder; index=False leaves the row index out of the file.
snake_df.to_csv(
    path_or_buf='../excel data/cleaned/aryn_snake_df.csv',
    index=False,
)

## Reading Aryn's Boar data to dataframe

In [219]:
# Read the first sheet of the shared Kelly/Aryn boar workbook, skipping its first
# 71 rows, then remove the rows labelled 46 and 47.
# NOTE(review): presumably Aryn's table starts after row 71 and rows 46-47 are
# trailing non-sample rows -- confirm against the workbook.
aryn_boar_dose = pd.read_excel(
    '../excel data/raw/Kelly& Aryn boar_1Dec2018-edited-TH-17Nov19 (2).xlsx',
    sheet_name=0,
    skiprows=71,
).drop(index=[46, 47])
aryn_boar_dose.head(3)

Unnamed: 0.1,Unnamed: 0,Sample ID,Unnamed: 2,Unnamed: 3,Long.,Lat.,uSv/h,Unnamed: 7,max in 1000m,Unnamed: 9,...,"(size-specific) Wolfram Build-up corrected, Integrated INTERNAL LIFE-TIME dose mGy Cs134","(size-specific) INTERNAL Life time dose, mGy Cs137","INTERNAL Life time dose, mGy 134+137","EXTERNAL Dose Reasonable Life Time, mGy (134+137)",% of INTERNAL to EXTERNAL dose,"EXTERNAL Dose Maximum Life Time, mGy (134+137)","Reasonable Total Life Time Dose mGy (Int+Ext,134+137)","Maximum Total Life Time Dose mGy (Int+Ext,134+137)",Dose rate (uGy/h) at time of capture (Int+Ext; 134+137),Unnamed: 38
0,,180526-C1,Nihonmatsu,japan,37.58291,140.59495,0.27,,0.5-1.0,,...,0.177,1.25594,1.43294,4.32,33.17,,5.75294,,0.515736,
1,,180526-C2,Nihonmatsu,japan,37.58567,140.59799,0.31,,0.5-1.0,,...,0.198,1.17823,1.37623,4.32,31.8571,,5.69623,,0.54548,
2,,180526-C3,Nihonmatsu,japan,37.64065,140.54304,0.38,,0.2-0.5,,...,0.749,3.22797,3.97697,3.78,105.211,,7.75697,,0.734958,


In [220]:
# Age in whole months from the 'hours' column: one month is taken as 24 * 30 hours,
# and int() truncates the quotient.
aryn_boar_dose['Age (months)'] = aryn_boar_dose['hours'].apply(
    lambda age_hours: int(age_hours / (24 * 30))
)

# Re-code the 'Sex' column value by value with the project helper.
aryn_boar_dose['Sex'] = aryn_boar_dose['Sex'].apply(
    lambda sex_value: f_tm.male_or_female(sex_value)
)

In [209]:
# Dropping the 'Unnamed: N' placeholder columns and other columns not needed downstream.
# The frame is re-bound instead of mutated with inplace=True, and errors='ignore'
# skips labels that are already gone, so re-running this cell no longer raises
# KeyError after the columns have been removed once.
aryn_boar_dose = aryn_boar_dose.drop(
    columns=[
        'Unnamed: 0', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 7',
        'Unnamed: 9', 'Unnamed: 11', 'Unnamed: 38', 'Cs-134 Bq/kg (dry)',
        'Cs-137 Bq/kg (dry)', 'EXTERNAL Dose Maximum Life Time, mGy (134+137)',
        'Maximum Total Life Time Dose mGy (Int+Ext,134+137)',
        'hours', 'Age', 'tissue', 'dry wt.',
    ],
    errors='ignore',
)

In [210]:
# Read the first sheet of the boar/snake master template into the qPCR frame.
aryn_boar_qPCR = pd.read_excel(
    io='../excel data/raw/boar snake master template.xlsx',
    sheet_name=0,
)
aryn_boar_qPCR.head(4)

Unnamed: 0,Exposure Status,Sample ID,Sex,Age,Age Class,Age (months),External Dose (uSv),Internal Dose (uSv),qPCR ID,Concentration (ng/mcL),260/280,Normalized T/A Average (Average Telomere Length),Cortisol,Dicentrics (TC-FISH),PH anomalies
0,Control,180526C3,M,,Yearling,15,0.38,,C1,14.5,1.9,0.910559,,,
1,Control,180528C2,F,,Juv,8,-,,C2,17.3,1.73,0.985085,,,
2,Control,180604C2,F,,Yearling,18,0.46,,C3,15.8,1.9,1.11798,,,
3,Control,180612C1,M,,Yearling,15,0.27,,C4,13.1,1.6,1.26004,,,


In [211]:
# list_boar_ID = list(aryn_boar_df['Sample ID'].unique())
# sorted(list_boar_ID, key=natural_key)

## Cleaning Aryn's boar dose/qPCR dataframes

In [224]:
# changing errant '-' placeholder values to NaN across the whole qPCR frame
# (np.nan is used because the np.NaN alias was removed in NumPy 2.0)
aryn_boar_qPCR = aryn_boar_qPCR.replace('-', np.nan)

# standardizing Sample IDs in both frames so they can be merged on that column:
# each ID goes through f_tm.remove_dashes_space_sampleIDs (per the original note it
# strips dashes, spaces and the 'GPS' / 'collar' terms) and is then cast to str
aryn_boar_qPCR['Sample ID'] = (
    aryn_boar_qPCR['Sample ID']
    .apply(lambda sample_id: f_tm.remove_dashes_space_sampleIDs(sample_id))
    .astype('str')
)

aryn_boar_dose['Sample ID'] = (
    aryn_boar_dose['Sample ID']
    .apply(lambda sample_id: f_tm.remove_dashes_space_sampleIDs(sample_id))
    .astype('str')
)

## Saving Aryn's boar dose/qPCR data for later retrieval

In [225]:
# Write both of Aryn's boar frames to the cleaned-data folder without the row index.
aryn_boar_dose.to_csv(
    path_or_buf='../excel data/cleaned/aryn_boar_dose.csv',
    index=False,
)
aryn_boar_qPCR.to_csv(
    path_or_buf='../excel data/cleaned/aryn_boar_qPCR.csv',
    index=False,
)

# Cortisol Data
---

## Reading the boar cortisol data

In [227]:
# Read the boar cortisol workbook and remove the row labelled 47.
# NOTE(review): row 47 is presumably a trailing non-sample row -- confirm against the workbook.
boar_cortisol_df = (
    pd.read_excel('../excel data/raw/boar cortisol.xlsx')
    .drop(index=[47])
)
boar_cortisol_df.head()

Unnamed: 0,Biel #,Date Received,Sample Number (electronic sheet),Sample ID,Hair wt (mg),Cortisol (pg/mg),Re-run,MeOH Color,pH optimal for assay,Hair Type,Hair Length (cm),Color,Comments,"Worked on sample, but not able to process",Hair Took Extra Time,Repeats
0,1,43518,160804-1,160804-1,17.19,22.990111,N,N,normal,"course, short",3.81,black scalp and tip ends with tan middle,0,0,0,0
1,2,43518,161215 T-1,161215 T-1,25.28,29.12,N,N,normal,course with split ends,5.02,black with tan tips and scalp end,0,0,0,0
2,3,43518,161215 T-2,161215 T-2,15.85,27.255521,N,N,normal,course with split ends,5.88,black with brown tips,0,0,0,0
3,4,43518,161130-1 O-210 collar,161130-1 O-210 collar,22.84,17.12,N,N,normal,course with slightly split ends,7.02,black with brown tips,0,0,0,0
4,5,43518,160803-1,160808-1,13.06,5.635528,N,N,normal,"course, short, slightly split ends",2.48,light brown,0,0,0,0


## Prep boar cortisol data for merging with other dataframes on Sample ID

In [228]:
# Build a new three-column frame (original index kept) and standardize its Sample IDs:
# cast to str, pass each ID through f_tm.remove_dashes_space_sampleIDs (per the
# original note it strips dashes/spaces and the 'GPS' + 'collar' terms), cast to str again.
prep_boar_cortisol_df = boar_cortisol_df[
    ['Sample ID', 'Sample Number (electronic sheet)', 'Cortisol (pg/mg)']
].assign(**{
    'Sample ID': lambda frame: (
        frame['Sample ID']
        .astype('str')
        .apply(lambda sample_id: f_tm.remove_dashes_space_sampleIDs(sample_id))
        .astype('str')
    ),
})

## Saving prepped cortisol data for later retrieval

In [229]:
prep_boar_cortisol_df.to_csv('../excel data/cleaned/prep_boar_cortisol_df.csv', index=False)

# Kelly's Data (teloFISH, dicentrics)
---

## Extracting Kelly's boar telomere FISH data

In [137]:
# Collect the boar teloFISH data from the raw-data folder via the project helper.
# The recorded output shows one "Handling <file>.xlsx..." line per "Fukushima Hyb"
# workbook; the result is later turned into a frame with the columns
# 'Sample ID', 'teloFISH data' and 'teloFISH means'.
# NOTE(review): the exact shape of each list entry is defined in f_tm -- confirm there.
boar_teloFISH_list = f_tm.extract_boar_teloFISH_as_list('../excel data/raw/')

Handling Telomeres - Fukushima Hyb 4 complete.xlsx...
Handling Telomeres - Fukushima Hyb 3 complete.xlsx...
Handling Telomeres - Fukushima Hyb 6 completed.xlsx...
Handling Meredith -- Telomeres - Fukushima Hyb 2.xlsx...
Handling Telomeres - Fukushima Hyb 1 complete.xlsx...
Handling Telomeres - Fukushima Hyb 5 complete.xlsx...
Finished collecting boar teloFISH data


## Formatting telomere FISH data

In [138]:
# Build the teloFISH frame from the extracted list, then in one pass:
#  - convert each row's 'teloFISH data' (a series of individual telomere lengths,
#    4800 per the original note) to a plain list so it is written out as a list literal
#  - standardize 'Sample ID' with the project helper
kelly_boar_teloFISH_df = pd.DataFrame(
    boar_teloFISH_list,
    columns=['Sample ID', 'teloFISH data', 'teloFISH means'],
).assign(**{
    'teloFISH data': lambda frame: frame['teloFISH data'].apply(
        lambda telomere_lengths: telomere_lengths.tolist()
    ),
    'Sample ID': lambda frame: frame['Sample ID'].apply(
        lambda sample_id: f_tm.remove_dashes_space_sampleIDs(sample_id)
    ),
})

kelly_boar_teloFISH_df.head(4)

Unnamed: 0,Sample ID,teloFISH data,teloFISH means
0,1606103,"[0.18431935655788256, 0.3462969729269309, 0.92...",0.439557
1,1607205,"[0.5473726346264391, 0.3295406677853052, 0.497...",0.452469
2,1607221,"[0.983036568308707, 0.4189076285406422, 0.6199...",0.533217
3,1607231,"[0.48593284910714496, 0.5306163294848134, 0.54...",0.583101


## Saving kelly boar teloFISH dataframe for later retrieval

In [139]:
# Write the teloFISH frame to the cleaned-data folder without the row index.
kelly_boar_teloFISH_df.to_csv(
    path_or_buf='../excel data/cleaned/kelly_boar_teloFISH_df.csv',
    index=False,
)

## Loading Kelly boar dicentric data

In [140]:
# Read the raw dicentric scoring workbook (default first sheet).
kelly_boar_dicentrics = pd.read_excel(
    io='../excel data/raw/Fukushima Dicentrics.xlsx',
)

In [141]:
# Reshape the dicentric table for merging on 'Sample ID':
#  - 'Pig ID' becomes 'Sample ID', standardized with the project helper
#  - dicentrics per cell = 'Dicentrics' / 'Total Scored'
#  - the raw count, image and notes columns are then removed
kelly_boar_dicentrics = (
    kelly_boar_dicentrics
    .rename(columns={'Pig ID': 'Sample ID'})
    .assign(**{
        'Sample ID': lambda frame: frame['Sample ID'].apply(
            lambda sample_id: f_tm.remove_dashes_space_sampleIDs(sample_id)
        ),
        'Average Dicentrics per cell': lambda frame: frame['Dicentrics'] / frame['Total Scored'],
    })
    .drop(columns=['Dicentric Images', 'Notes', 'Dicentrics', 'Total Scored'])
)

kelly_boar_dicentrics.head(4)

Unnamed: 0,Sample ID,Average Dicentrics per cell
0,1607261,0.01
1,1608012,0.03
2,1606101,0.04
3,1611261O205,0.01087


## Saving Kelly boar dicentrics dataframe for later retrieval

In [148]:
# Write the dicentrics frame to the cleaned-data folder without the row index.
kelly_boar_dicentrics.to_csv(
    path_or_buf='../excel data/cleaned/kelly_boar_dicentrics_df.csv',
    index=False,
)

## Loading new dose exposure data for Kelly's boar 

In [169]:
# Reload the saved teloFISH frame from the cleaned-data folder.
# NOTE(review): after a CSV round trip the 'teloFISH data' lists likely come back
# as strings rather than Python lists -- confirm before using the individual lengths.
kelly_boar_teloFISH_df = pd.read_csv(
    filepath_or_buffer='../excel data/cleaned/kelly_boar_teloFISH_df.csv',
)

In [170]:
# Read Kelly's part of the shared boar dose workbook: first sheet, skipping the
# first 7 rows and keeping 40 data rows.
kelly_boar_dose = pd.read_excel(
    io='../excel data/raw/Kelly& Aryn boar_1Dec2018-edited-TH-17Nov19 (2).xlsx',
    sheet_name=0,
    skiprows=7,
    nrows=40,
)
kelly_boar_dose.head(3)

Unnamed: 0,QA/AC,Sample ID,Place name,Unnamed: 3,Long.,Lat.,Unnamed: 6,Unnamed: 7,Unnamed: 8,"MEXT uSv/h, @ 1m, max, at capture site",...,"(from wolfram) Build-up, corrected, Integrated INTERNAL LIFE-TIME dose mGy Cs134","siz-specific INTERNAL Life time dose, mGy 137","INTERNAL Life time dose, mGy 134+137","EXTERNAL Dose Reasonable Life Time, mGy (134+137)",% of INTERNAL to EXTERNAL dose,"EXTERNAL Dose Maximum Life Time, mGy (134+137)","Reasonable Total Life Time Dose mGy (Int+Ext,134+137)","Maximum Total Life Time Dose mGy (Int+Ext,134+137)","Dose rate (uGy/h, not lifetime) at time of capture (Int+Ext; 134+137)",Unnamed: 21
0,√,160610-1,Namie,Akougi-Koakuto,37.568932,140.778014,7.0,3.8-9.5,9.5,9.5,...,0.54,1.626894,2.166894,57.456,3.771398,57.456,59.622894,59.622894,7.348438,
1,√,160610-2,Namie,Sakai-Matsukiuchi,37.479072,140.979392,3.0,3.8-9.5,9.5-19,19.0,...,0.651,2.175234,2.826234,61.152,4.621654,82.992,63.978234,85.818234,3.635025,
2,√,160610-3,Namie,Sakai-Matsukiuchi,37.479072,140.979392,3.0,3.8-9.5,9.5-19,19.0,...,0.645,2.111694,2.756694,68.208,4.041599,92.568,70.964694,95.324694,3.554446,


In [171]:
# Age in whole months from 'Age (hours)': one month is taken as 24 * 30 hours,
# and int() truncates the quotient.
kelly_boar_dose['Age (months)'] = kelly_boar_dose['Age (hours)'].apply(
    lambda age_hours: int(age_hours / (24 * 30))
)

# Standardize Sample IDs with the project helper so the frame can be merged on them.
kelly_boar_dose['Sample ID'] = kelly_boar_dose['Sample ID'].apply(
    lambda sample_id: f_tm.remove_dashes_space_sampleIDs(sample_id)
)

# Remove the QA flag, the unnamed columns and the column whose header is a single space.
# The frame is re-bound instead of mutated with inplace=True, and errors='ignore'
# skips labels that are already gone, so re-running this cell no longer raises KeyError.
kelly_boar_dose = kelly_boar_dose.drop(
    columns=['QA/AC', 'Unnamed: 3', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', ' '],
    errors='ignore',
)

## Saving Kelly's boar dose exposure data for later retrieval

In [172]:
kelly_boar_dose.to_csv('../excel data/cleaned/kelly_boar_dose.csv', index=False)

## Briefly looking at the samples shared between Kelly's boar teloFISH data and the exposure & cortisol data

In [48]:
# Count how many Sample IDs the teloFISH frame shares with the exposure and cortisol frames.
# kelly_boar_dose replaces trim_kelly_new_exposure_df, a name that is never defined in
# this notebook and therefore raises NameError on a fresh kernel.
# NOTE(review): confirm kelly_boar_dose is the exposure frame this comparison was meant to use.
df_list = [kelly_boar_dose, prep_boar_cortisol_df]

for df in df_list:
    f_tm.count_shared_sample_IDs(kelly_boar_teloFISH_df, df, print_names='no')

The number of sample IDs in common are: 38
The number of sample IDs in common are: 14


In [49]:
# List the teloFISH Sample IDs that have no match in the exposure (dose) data.
# kelly_boar_dose replaces trim_kelly_new_exposure_df, a name that is never defined in
# this notebook and therefore raises NameError on a fresh kernel.
# NOTE(review): confirm kelly_boar_dose is the exposure frame this comparison was meant to use.
kelly_teloFISH_IDs = list(kelly_boar_teloFISH_df['Sample ID'].unique())
kelly_exposure_IDs = list(kelly_boar_dose['Sample ID'].unique())

# set membership avoids rescanning the exposure list for every teloFISH ID;
# the order of the result still follows kelly_teloFISH_IDs
exposure_ID_lookup = set(kelly_exposure_IDs)
mismatched = [sample_ID for sample_ID in kelly_teloFISH_IDs if sample_ID not in exposure_ID_lookup]
print(mismatched)

['161209O239', '161209O240', '161209O242']
