In [1]:
# general purpose modules for handling data
import numpy as np
import pandas as pd

# for loading telo data column containing individual telomere length values
from ast import literal_eval

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style(style="darkgrid")

# custom module for handling telomere length data
import telomere_methods_astros as telo_ma

# statistics
import scipy.stats as stats
import scikit_posthocs as sp
from statsmodels.stats.anova import AnovaRM

# machine learning 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, auc, accuracy_score, r2_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from bayes_opt import BayesianOptimization

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import importlib
# Enable IPython's autoreload extension in mode 2 so imported modules are re-imported
# before each cell executes (presumably so edits to telomere_methods_astros are picked
# up without restarting the kernel).
%load_ext autoreload
%autoreload 2
# NOTE(review): reloading the extension immediately after loading it looks redundant — confirm it is needed
%reload_ext autoreload

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


# XGBoost machine learning with pre-flight individual telomere length data (Telo-FISH) for prediction of post-flight means

## Reading in astronaut telomere length data

In [2]:
# mean telomere length values; astronaut ids are kept as strings
astro_df = pd.read_csv('../data/compiled data/All_astronauts_telomere_length_dataframe.csv')
astro_df['astro id'] = astro_df['astro id'].astype('str')

# each 'telo data' cell is read from the CSV as text; evaluate it as a Python literal
# and wrap the result in a numpy array
astro_df['telo data'] = astro_df['telo data'].apply(lambda cell: np.array(literal_eval(cell)))

# make short/long telomeres (adds the Q1 / Q2-3 / Q4 columns via the project helper)
astro_df = telo_ma.make_quartiles_columns(astro_df)

# cast all three quartile columns to int64 in a single call
astro_df = astro_df.astype({quartile: 'int64' for quartile in ('Q1', 'Q2-3', 'Q4')})

# the raw per-telomere arrays and the middle quartiles are not used downstream
astro_df = astro_df.drop(columns=['telo data', 'Q2-3'])

In [3]:
# individual telomere length data: drop the per-sample means and give the
# exploded measurement column a descriptive name
exploded_telos = (
    pd.read_csv('../data/compiled and processed data/exploded_telos_astros_df.csv')
    .drop(columns=['telo means'])
    .rename(columns={'telo data exploded': 'individual telomeres'})
)

# astronaut ids are kept as strings, matching astro_df
exploded_telos['astro id'] = exploded_telos['astro id'].astype('str')

# Preparing dataframes for machine learning

In [246]:
# create 4 separate dataframes bearing pre-flight individual telomere lengths & timepoint specific
# post-flight values, one per post-flight timepoint (R+7, R+60, R+180, R+270);
# the helper is called once per timepoint, in the order listed, and the results are unpacked by position
ml_telos_df_r7, ml_telos_df_r60, ml_telos_df_r180, ml_telos_df_r270 = (
    telo_ma.make_post_flight_df_and_merge(
        astro_df=astro_df,
        exploded_telos=exploded_telos,
        timepoint=post_flight_timepoint,
    )
    for post_flight_timepoint in ('R+7', 'R+60', 'R+180', 'R+270')
)

In [247]:
# preview the first four rows of the R+7 frame to sanity-check the merge
ml_telos_df_r7.head(n=4)

Unnamed: 0,astro id,timepoint,individual telomeres,R+7 telo means,R+7 Q1,R+7 Q4
0,5163,L-270,132.793184,82.169298,1876,1003
1,5163,L-180,72.034748,82.169298,1876,1003
2,5163,L-60,89.558971,82.169298,1876,1003
3,5163,L-270,73.621784,82.169298,1876,1003


In [274]:
# train/test split, stratified on the combination of astro id and pre-flight timepoint
df = ml_telos_df_r270.copy()

# random_state pins the shuffle so the split, and every metric computed from it
# downstream, is reproducible under Restart Kernel -> Run All
train_set, test_set = train_test_split(df, test_size=0.2, shuffle=True, random_state=1,
                                       stratify=df[['astro id', 'timepoint']])

In [275]:
# initializing the data cleaning pipeline; steps run in order:
# feature creation -> dummy encoding -> final cleanup for the R+270 'telo means' target
clean_process_pipe = Pipeline(
    steps=[
        ('features', telo_ma.make_features(make_log_target=False)),
        ('dummies', telo_ma.make_dummies(drop_first=True, how_dummify='encode')),
        ('cleaner', telo_ma.clean_data(drop_astro_id=True, timepoint='R+270', target='telo means')),
    ]
)

In [276]:
# cleaning data with pipeline
# Fit the preprocessing on the training split only, then apply that same fitted
# pipeline to the test split. Calling fit_transform on the test split would re-fit
# the steps on test data, so train and test could end up encoded differently and
# test-set information would leak into preprocessing.
train_clean = clean_process_pipe.fit_transform(train_set.copy())
test_clean = clean_process_pipe.transform(test_set.copy())

train_clean.head(4)

In [278]:
# gradient-boosted regressor with fixed hyperparameters and a fixed seed
model = XGBRegressor(
    n_estimators=200,
    max_depth=12,
    learning_rate=0.2,
    objective='reg:squarederror',
    random_state=1,
)

# model-only pipeline (expects already-cleaned data)
xgb_pipe = Pipeline(steps=[('XGB', model)])

# end-to-end pipeline: raw split -> cleaning steps -> the same model instance
full_pipe = Pipeline(
    steps=[
        ('clean_process', clean_process_pipe),
        ('model', model),
    ]
)

In [280]:
# five fold cross validation for predicting postflight timepoint/target with preflight individual telomeres;
# the helper prints the per-fold MAE plus test-set MAE / R2 and returns two values
fit_xgb_model, telo_row = telo_ma.cv_score_fit_mae_test(
    train_set=train_clean,
    test_set=test_clean,
    target='R+270 telo means',
    model=model,
    cv=5,
)

MAE per CV fold: 
[13.87808449 13.75387456 13.8630172  13.87507261 13.96282344] 

MEAN of MAE all folds: 13.866574461545309
STD of MAE all folds: 0.06660703561835987

MAE of predict_y_test & y_test: 13.826572352872063
R2 between predict_y_test & y_test: 0.18759597114995386


# Clustering mean telomere length

In [6]:
import importlib
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
