In [21]:
#Package loading
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime, date, time, timedelta

import matplotlib.pyplot as plt
import matplotlib.pylab as pl
from matplotlib import rcParams
from matplotlib.dates import DateFormatter
from matplotlib import rc
from matplotlib.dates import date2num
from IPython import display
%matplotlib inline

from scipy import stats as sps
from scipy.interpolate import interp1d

from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score

import statsmodels.api as sm
from os import path
from collections import defaultdict
import warnings
# Silence all warnings for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings raised by pandas/statsmodels -- consider narrowing it.
warnings.filterwarnings('ignore')

# Notebook-wide matplotlib defaults: large figures, large serif text,
# 'plasma' colormap and thicker axes frames.
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 18,
    'image.cmap': 'plasma',
    'axes.linewidth': 2,
    'font.family': 'serif',
})

## Preparing regressions

In [35]:
# Read the input table (relative path, resolved against the kernel's working directory).
data_dir = r"./Data/"
csv_name = "hchs_quest_pa_one_hot.csv"
df = pd.read_csv(data_dir + csv_name)

In [57]:
# Base for the regression pipeline, this time with 'real' data extracted
# earlier in the notebook.
# Most of the PA columns are left out because they are highly correlated.

# Sleep outcomes (used as regression targets further down).
sleep_features = ['SleepEfficiency', 'Awakening', 'TotalSleepTime',
                  'TotalWakeTime', 'SRIDay', 'SRISleep']

# The physical-activity bout columns that are kept.
activity_columns = ['mvpa_bouts_1min_grouped',
                    'mvpa_bouts_10min_decomp',
                    'vpa_bouts_1min_grouped']

# Remaining covariates, in the column order used for df_nan.
covariate_columns = [
    'gender', 'age', 'bmi', 'cafe', 'tea',
    'cancer', 'ssleepd', 'epworth', 'whiirs', 'insomnia_sev',
    'insomnia_sev_grp_2.0', 'insomnia_sev_grp_3.0', 'insomnia_sev_grp_4.0',
    'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0',
    'cafe_wake_1', 'cafe_wake_2', 'cafe_wake_3', 'cafe_wake_4',
    'employ_2.0', 'employ_3.0', 'employ_4.0',
    'diabetes_2.0', 'diabetes_3.0',
    'persontype_1', 'persontype_2', 'persontype_3', 'persontype_4',
]

# Working subset of the raw frame (may still contain NaNs, hence the name).
df_nan = df[sleep_features + activity_columns + covariate_columns]

# NOTE(review): scaled_features is defined but never used in this cell -- the
# scaler below is fitted on *every* column of df_nan, including gender and the
# one-hot columns. Confirm whether only these columns were meant to be scaled.
scaled_features = sleep_features + activity_columns + [
    'age', 'bmi', 'ssleepd', 'epworth', 'whiirs']

scaler = StandardScaler()

# Standardise all columns, rebuild a labelled frame, then drop incomplete rows.
standardised_values = scaler.fit_transform(df_nan)
df_scaled = pd.DataFrame(standardised_values, columns=df_nan.columns).dropna()

# Models:

## Model 1: MVPA, age, sex
## Model 2: Model 1 + interactions (MVPA*age, MVPA*sex, MVPA*BMI)
## Model 3: Model 1 + other anthropometrics (BMI, alcohol, cigar)
## Model 4: Model 3 + diet/lifestyle (cafe, tea, cafe_wake, employ)
## Model 5: Model 4 + disease (cancer, diabetes)
## Model 6: Model 5 + sleep quest (ssleepd, Epworth, WHIIRS, insomnia, person_type)

## 1. TST

In [61]:
# Model 1 (outcome: TotalSleepTime) -- predictors: the activity-bout columns, gender and age.
col1 = 'TotalSleepTime'
features_model1 = [
    'mvpa_bouts_1min_grouped',
    'mvpa_bouts_10min_decomp',
    'vpa_bouts_1min_grouped',
    'gender',
    'age',
]

# Design matrix with an explicit intercept column, plus the target series.
X_col1 = sm.add_constant(df_scaled[features_model1])
y_col1 = df_scaled[col1]

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used for the p-values; its argument order is
# sm.OLS(output, input), i.e. target first.
model1_col1_sm = sm.OLS(y_train, X_train).fit()
predictions = model1_col1_sm.predict(X_test)  # held-out predictions (not used further in this cell)

# Print out the statistics
print(model1_col1_sm.summary())

                            OLS Regression Results                            
Dep. Variable:         TotalSleepTime   R-squared:                       0.099
Model:                            OLS   Adj. R-squared:                  0.093
Method:                 Least Squares   F-statistic:                     17.40
Date:                Mon, 22 Jun 2020   Prob (F-statistic):           2.26e-16
Time:                        11:36:08   Log-Likelihood:                -1094.5
No. Observations:                 801   AIC:                             2201.
Df Residuals:                     795   BIC:                             2229.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [62]:
#Model 2 - MVPA, sex, age + interactions of MVPA
features_model1 = ['mvpa_bouts_1min_grouped',
        'mvpa_bouts_10min_decomp', 'vpa_bouts_1min_grouped','gender', 'age']
interactions = ['mvpa*age','mvpa*gender','mvpa*bmi']

col1 = 'TotalSleepTime'

# Build the interaction terms on a fresh frame: .assign() returns a new
# DataFrame, so df_scaled itself is left untouched. (Binding df_int to
# df_scaled and assigning columns on it would add them to df_scaled too and
# leak state into every later cell.)
# Each term is the product of the 1-min MVPA bout column with one covariate.
# NOTE(review): 'mvpa*bmi' enters the model without a 'bmi' main effect -- confirm this is intended.
mvpa_1min = df_scaled['mvpa_bouts_1min_grouped']
df_int = df_scaled.assign(**{
    'mvpa*age': mvpa_1min * df_scaled['age'],
    'mvpa*gender': mvpa_1min * df_scaled['gender'],
    'mvpa*bmi': mvpa_1min * df_scaled['bmi'],
})

X_col1 = df_int[features_model1+interactions]
y_col1 = df_int[col1]

# Explicit intercept column for the OLS fit.
X_col1 = sm.add_constant(X_col1)

# 80/20 split with a fixed seed so the partition is repeatable.
X_train,X_test,y_train,y_test=train_test_split(X_col1,y_col1, test_size=0.2, random_state=111)

#Regression with sm for pvalues
# Note the difference in argument order

model2_col1_sm = sm.OLS(y_train,X_train).fit() ## sm.OLS(output, input)
predictions = model2_col1_sm.predict(X_test)

# Print out the statistics
print(model2_col1_sm.summary())

                            OLS Regression Results                            
Dep. Variable:         TotalSleepTime   R-squared:                       0.100
Model:                            OLS   Adj. R-squared:                  0.091
Method:                 Least Squares   F-statistic:                     10.98
Date:                Mon, 22 Jun 2020   Prob (F-statistic):           9.04e-15
Time:                        11:36:11   Log-Likelihood:                -1094.0
No. Observations:                 801   AIC:                             2206.
Df Residuals:                     792   BIC:                             2248.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [64]:
# Model 3 (outcome: TotalSleepTime) -- Model 1 predictors + other anthropometrics.
col1 = 'TotalSleepTime'
features_model1 = [
    'mvpa_bouts_1min_grouped',
    'mvpa_bouts_10min_decomp',
    'vpa_bouts_1min_grouped',
    'gender',
    'age',
]
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']

# Design matrix with an explicit intercept column, plus the target series.
X_col1 = sm.add_constant(df_scaled[[*features_model1, *model3_add]])
y_col1 = df_scaled[col1]

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used for the p-values; its argument order is
# sm.OLS(output, input), i.e. target first.
model3_col1_sm = sm.OLS(y_train, X_train).fit()
predictions = model3_col1_sm.predict(X_test)  # held-out predictions (not used further in this cell)

# Print out the statistics
print(model3_col1_sm.summary())

                            OLS Regression Results                            
Dep. Variable:         TotalSleepTime   R-squared:                       0.118
Model:                            OLS   Adj. R-squared:                  0.107
Method:                 Least Squares   F-statistic:                     10.54
Date:                Mon, 22 Jun 2020   Prob (F-statistic):           7.14e-17
Time:                        11:39:09   Log-Likelihood:                -1085.9
No. Observations:                 801   AIC:                             2194.
Df Residuals:                     790   BIC:                             2245.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [65]:
# Model 4 (outcome: TotalSleepTime) -- Model 3 predictors + diet/lifestyle.
col1 = 'TotalSleepTime'
features_model1 = [
    'mvpa_bouts_1min_grouped',
    'mvpa_bouts_10min_decomp',
    'vpa_bouts_1min_grouped',
    'gender',
    'age',
]
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']
model4_add = [
    'cafe', 'tea',
    'cafe_wake_1', 'cafe_wake_2', 'cafe_wake_3', 'cafe_wake_4',
    'employ_2.0', 'employ_3.0', 'employ_4.0',
]

# Design matrix with an explicit intercept column, plus the target series.
X_col1 = sm.add_constant(df_scaled[[*features_model1, *model3_add, *model4_add]])
y_col1 = df_scaled[col1]

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used for the p-values; its argument order is
# sm.OLS(output, input), i.e. target first.
model4_col1_sm = sm.OLS(y_train, X_train).fit()
predictions = model4_col1_sm.predict(X_test)  # held-out predictions (not used further in this cell)

# Print out the statistics
print(model4_col1_sm.summary())

                            OLS Regression Results                            
Dep. Variable:         TotalSleepTime   R-squared:                       0.142
Model:                            OLS   Adj. R-squared:                  0.121
Method:                 Least Squares   F-statistic:                     6.803
Date:                Mon, 22 Jun 2020   Prob (F-statistic):           7.55e-17
Time:                        11:41:27   Log-Likelihood:                -1074.7
No. Observations:                 801   AIC:                             2189.
Df Residuals:                     781   BIC:                             2283.
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [66]:
# Model 5 (outcome: TotalSleepTime) -- Model 4 predictors + disease indicators.
col1 = 'TotalSleepTime'
features_model1 = [
    'mvpa_bouts_1min_grouped',
    'mvpa_bouts_10min_decomp',
    'vpa_bouts_1min_grouped',
    'gender',
    'age',
]
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']
model4_add = [
    'cafe', 'tea',
    'cafe_wake_1', 'cafe_wake_2', 'cafe_wake_3', 'cafe_wake_4',
    'employ_2.0', 'employ_3.0', 'employ_4.0',
]
model5_add = ['cancer', 'diabetes_2.0', 'diabetes_3.0']

# Design matrix with an explicit intercept column, plus the target series.
X_col1 = sm.add_constant(
    df_scaled[[*features_model1, *model3_add, *model4_add, *model5_add]]
)
y_col1 = df_scaled[col1]

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used for the p-values; its argument order is
# sm.OLS(output, input), i.e. target first.
model5_col1_sm = sm.OLS(y_train, X_train).fit()
predictions = model5_col1_sm.predict(X_test)  # held-out predictions (not used further in this cell)

# Print out the statistics
print(model5_col1_sm.summary())

                            OLS Regression Results                            
Dep. Variable:         TotalSleepTime   R-squared:                       0.150
Model:                            OLS   Adj. R-squared:                  0.126
Method:                 Least Squares   F-statistic:                     6.219
Date:                Mon, 22 Jun 2020   Prob (F-statistic):           7.03e-17
Time:                        11:44:28   Log-Likelihood:                -1071.2
No. Observations:                 801   AIC:                             2188.
Df Residuals:                     778   BIC:                             2296.
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [67]:
# Model 6 (outcome: TotalSleepTime) -- Model 5 predictors + sleep questionnaires.
col1 = 'TotalSleepTime'
features_model1 = [
    'mvpa_bouts_1min_grouped',
    'mvpa_bouts_10min_decomp',
    'vpa_bouts_1min_grouped',
    'gender',
    'age',
]
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']
model4_add = [
    'cafe', 'tea',
    'cafe_wake_1', 'cafe_wake_2', 'cafe_wake_3', 'cafe_wake_4',
    'employ_2.0', 'employ_3.0', 'employ_4.0',
]
model5_add = ['cancer', 'diabetes_2.0', 'diabetes_3.0']
model6_add = [
    'ssleepd', 'epworth', 'whiirs', 'insomnia_sev',
    'insomnia_sev_grp_2.0', 'insomnia_sev_grp_3.0', 'insomnia_sev_grp_4.0',
    'persontype_1', 'persontype_2', 'persontype_3', 'persontype_4',
]

# Design matrix with an explicit intercept column, plus the target series.
X_col1 = sm.add_constant(
    df_scaled[[*features_model1, *model3_add, *model4_add, *model5_add, *model6_add]]
)
y_col1 = df_scaled[col1]

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used for the p-values; its argument order is
# sm.OLS(output, input), i.e. target first.
model6_col1_sm = sm.OLS(y_train, X_train).fit()
predictions = model6_col1_sm.predict(X_test)  # held-out predictions (not used further in this cell)

# Print out the statistics
print(model6_col1_sm.summary())

                            OLS Regression Results                            
Dep. Variable:         TotalSleepTime   R-squared:                       0.251
Model:                            OLS   Adj. R-squared:                  0.219
Method:                 Least Squares   F-statistic:                     7.781
Date:                Mon, 22 Jun 2020   Prob (F-statistic):           1.34e-30
Time:                        11:46:37   Log-Likelihood:                -1020.4
No. Observations:                 801   AIC:                             2109.
Df Residuals:                     767   BIC:                             2268.
Df Model:                          33                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

## 2. Sleep Efficiency

In [69]:
# Model 1 (outcome: SleepEfficiency) -- predictors: the activity-bout columns, gender and age.
col1 = 'SleepEfficiency'
features_model1 = [
    'mvpa_bouts_1min_grouped',
    'mvpa_bouts_10min_decomp',
    'vpa_bouts_1min_grouped',
    'gender',
    'age',
]

# Design matrix with an explicit intercept column, plus the target series.
X_col1 = sm.add_constant(df_scaled[features_model1])
y_col1 = df_scaled[col1]

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used for the p-values; its argument order is
# sm.OLS(output, input), i.e. target first.
model1_col2_sm = sm.OLS(y_train, X_train).fit()
predictions = model1_col2_sm.predict(X_test)  # held-out predictions (not used further in this cell)

# Print out the statistics
print(model1_col2_sm.summary())

                            OLS Regression Results                            
Dep. Variable:        SleepEfficiency   R-squared:                       0.062
Model:                            OLS   Adj. R-squared:                  0.056
Method:                 Least Squares   F-statistic:                     10.52
Date:                Mon, 22 Jun 2020   Prob (F-statistic):           8.51e-10
Time:                        11:49:52   Log-Likelihood:                -1137.6
No. Observations:                 801   AIC:                             2287.
Df Residuals:                     795   BIC:                             2315.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [70]:
#Model 2 - MVPA, sex, age + interactions of MVPA
features_model1 = ['mvpa_bouts_1min_grouped',
        'mvpa_bouts_10min_decomp', 'vpa_bouts_1min_grouped','gender', 'age']
interactions = ['mvpa*age','mvpa*gender','mvpa*bmi']

col1 = 'SleepEfficiency'

# Build the interaction terms on a fresh frame: .assign() returns a new
# DataFrame, so df_scaled itself is left untouched. (Binding df_int to
# df_scaled and assigning columns on it would add them to df_scaled too and
# leak state into every later cell.)
# Each term is the product of the 1-min MVPA bout column with one covariate.
# NOTE(review): 'mvpa*bmi' enters the model without a 'bmi' main effect -- confirm this is intended.
mvpa_1min = df_scaled['mvpa_bouts_1min_grouped']
df_int = df_scaled.assign(**{
    'mvpa*age': mvpa_1min * df_scaled['age'],
    'mvpa*gender': mvpa_1min * df_scaled['gender'],
    'mvpa*bmi': mvpa_1min * df_scaled['bmi'],
})

X_col1 = df_int[features_model1+interactions]
y_col1 = df_int[col1]

# Explicit intercept column for the OLS fit.
X_col1 = sm.add_constant(X_col1)

# 80/20 split with a fixed seed so the partition is repeatable.
X_train,X_test,y_train,y_test=train_test_split(X_col1,y_col1, test_size=0.2, random_state=111)

#Regression with sm for pvalues
# Note the difference in argument order

model2_col2_sm = sm.OLS(y_train,X_train).fit() ## sm.OLS(output, input)
predictions = model2_col2_sm.predict(X_test)

# Print out the statistics
print(model2_col2_sm.summary())

                            OLS Regression Results                            
Dep. Variable:        SleepEfficiency   R-squared:                       0.066
Model:                            OLS   Adj. R-squared:                  0.056
Method:                 Least Squares   F-statistic:                     6.986
Date:                Mon, 22 Jun 2020   Prob (F-statistic):           6.30e-09
Time:                        11:50:55   Log-Likelihood:                -1135.9
No. Observations:                 801   AIC:                             2290.
Df Residuals:                     792   BIC:                             2332.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [71]:
# Model 3 (outcome: SleepEfficiency) -- Model 1 predictors + other anthropometrics.
col1 = 'SleepEfficiency'
features_model1 = [
    'mvpa_bouts_1min_grouped',
    'mvpa_bouts_10min_decomp',
    'vpa_bouts_1min_grouped',
    'gender',
    'age',
]
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']

# Design matrix with an explicit intercept column, plus the target series.
X_col1 = sm.add_constant(df_scaled[[*features_model1, *model3_add]])
y_col1 = df_scaled[col1]

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used for the p-values; its argument order is
# sm.OLS(output, input), i.e. target first.
model3_col2_sm = sm.OLS(y_train, X_train).fit()
predictions = model3_col2_sm.predict(X_test)  # held-out predictions (not used further in this cell)

# Print out the statistics
print(model3_col2_sm.summary())

                            OLS Regression Results                            
Dep. Variable:        SleepEfficiency   R-squared:                       0.090
Model:                            OLS   Adj. R-squared:                  0.078
Method:                 Least Squares   F-statistic:                     7.792
Date:                Mon, 22 Jun 2020   Prob (F-statistic):           5.47e-12
Time:                        11:51:42   Log-Likelihood:                -1125.6
No. Observations:                 801   AIC:                             2273.
Df Residuals:                     790   BIC:                             2325.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [72]:
# Model 4 (outcome: SleepEfficiency) -- Model 3 predictors + diet/lifestyle.
col1 = 'SleepEfficiency'
features_model1 = [
    'mvpa_bouts_1min_grouped',
    'mvpa_bouts_10min_decomp',
    'vpa_bouts_1min_grouped',
    'gender',
    'age',
]
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']
model4_add = [
    'cafe', 'tea',
    'cafe_wake_1', 'cafe_wake_2', 'cafe_wake_3', 'cafe_wake_4',
    'employ_2.0', 'employ_3.0', 'employ_4.0',
]

# Design matrix with an explicit intercept column, plus the target series.
X_col1 = sm.add_constant(df_scaled[[*features_model1, *model3_add, *model4_add]])
y_col1 = df_scaled[col1]

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used for the p-values; its argument order is
# sm.OLS(output, input), i.e. target first.
model4_col2_sm = sm.OLS(y_train, X_train).fit()
predictions = model4_col2_sm.predict(X_test)  # held-out predictions (not used further in this cell)

# Print out the statistics
print(model4_col2_sm.summary())

                            OLS Regression Results                            
Dep. Variable:        SleepEfficiency   R-squared:                       0.121
Model:                            OLS   Adj. R-squared:                  0.099
Method:                 Least Squares   F-statistic:                     5.644
Date:                Mon, 22 Jun 2020   Prob (F-statistic):           2.78e-13
Time:                        11:52:51   Log-Likelihood:                -1111.7
No. Observations:                 801   AIC:                             2263.
Df Residuals:                     781   BIC:                             2357.
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [73]:
# Model 5 (outcome: SleepEfficiency) -- Model 4 predictors + disease indicators.
col1 = 'SleepEfficiency'
features_model1 = [
    'mvpa_bouts_1min_grouped',
    'mvpa_bouts_10min_decomp',
    'vpa_bouts_1min_grouped',
    'gender',
    'age',
]
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']
model4_add = [
    'cafe', 'tea',
    'cafe_wake_1', 'cafe_wake_2', 'cafe_wake_3', 'cafe_wake_4',
    'employ_2.0', 'employ_3.0', 'employ_4.0',
]
model5_add = ['cancer', 'diabetes_2.0', 'diabetes_3.0']

# Design matrix with an explicit intercept column, plus the target series.
X_col1 = sm.add_constant(
    df_scaled[[*features_model1, *model3_add, *model4_add, *model5_add]]
)
y_col1 = df_scaled[col1]

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used for the p-values; its argument order is
# sm.OLS(output, input), i.e. target first.
model5_col2_sm = sm.OLS(y_train, X_train).fit()
predictions = model5_col2_sm.predict(X_test)  # held-out predictions (not used further in this cell)

# Print out the statistics
print(model5_col2_sm.summary())

                            OLS Regression Results                            
Dep. Variable:        SleepEfficiency   R-squared:                       0.124
Model:                            OLS   Adj. R-squared:                  0.099
Method:                 Least Squares   F-statistic:                     4.991
Date:                Mon, 22 Jun 2020   Prob (F-statistic):           1.28e-12
Time:                        11:53:46   Log-Likelihood:                -1110.4
No. Observations:                 801   AIC:                             2267.
Df Residuals:                     778   BIC:                             2374.
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [74]:
# Model 6 (outcome: SleepEfficiency) -- Model 5 predictors + sleep questionnaires.
col1 = 'SleepEfficiency'
features_model1 = [
    'mvpa_bouts_1min_grouped',
    'mvpa_bouts_10min_decomp',
    'vpa_bouts_1min_grouped',
    'gender',
    'age',
]
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']
model4_add = [
    'cafe', 'tea',
    'cafe_wake_1', 'cafe_wake_2', 'cafe_wake_3', 'cafe_wake_4',
    'employ_2.0', 'employ_3.0', 'employ_4.0',
]
model5_add = ['cancer', 'diabetes_2.0', 'diabetes_3.0']
model6_add = [
    'ssleepd', 'epworth', 'whiirs', 'insomnia_sev',
    'insomnia_sev_grp_2.0', 'insomnia_sev_grp_3.0', 'insomnia_sev_grp_4.0',
    'persontype_1', 'persontype_2', 'persontype_3', 'persontype_4',
]

# Design matrix with an explicit intercept column, plus the target series.
X_col1 = sm.add_constant(
    df_scaled[[*features_model1, *model3_add, *model4_add, *model5_add, *model6_add]]
)
y_col1 = df_scaled[col1]

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used for the p-values; its argument order is
# sm.OLS(output, input), i.e. target first.
model6_col2_sm = sm.OLS(y_train, X_train).fit()
predictions = model6_col2_sm.predict(X_test)  # held-out predictions (not used further in this cell)

# Print out the statistics
print(model6_col2_sm.summary())

                            OLS Regression Results                            
Dep. Variable:        SleepEfficiency   R-squared:                       0.165
Model:                            OLS   Adj. R-squared:                  0.129
Method:                 Least Squares   F-statistic:                     4.588
Date:                Mon, 22 Jun 2020   Prob (F-statistic):           2.72e-15
Time:                        11:54:39   Log-Likelihood:                -1091.1
No. Observations:                 801   AIC:                             2250.
Df Residuals:                     767   BIC:                             2409.
Df Model:                          33                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

## 3. SRI

In [77]:
# Model 1 (outcome: SRISleep) -- predictors: the activity-bout columns, gender and age.
col1 = 'SRISleep'
features_model1 = [
    'mvpa_bouts_1min_grouped',
    'mvpa_bouts_10min_decomp',
    'vpa_bouts_1min_grouped',
    'gender',
    'age',
]

# Design matrix with an explicit intercept column, plus the target series.
X_col1 = sm.add_constant(df_scaled[features_model1])
y_col1 = df_scaled[col1]

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used for the p-values; its argument order is
# sm.OLS(output, input), i.e. target first.
model1_col3_sm = sm.OLS(y_train, X_train).fit()
predictions = model1_col3_sm.predict(X_test)  # held-out predictions (not used further in this cell)

# Print out the statistics
print(model1_col3_sm.summary())

                            OLS Regression Results                            
Dep. Variable:               SRISleep   R-squared:                       0.048
Model:                            OLS   Adj. R-squared:                  0.042
Method:                 Least Squares   F-statistic:                     7.963
Date:                Mon, 22 Jun 2020   Prob (F-statistic):           2.46e-07
Time:                        11:56:30   Log-Likelihood:                -1115.6
No. Observations:                 801   AIC:                             2243.
Df Residuals:                     795   BIC:                             2271.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [78]:
#Model 2 - MVPA, sex, age + interactions of MVPA
features_model1 = ['mvpa_bouts_1min_grouped',
        'mvpa_bouts_10min_decomp', 'vpa_bouts_1min_grouped','gender', 'age']
interactions = ['mvpa*age','mvpa*gender','mvpa*bmi']

col1 = 'SRISleep'

# Build the interaction terms on a fresh frame: .assign() returns a new
# DataFrame, so df_scaled itself is left untouched. (Binding df_int to
# df_scaled and assigning columns on it would add them to df_scaled too and
# leak state into every later cell.)
# Each term is the product of the 1-min MVPA bout column with one covariate.
# NOTE(review): 'mvpa*bmi' enters the model without a 'bmi' main effect -- confirm this is intended.
mvpa_1min = df_scaled['mvpa_bouts_1min_grouped']
df_int = df_scaled.assign(**{
    'mvpa*age': mvpa_1min * df_scaled['age'],
    'mvpa*gender': mvpa_1min * df_scaled['gender'],
    'mvpa*bmi': mvpa_1min * df_scaled['bmi'],
})

X_col1 = df_int[features_model1+interactions]
y_col1 = df_int[col1]

# Explicit intercept column for the OLS fit.
X_col1 = sm.add_constant(X_col1)

# 80/20 split with a fixed seed so the partition is repeatable.
X_train,X_test,y_train,y_test=train_test_split(X_col1,y_col1, test_size=0.2, random_state=111)

#Regression with sm for pvalues
# Note the difference in argument order

model2_col3_sm = sm.OLS(y_train,X_train).fit() ## sm.OLS(output, input)
predictions = model2_col3_sm.predict(X_test)

# Print out the statistics
print(model2_col3_sm.summary())

                            OLS Regression Results                            
Dep. Variable:               SRISleep   R-squared:                       0.062
Model:                            OLS   Adj. R-squared:                  0.052
Method:                 Least Squares   F-statistic:                     6.490
Date:                Mon, 22 Jun 2020   Prob (F-statistic):           3.31e-08
Time:                        11:57:04   Log-Likelihood:                -1109.8
No. Observations:                 801   AIC:                             2238.
Df Residuals:                     792   BIC:                             2280.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [79]:
# Model 3 (outcome: SRISleep) -- Model 1 predictors + other anthropometrics.
col1 = 'SRISleep'
features_model1 = [
    'mvpa_bouts_1min_grouped',
    'mvpa_bouts_10min_decomp',
    'vpa_bouts_1min_grouped',
    'gender',
    'age',
]
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']

# Design matrix with an explicit intercept column, plus the target series.
X_col1 = sm.add_constant(df_scaled[[*features_model1, *model3_add]])
y_col1 = df_scaled[col1]

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used for the p-values; its argument order is
# sm.OLS(output, input), i.e. target first.
model3_col3_sm = sm.OLS(y_train, X_train).fit()
predictions = model3_col3_sm.predict(X_test)  # held-out predictions (not used further in this cell)

# Print out the statistics
print(model3_col3_sm.summary())

                            OLS Regression Results                            
Dep. Variable:               SRISleep   R-squared:                       0.072
Model:                            OLS   Adj. R-squared:                  0.060
Method:                 Least Squares   F-statistic:                     6.132
Date:                Mon, 22 Jun 2020   Prob (F-statistic):           4.80e-09
Time:                        11:57:48   Log-Likelihood:                -1105.2
No. Observations:                 801   AIC:                             2232.
Df Residuals:                     790   BIC:                             2284.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [80]:
# Model 4 (outcome: SRISleep) -- Model 3 predictors + diet/lifestyle.
col1 = 'SRISleep'
features_model1 = [
    'mvpa_bouts_1min_grouped',
    'mvpa_bouts_10min_decomp',
    'vpa_bouts_1min_grouped',
    'gender',
    'age',
]
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']
model4_add = [
    'cafe', 'tea',
    'cafe_wake_1', 'cafe_wake_2', 'cafe_wake_3', 'cafe_wake_4',
    'employ_2.0', 'employ_3.0', 'employ_4.0',
]

# Design matrix with an explicit intercept column, plus the target series.
X_col1 = sm.add_constant(df_scaled[[*features_model1, *model3_add, *model4_add]])
y_col1 = df_scaled[col1]

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used for the p-values; its argument order is
# sm.OLS(output, input), i.e. target first.
model4_col3_sm = sm.OLS(y_train, X_train).fit()
predictions = model4_col3_sm.predict(X_test)  # held-out predictions (not used further in this cell)

# Print out the statistics
print(model4_col3_sm.summary())

                            OLS Regression Results                            
Dep. Variable:               SRISleep   R-squared:                       0.085
Model:                            OLS   Adj. R-squared:                  0.063
Method:                 Least Squares   F-statistic:                     3.830
Date:                Mon, 22 Jun 2020   Prob (F-statistic):           8.11e-08
Time:                        11:59:14   Log-Likelihood:                -1099.5
No. Observations:                 801   AIC:                             2239.
Df Residuals:                     781   BIC:                             2333.
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [81]:
# Model 5: MVPA, sex, age + other anthropometric + diet/lifestyle + disease
features_model1 = [
    'mvpa_bouts_1min_grouped',
    'mvpa_bouts_10min_decomp',
    'vpa_bouts_1min_grouped',
    'gender',
    'age',
]
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']
model4_add = [
    'cafe', 'tea',
    'cafe_wake_1', 'cafe_wake_2', 'cafe_wake_3', 'cafe_wake_4',
    'employ_2.0', 'employ_3.0', 'employ_4.0',
]
model5_add = ['cancer', 'diabetes_2.0', 'diabetes_3.0']

# Response column for this model.
col1 = 'SRISleep'

# Response and predictor matrix are both taken from df_scaled; the predictors
# are the four feature groups concatenated in order, plus a constant column.
y_col1 = df_scaled[col1]
X_col1 = sm.add_constant(
    df_scaled[[*features_model1, *model3_add, *model4_add, *model5_add]]
)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1,
    y_col1,
    test_size=0.2,
    random_state=111,
)

# statsmodels OLS is used for its p-values. Argument order is response (endog)
# first, predictors (exog) second.
model5_col3_sm = sm.OLS(endog=y_train, exog=X_train).fit()
predictions = model5_col3_sm.predict(X_test)

# Show the regression summary table.
print(model5_col3_sm.summary())

                            OLS Regression Results                            
Dep. Variable:               SRISleep   R-squared:                       0.090
Model:                            OLS   Adj. R-squared:                  0.064
Method:                 Least Squares   F-statistic:                     3.478
Date:                Mon, 22 Jun 2020   Prob (F-statistic):           1.58e-07
Time:                        12:00:21   Log-Likelihood:                -1097.6
No. Observations:                 801   AIC:                             2241.
Df Residuals:                     778   BIC:                             2349.
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [82]:
# Model 6: MVPA, sex, age + other anthropometric + diet/lifestyle + disease
#          + sleep questionnaires
features_model1 = [
    'mvpa_bouts_1min_grouped',
    'mvpa_bouts_10min_decomp',
    'vpa_bouts_1min_grouped',
    'gender',
    'age',
]
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']
model4_add = [
    'cafe', 'tea',
    'cafe_wake_1', 'cafe_wake_2', 'cafe_wake_3', 'cafe_wake_4',
    'employ_2.0', 'employ_3.0', 'employ_4.0',
]
model5_add = ['cancer', 'diabetes_2.0', 'diabetes_3.0']
# NOTE(review): insomnia_sev enters alongside the insomnia_sev_grp_* columns,
# presumably encodings of the same score -- confirm this overlap is intended.
model6_add = [
    'ssleepd', 'epworth', 'whiirs', 'insomnia_sev',
    'insomnia_sev_grp_2.0', 'insomnia_sev_grp_3.0', 'insomnia_sev_grp_4.0',
    'persontype_1', 'persontype_2', 'persontype_3', 'persontype_4',
]

# Response column for this model.
col1 = 'SRISleep'

# Response and predictor matrix are both taken from df_scaled; the predictors
# are the five feature groups concatenated in order, plus a constant column.
y_col1 = df_scaled[col1]
X_col1 = sm.add_constant(
    df_scaled[
        [*features_model1, *model3_add, *model4_add, *model5_add, *model6_add]
    ]
)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1,
    y_col1,
    test_size=0.2,
    random_state=111,
)

# statsmodels OLS is used for its p-values. Argument order is response (endog)
# first, predictors (exog) second.
model6_col3_sm = sm.OLS(endog=y_train, exog=X_train).fit()
predictions = model6_col3_sm.predict(X_test)

# Show the regression summary table.
print(model6_col3_sm.summary())

                            OLS Regression Results                            
Dep. Variable:               SRISleep   R-squared:                       0.149
Model:                            OLS   Adj. R-squared:                  0.112
Method:                 Least Squares   F-statistic:                     4.054
Date:                Mon, 22 Jun 2020   Prob (F-statistic):           9.38e-13
Time:                        12:01:02   Log-Likelihood:                -1070.8
No. Observations:                 801   AIC:                             2210.
Df Residuals:                     767   BIC:                             2369.
Df Model:                          33                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     