In [1]:
#Package loading
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime, date, time, timedelta

import matplotlib.pyplot as plt
import matplotlib.pylab as pl
from matplotlib import rcParams
from matplotlib.dates import DateFormatter
from matplotlib import rc
from matplotlib.dates import date2num
from IPython import display
%matplotlib inline

from scipy import stats as sps
from scipy.interpolate import interp1d

from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score

import statsmodels.api as sm
from os import path
from collections import defaultdict
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Notebook-wide matplotlib defaults: large figures, large text, thicker axes.
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 18,
    'image.cmap': 'plasma',
    'axes.linewidth': 2,
})
plt.rc('font', family='serif')

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


## Preparing regressions

In [2]:
# Read the input table from the local ./Data/ folder
# (presumably the one-hot-encoded HCHS questionnaire + PA export -- confirm).
df = pd.read_csv(path.join(r"./Data/", "hchs_quest_pa_one_hot.csv"))


In [3]:
# Base table for the regression pipeline, this time with 'real' data extracted
# earlier in the notebook.
# Most of the PA columns are left out because they are highly correlated.

# Sleep outcome columns.
sleep_features = ['SleepEfficiency', 'Awakening', 'TotalSleepTime',
                  'TotalWakeTime', 'SRIDay', 'SRISleep']

# Continuous columns (sleep outcomes, PA bouts, age/BMI, questionnaire scores).
scaled_features = sleep_features + [
    'mvpa_bouts_1min_grouped', 'mvpa_bouts_10min_decomp', 'vpa_bouts_1min_grouped',
    'age', 'bmi', 'ssleepd', 'epworth', 'whiirs',
]

# Every column that can enter one of the models below.
df_nan = df[sleep_features + [
    'mvpa_bouts_1min_grouped', 'mvpa_bouts_10min_decomp', 'vpa_bouts_1min_grouped',
    'gender', 'age', 'bmi', 'cafe', 'tea',
    'cancer', 'ssleepd', 'epworth', 'whiirs', 'insomnia_sev', 'insomnia_sev_grp_2.0',
    'insomnia_sev_grp_3.0', 'insomnia_sev_grp_4.0', 'alcohol_2.0',
    'alcohol_3.0', 'cigar_2.0', 'cigar_3.0', 'cafe_wake_1', 'cafe_wake_2',
    'cafe_wake_3', 'cafe_wake_4', 'employ_2.0', 'employ_3.0', 'employ_4.0',
    'diabetes_2.0', 'diabetes_3.0', 'persontype_1', 'persontype_2',
    'persontype_3', 'persontype_4',
]]

# NOTE(review): scaled_features and sleep_features are not used by the scaling
# step below -- the scaler is fit on *every* column of df_nan, including gender
# and the one-hot columns. Confirm that standardizing those is intended.
scaler = StandardScaler()

# Standardize, rebuild a labelled frame, then keep only complete rows.
df_scaled = pd.DataFrame(scaler.fit_transform(df_nan), columns=df_nan.columns)
df_scaled = df_scaled.dropna()

# Models:

## Model 1: MVPA, age, sex 
## Model 2: Model 1 + interactions(MVPA * sex), Model 1 + interaction(MVPA * BMI)
## Model 3: Model 1 + other anthropometrics (BMI, alcohol, cigar)
## Model 4: Model 3 + diet/lifestyle (cafe, tea, cafe_wake, employ)
## Model 5: Model 4 + disease (cancer, diabetes)
## Model 6: Model 5 + sleep quest (ssleepd, Epworth, WHIIRS, insomnia, person_type)

Note: with a Bonferroni correction for 3 tests the significance threshold is $\alpha = 0.05/3 \approx 0.0166$. Results with $p < 0.0166$ are Bonferroni-corrected significant ( * ); results with $0.0166 < p < 0.05$ are only nominally significant ( ** ).

## 1. TST

In [5]:
# Model 1 (TST): MVPA 1-min bouts, sex, age
col1 = 'TotalSleepTime'
features_model1 = ['mvpa_bouts_1min_grouped', 'gender', 'age']

# Predictors with an explicit intercept column, and the response.
X_col1 = sm.add_constant(df_scaled[features_model1])
y_col1 = df_scaled[col1]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used (rather than sklearn) to get p-values.
# Argument order is sm.OLS(output, input).
model1_col1_sm = sm.OLS(y_train, X_train).fit()
predictions = model1_col1_sm.predict(X_test)  # held-out predictions; not evaluated in this cell

# Fit statistics
print(model1_col1_sm.summary())



                            OLS Regression Results                            
Dep. Variable:         TotalSleepTime   R-squared:                       0.098
Model:                            OLS   Adj. R-squared:                  0.094
Method:                 Least Squares   F-statistic:                     28.83
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           1.06e-17
Time:                        10:34:09   Log-Likelihood:                -1094.8
No. Observations:                 801   AIC:                             2198.
Df Residuals:                     797   BIC:                             2216.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [6]:
# Model 1 (TST): MVPA 10-min bouts, sex, age
# These bouts are likely already contained in the 1-min MVPA measure, so
# although significant they are kept out of the later models.
col1 = 'TotalSleepTime'
features_model1 = ['mvpa_bouts_10min_decomp', 'gender', 'age']

# Predictors with an explicit intercept column, and the response.
X_col1 = sm.add_constant(df_scaled[features_model1])
y_col1 = df_scaled[col1]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used (rather than sklearn) to get p-values.
# Argument order is sm.OLS(output, input).
model1_col1_sm = sm.OLS(y_train, X_train).fit()
predictions = model1_col1_sm.predict(X_test)  # held-out predictions; not evaluated in this cell

# Fit statistics
print(model1_col1_sm.summary())

                            OLS Regression Results                            
Dep. Variable:         TotalSleepTime   R-squared:                       0.018
Model:                            OLS   Adj. R-squared:                  0.014
Method:                 Least Squares   F-statistic:                     4.763
Date:                Tue, 23 Jun 2020   Prob (F-statistic):            0.00268
Time:                        10:34:48   Log-Likelihood:                -1129.0
No. Observations:                 801   AIC:                             2266.
Df Residuals:                     797   BIC:                             2285.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [7]:
# Model 1 (TST): VPA 1-min bouts, sex, age  -- not significant
col1 = 'TotalSleepTime'
features_model1 = ['vpa_bouts_1min_grouped', 'gender', 'age']

# Predictors with an explicit intercept column, and the response.
X_col1 = sm.add_constant(df_scaled[features_model1])
y_col1 = df_scaled[col1]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used (rather than sklearn) to get p-values.
# Argument order is sm.OLS(output, input).
model1_col1_sm = sm.OLS(y_train, X_train).fit()
predictions = model1_col1_sm.predict(X_test)  # held-out predictions; not evaluated in this cell

# Fit statistics
print(model1_col1_sm.summary())

                            OLS Regression Results                            
Dep. Variable:         TotalSleepTime   R-squared:                       0.010
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     2.727
Date:                Tue, 23 Jun 2020   Prob (F-statistic):             0.0431
Time:                        10:36:55   Log-Likelihood:                -1132.0
No. Observations:                 801   AIC:                             2272.
Df Residuals:                     797   BIC:                             2291.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                      0

In [9]:
# Model 2 (TST): MVPA, sex, age + MVPA x sex interaction  -- interaction not significant
features_model1 = ['mvpa_bouts_1min_grouped', 'gender', 'age']
interactions = ['mvpa*gender']

col1 = 'TotalSleepTime'

# Work on a copy. A plain `df_int = df_scaled` only aliases the frame, so adding
# the interaction column would also add it to df_scaled for every later cell.
df_int = df_scaled.copy()

# Both factors come from df_scaled (already standardized), so the interaction
# term is a product of the two scaled columns.
df_int['mvpa*gender'] = df_int['mvpa_bouts_1min_grouped'] * df_int['gender']

# Predictors (main effects + interaction) with an intercept column, and the response.
X_col1 = sm.add_constant(df_int[features_model1 + interactions])
y_col1 = df_int[col1]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used (rather than sklearn) to get p-values.
# Argument order is sm.OLS(output, input).
model2_col1_sm = sm.OLS(y_train, X_train).fit()
predictions = model2_col1_sm.predict(X_test)  # held-out predictions; not evaluated in this cell

# Fit statistics
print(model2_col1_sm.summary())

                            OLS Regression Results                            
Dep. Variable:         TotalSleepTime   R-squared:                       0.098
Model:                            OLS   Adj. R-squared:                  0.094
Method:                 Least Squares   F-statistic:                     21.66
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           5.58e-17
Time:                        10:39:09   Log-Likelihood:                -1094.7
No. Observations:                 801   AIC:                             2199.
Df Residuals:                     796   BIC:                             2223.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [13]:
# Model 2 (TST): MVPA, sex, age, BMI + MVPA x BMI interaction  -- interaction not significant
features_model1 = ['mvpa_bouts_1min_grouped', 'gender', 'age', 'bmi']
interactions = ['mvpa*BMI']

col1 = 'TotalSleepTime'

# Work on a copy. A plain `df_int = df_scaled` only aliases the frame, so adding
# the interaction column would also add it to df_scaled for every later cell.
df_int = df_scaled.copy()

# Both factors come from df_scaled (already standardized), so the interaction
# term is a product of the two scaled columns.
df_int['mvpa*BMI'] = df_int['mvpa_bouts_1min_grouped'] * df_int['bmi']

# Predictors (main effects + interaction) with an intercept column, and the response.
X_col1 = sm.add_constant(df_int[features_model1 + interactions])
y_col1 = df_int[col1]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used (rather than sklearn) to get p-values.
# Argument order is sm.OLS(output, input).
model2_col1_sm = sm.OLS(y_train, X_train).fit()
predictions = model2_col1_sm.predict(X_test)  # held-out predictions; not evaluated in this cell

# Fit statistics
print(model2_col1_sm.summary())

                            OLS Regression Results                            
Dep. Variable:         TotalSleepTime   R-squared:                       0.109
Model:                            OLS   Adj. R-squared:                  0.104
Method:                 Least Squares   F-statistic:                     19.54
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           2.21e-18
Time:                        11:00:34   Log-Likelihood:                -1089.7
No. Observations:                 801   AIC:                             2191.
Df Residuals:                     795   BIC:                             2219.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [14]:
# Model 3 (TST): MVPA, sex, age + other anthropometrics
col1 = 'TotalSleepTime'

features_model1 = ['mvpa_bouts_1min_grouped', 'gender', 'age']
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']

# Predictors with an explicit intercept column, and the response.
X_col1 = sm.add_constant(df_scaled[features_model1 + model3_add])
y_col1 = df_scaled[col1]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used (rather than sklearn) to get p-values.
# Argument order is sm.OLS(output, input).
model3_col1_sm = sm.OLS(y_train, X_train).fit()
predictions = model3_col1_sm.predict(X_test)  # held-out predictions; not evaluated in this cell

# Fit statistics
print(model3_col1_sm.summary())

                            OLS Regression Results                            
Dep. Variable:         TotalSleepTime   R-squared:                       0.117
Model:                            OLS   Adj. R-squared:                  0.108
Method:                 Least Squares   F-statistic:                     13.09
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           7.86e-18
Time:                        11:01:39   Log-Likelihood:                -1086.4
No. Observations:                 801   AIC:                             2191.
Df Residuals:                     792   BIC:                             2233.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [15]:
# Model 4 (TST): MVPA, sex, age + other anthropometrics + diet/lifestyle
col1 = 'TotalSleepTime'

features_model1 = ['mvpa_bouts_1min_grouped', 'gender', 'age']
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']
model4_add = ['cafe', 'tea',
              'cafe_wake_1', 'cafe_wake_2', 'cafe_wake_3', 'cafe_wake_4',
              'employ_2.0', 'employ_3.0', 'employ_4.0']

# Predictors with an explicit intercept column, and the response.
X_col1 = sm.add_constant(df_scaled[features_model1 + model3_add + model4_add])
y_col1 = df_scaled[col1]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used (rather than sklearn) to get p-values.
# Argument order is sm.OLS(output, input).
model4_col1_sm = sm.OLS(y_train, X_train).fit()
predictions = model4_col1_sm.predict(X_test)  # held-out predictions; not evaluated in this cell

# Fit statistics
print(model4_col1_sm.summary())

                            OLS Regression Results                            
Dep. Variable:         TotalSleepTime   R-squared:                       0.140
Model:                            OLS   Adj. R-squared:                  0.122
Method:                 Least Squares   F-statistic:                     7.514
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           1.96e-17
Time:                        11:01:59   Log-Likelihood:                -1075.6
No. Observations:                 801   AIC:                             2187.
Df Residuals:                     783   BIC:                             2271.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [16]:
# Model 5 (TST): MVPA, sex, age + other anthropometrics + diet/lifestyle + disease
col1 = 'TotalSleepTime'

features_model1 = ['mvpa_bouts_1min_grouped', 'gender', 'age']
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']
model4_add = ['cafe', 'tea',
              'cafe_wake_1', 'cafe_wake_2', 'cafe_wake_3', 'cafe_wake_4',
              'employ_2.0', 'employ_3.0', 'employ_4.0']
model5_add = ['cancer', 'diabetes_2.0', 'diabetes_3.0']

# Predictors with an explicit intercept column, and the response.
X_col1 = sm.add_constant(
    df_scaled[features_model1 + model3_add + model4_add + model5_add]
)
y_col1 = df_scaled[col1]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used (rather than sklearn) to get p-values.
# Argument order is sm.OLS(output, input).
model5_col1_sm = sm.OLS(y_train, X_train).fit()
predictions = model5_col1_sm.predict(X_test)  # held-out predictions; not evaluated in this cell

# Fit statistics
print(model5_col1_sm.summary())

                            OLS Regression Results                            
Dep. Variable:         TotalSleepTime   R-squared:                       0.148
Model:                            OLS   Adj. R-squared:                  0.126
Method:                 Least Squares   F-statistic:                     6.770
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           1.95e-17
Time:                        11:02:42   Log-Likelihood:                -1072.0
No. Observations:                 801   AIC:                             2186.
Df Residuals:                     780   BIC:                             2284.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [17]:
# Model 6 (TST): MVPA, sex, age + other anthropometrics + diet/lifestyle
#                + disease + sleep questionnaires
col1 = 'TotalSleepTime'

features_model1 = ['mvpa_bouts_1min_grouped', 'gender', 'age']
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']
model4_add = ['cafe', 'tea',
              'cafe_wake_1', 'cafe_wake_2', 'cafe_wake_3', 'cafe_wake_4',
              'employ_2.0', 'employ_3.0', 'employ_4.0']
model5_add = ['cancer', 'diabetes_2.0', 'diabetes_3.0']
# NOTE(review): all four persontype_* columns enter together with an intercept,
# and insomnia_sev enters alongside its own group dummies -- confirm a reference
# level was dropped upstream so the design matrix is not collinear.
model6_add = ['ssleepd', 'epworth', 'whiirs', 'insomnia_sev',
              'insomnia_sev_grp_2.0', 'insomnia_sev_grp_3.0', 'insomnia_sev_grp_4.0',
              'persontype_1', 'persontype_2', 'persontype_3', 'persontype_4']

# Predictors with an explicit intercept column, and the response.
X_col1 = sm.add_constant(
    df_scaled[features_model1 + model3_add + model4_add + model5_add + model6_add]
)
y_col1 = df_scaled[col1]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used (rather than sklearn) to get p-values.
# Argument order is sm.OLS(output, input).
model6_col1_sm = sm.OLS(y_train, X_train).fit()
predictions = model6_col1_sm.predict(X_test)  # held-out predictions; not evaluated in this cell

# Fit statistics
print(model6_col1_sm.summary())

                            OLS Regression Results                            
Dep. Variable:         TotalSleepTime   R-squared:                       0.250
Model:                            OLS   Adj. R-squared:                  0.220
Method:                 Least Squares   F-statistic:                     8.262
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           2.47e-31
Time:                        11:03:05   Log-Likelihood:                -1020.9
No. Observations:                 801   AIC:                             2106.
Df Residuals:                     769   BIC:                             2256.
Df Model:                          31                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

## Sleep Efficiency

In [18]:
# Model 1 (Sleep Efficiency): MVPA 1-min bouts, sex, age
col1 = 'SleepEfficiency'
features_model1 = ['mvpa_bouts_1min_grouped', 'gender', 'age']

# Predictors with an explicit intercept column, and the response.
X_col1 = sm.add_constant(df_scaled[features_model1])
y_col1 = df_scaled[col1]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used (rather than sklearn) to get p-values.
# Argument order is sm.OLS(output, input).
model1_col2_sm = sm.OLS(y_train, X_train).fit()
predictions = model1_col2_sm.predict(X_test)  # held-out predictions; not evaluated in this cell

# Fit statistics
print(model1_col2_sm.summary())

                            OLS Regression Results                            
Dep. Variable:        SleepEfficiency   R-squared:                       0.059
Model:                            OLS   Adj. R-squared:                  0.055
Method:                 Least Squares   F-statistic:                     16.62
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           1.77e-10
Time:                        11:03:22   Log-Likelihood:                -1138.9
No. Observations:                 801   AIC:                             2286.
Df Residuals:                     797   BIC:                             2305.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [24]:
# Model 1 (Sleep Efficiency): MVPA 10-min bouts, sex, age  -- not significant
col1 = 'SleepEfficiency'
features_model1 = ['mvpa_bouts_10min_decomp', 'gender', 'age']

# Predictors with an explicit intercept column, and the response.
X_col1 = sm.add_constant(df_scaled[features_model1])
y_col1 = df_scaled[col1]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used (rather than sklearn) to get p-values.
# Argument order is sm.OLS(output, input).
model1_col2_sm = sm.OLS(y_train, X_train).fit()
predictions = model1_col2_sm.predict(X_test)  # held-out predictions; not evaluated in this cell

# Fit statistics
print(model1_col2_sm.summary())

                            OLS Regression Results                            
Dep. Variable:        SleepEfficiency   R-squared:                       0.027
Model:                            OLS   Adj. R-squared:                  0.023
Method:                 Least Squares   F-statistic:                     7.359
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           7.21e-05
Time:                        11:12:07   Log-Likelihood:                -1152.3
No. Observations:                 801   AIC:                             2313.
Df Residuals:                     797   BIC:                             2331.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [19]:
# Model 2 (Sleep Efficiency): MVPA, sex, age + MVPA x sex interaction  -- not significant
features_model1 = ['mvpa_bouts_1min_grouped', 'gender', 'age']
interactions = ['mvpa*gender']

col1 = 'SleepEfficiency'

# Work on a copy. A plain `df_int = df_scaled` only aliases the frame, so adding
# the interaction column would also add it to df_scaled for every later cell.
df_int = df_scaled.copy()

# Both factors come from df_scaled (already standardized), so the interaction
# term is a product of the two scaled columns.
df_int['mvpa*gender'] = df_int['mvpa_bouts_1min_grouped'] * df_int['gender']

# Predictors (main effects + interaction) with an intercept column, and the response.
X_col1 = sm.add_constant(df_int[features_model1 + interactions])
y_col1 = df_int[col1]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used (rather than sklearn) to get p-values.
# Argument order is sm.OLS(output, input).
model2_col2_sm = sm.OLS(y_train, X_train).fit()
predictions = model2_col2_sm.predict(X_test)  # held-out predictions; not evaluated in this cell

# Fit statistics
print(model2_col2_sm.summary())

                            OLS Regression Results                            
Dep. Variable:        SleepEfficiency   R-squared:                       0.061
Model:                            OLS   Adj. R-squared:                  0.056
Method:                 Least Squares   F-statistic:                     12.84
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           3.92e-10
Time:                        11:04:28   Log-Likelihood:                -1138.2
No. Observations:                 801   AIC:                             2286.
Df Residuals:                     796   BIC:                             2310.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [20]:
# Model 2 (Sleep Efficiency): MVPA, sex, age, BMI + MVPA x BMI interaction  -- not significant
features_model1 = ['mvpa_bouts_1min_grouped', 'gender', 'age', 'bmi']
interactions = ['mvpa*BMI']

col1 = 'SleepEfficiency'

# Work on a copy. A plain `df_int = df_scaled` only aliases the frame, so adding
# the interaction column would also add it to df_scaled for every later cell.
df_int = df_scaled.copy()

# Only the MVPA x BMI term is built here: the 'mvpa*gender' column the cell used
# to recompute is not among this model's predictors.
# Both factors come from df_scaled (already standardized), so the interaction
# term is a product of the two scaled columns.
df_int['mvpa*BMI'] = df_int['mvpa_bouts_1min_grouped'] * df_int['bmi']

# Predictors (main effects + interaction) with an intercept column, and the response.
X_col1 = sm.add_constant(df_int[features_model1 + interactions])
y_col1 = df_int[col1]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used (rather than sklearn) to get p-values.
# Argument order is sm.OLS(output, input).
model2_col2_sm = sm.OLS(y_train, X_train).fit()
predictions = model2_col2_sm.predict(X_test)  # held-out predictions; not evaluated in this cell

# Fit statistics
print(model2_col2_sm.summary())

                            OLS Regression Results                            
Dep. Variable:        SleepEfficiency   R-squared:                       0.065
Model:                            OLS   Adj. R-squared:                  0.059
Method:                 Least Squares   F-statistic:                     11.03
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           2.77e-10
Time:                        11:05:34   Log-Likelihood:                -1136.4
No. Observations:                 801   AIC:                             2285.
Df Residuals:                     795   BIC:                             2313.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [21]:
# Model 3 (Sleep Efficiency): MVPA, sex, age + other anthropometrics
col1 = 'SleepEfficiency'

features_model1 = ['mvpa_bouts_1min_grouped', 'gender', 'age']
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']

# Predictors with an explicit intercept column, and the response.
X_col1 = sm.add_constant(df_scaled[features_model1 + model3_add])
y_col1 = df_scaled[col1]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used (rather than sklearn) to get p-values.
# Argument order is sm.OLS(output, input).
model3_col2_sm = sm.OLS(y_train, X_train).fit()
predictions = model3_col2_sm.predict(X_test)  # held-out predictions; not evaluated in this cell

# Fit statistics
print(model3_col2_sm.summary())

                            OLS Regression Results                            
Dep. Variable:        SleepEfficiency   R-squared:                       0.087
Model:                            OLS   Adj. R-squared:                  0.077
Method:                 Least Squares   F-statistic:                     9.383
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           1.97e-12
Time:                        11:10:49   Log-Likelihood:                -1127.0
No. Observations:                 801   AIC:                             2272.
Df Residuals:                     792   BIC:                             2314.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [25]:
# Model 4 (Sleep Efficiency): MVPA, sex, age + other anthropometrics + diet/lifestyle
col1 = 'SleepEfficiency'

features_model1 = ['mvpa_bouts_1min_grouped', 'gender', 'age']
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']
model4_add = ['cafe', 'tea',
              'cafe_wake_1', 'cafe_wake_2', 'cafe_wake_3', 'cafe_wake_4',
              'employ_2.0', 'employ_3.0', 'employ_4.0']

# Predictors with an explicit intercept column, and the response.
X_col1 = sm.add_constant(df_scaled[features_model1 + model3_add + model4_add])
y_col1 = df_scaled[col1]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used (rather than sklearn) to get p-values.
# Argument order is sm.OLS(output, input).
model4_col2_sm = sm.OLS(y_train, X_train).fit()
predictions = model4_col2_sm.predict(X_test)  # held-out predictions; not evaluated in this cell

# Fit statistics
print(model4_col2_sm.summary())

                            OLS Regression Results                            
Dep. Variable:        SleepEfficiency   R-squared:                       0.117
Model:                            OLS   Adj. R-squared:                  0.098
Method:                 Least Squares   F-statistic:                     6.132
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           1.49e-13
Time:                        11:13:02   Log-Likelihood:                -1113.2
No. Observations:                 801   AIC:                             2262.
Df Residuals:                     783   BIC:                             2347.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [27]:
# Model 5 (Sleep Efficiency): MVPA, sex, age + other anthropometrics
#                             + diet/lifestyle + disease
col1 = 'SleepEfficiency'

features_model1 = ['mvpa_bouts_1min_grouped', 'gender', 'age']
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']
model4_add = ['cafe', 'tea',
              'cafe_wake_1', 'cafe_wake_2', 'cafe_wake_3', 'cafe_wake_4',
              'employ_2.0', 'employ_3.0', 'employ_4.0']
model5_add = ['cancer', 'diabetes_2.0', 'diabetes_3.0']

# Predictors with an explicit intercept column, and the response.
X_col1 = sm.add_constant(
    df_scaled[features_model1 + model3_add + model4_add + model5_add]
)
y_col1 = df_scaled[col1]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used (rather than sklearn) to get p-values.
# Argument order is sm.OLS(output, input).
model5_col2_sm = sm.OLS(y_train, X_train).fit()
predictions = model5_col2_sm.predict(X_test)  # held-out predictions; not evaluated in this cell

# Fit statistics
print(model5_col2_sm.summary())

                            OLS Regression Results                            
Dep. Variable:        SleepEfficiency   R-squared:                       0.121
Model:                            OLS   Adj. R-squared:                  0.098
Method:                 Least Squares   F-statistic:                     5.348
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           7.16e-13
Time:                        11:13:36   Log-Likelihood:                -1111.8
No. Observations:                 801   AIC:                             2266.
Df Residuals:                     780   BIC:                             2364.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [28]:
# Model 6: MVPA + sex + age, plus anthropometric, diet/lifestyle, disease
# and sleep-questionnaire covariates.
features_model1 = ['mvpa_bouts_1min_grouped', 'gender', 'age']
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']
model4_add = [
    'cafe', 'tea',
    'cafe_wake_1', 'cafe_wake_2', 'cafe_wake_3', 'cafe_wake_4',
    'employ_2.0', 'employ_3.0', 'employ_4.0',
]
model5_add = ['cancer', 'diabetes_2.0', 'diabetes_3.0']
model6_add = [
    'ssleepd', 'epworth', 'whiirs',
    'insomnia_sev', 'insomnia_sev_grp_2.0', 'insomnia_sev_grp_3.0', 'insomnia_sev_grp_4.0',
    'persontype_1', 'persontype_2', 'persontype_3', 'persontype_4',
]

col1 = 'SleepEfficiency'

# Predictors (with a constant column added for the intercept) and the response.
X_col1 = sm.add_constant(
    df_scaled[features_model1 + model3_add + model4_add + model5_add + model6_add]
)
y_col1 = df_scaled[col1]

# 80/20 split with a fixed seed so the fit is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used to obtain p-values.
# Mind the argument order: the response comes first, then the predictors.
model6_col2_sm = sm.OLS(endog=y_train, exog=X_train).fit()
predictions = model6_col2_sm.predict(X_test)

# Text summary of the fitted model.
print(model6_col2_sm.summary())

                            OLS Regression Results                            
Dep. Variable:        SleepEfficiency   R-squared:                       0.161
Model:                            OLS   Adj. R-squared:                  0.128
Method:                 Least Squares   F-statistic:                     4.774
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           1.99e-15
Time:                        11:13:48   Log-Likelihood:                -1092.7
No. Observations:                 801   AIC:                             2249.
Df Residuals:                     769   BIC:                             2399.
Df Model:                          31                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

## 3. Sleep Regularity Index (SRI)

In [29]:
# Model 1: MVPA + sex + age.
# Finding: nominally significant, not Bonferroni significant.
features_model1 = ['mvpa_bouts_1min_grouped', 'gender', 'age']

col1 = 'SRISleep'

# Predictors (with a constant column added for the intercept) and the response.
X_col1 = sm.add_constant(df_scaled[features_model1])
y_col1 = df_scaled[col1]

# 80/20 split with a fixed seed so the fit is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used to obtain p-values.
# Mind the argument order: the response comes first, then the predictors.
model1_col3_sm = sm.OLS(endog=y_train, exog=X_train).fit()
predictions = model1_col3_sm.predict(X_test)

# Text summary of the fitted model.
print(model1_col3_sm.summary())

                            OLS Regression Results                            
Dep. Variable:               SRISleep   R-squared:                       0.046
Model:                            OLS   Adj. R-squared:                  0.043
Method:                 Least Squares   F-statistic:                     12.91
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           3.05e-08
Time:                        11:13:58   Log-Likelihood:                -1116.2
No. Observations:                 801   AIC:                             2240.
Df Residuals:                     797   BIC:                             2259.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [30]:
# Model 1 variant: MVPA taken from 'mvpa_bouts_10min_decomp' + sex + age.
# INTERESTING CASE
# NOTE(review): this rebinds `model1_col3_sm`, replacing the model fitted on
# 'mvpa_bouts_1min_grouped' in the preceding cell -- confirm that is intended.
features_model1 = ['mvpa_bouts_10min_decomp', 'gender', 'age']

col1 = 'SRISleep'

# Predictors (with a constant column added for the intercept) and the response.
X_col1 = sm.add_constant(df_scaled[features_model1])
y_col1 = df_scaled[col1]

# 80/20 split with a fixed seed so the fit is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used to obtain p-values.
# Mind the argument order: the response comes first, then the predictors.
model1_col3_sm = sm.OLS(endog=y_train, exog=X_train).fit()
predictions = model1_col3_sm.predict(X_test)

# Text summary of the fitted model.
print(model1_col3_sm.summary())

                            OLS Regression Results                            
Dep. Variable:               SRISleep   R-squared:                       0.040
Model:                            OLS   Adj. R-squared:                  0.037
Method:                 Least Squares   F-statistic:                     11.17
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           3.49e-07
Time:                        11:14:39   Log-Likelihood:                -1118.7
No. Observations:                 801   AIC:                             2245.
Df Residuals:                     797   BIC:                             2264.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [32]:
# Model 2: MVPA + sex + age + MVPA x gender interaction.
# Finding: not significant.
features_model1 = ['mvpa_bouts_1min_grouped', 'gender', 'age']
interactions = ['mvpa*gender']

col1 = 'SRISleep'

# Work on a copy: a plain `df_int = df_scaled` only aliases the shared frame,
# so adding the interaction column would silently modify `df_scaled` for every
# other cell and make the notebook depend on execution order.
df_int = df_scaled.copy()
df_int['mvpa*gender'] = df_int['mvpa_bouts_1min_grouped'] * df_int['gender']

# Predictors (with a constant column added for the intercept) and the response.
X_col1 = sm.add_constant(df_int[features_model1 + interactions])
y_col1 = df_int[col1]

# 80/20 split with a fixed seed so the fit is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used to obtain p-values.
# Mind the argument order: the response comes first, then the predictors.
model2_col3_sm = sm.OLS(endog=y_train, exog=X_train).fit()
predictions = model2_col3_sm.predict(X_test)

# Text summary of the fitted model.
print(model2_col3_sm.summary())

                            OLS Regression Results                            
Dep. Variable:               SRISleep   R-squared:                       0.046
Model:                            OLS   Adj. R-squared:                  0.042
Method:                 Least Squares   F-statistic:                     9.671
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           1.22e-07
Time:                        11:24:28   Log-Likelihood:                -1116.2
No. Observations:                 801   AIC:                             2242.
Df Residuals:                     796   BIC:                             2266.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [33]:
# Model 2 variant: MVPA + sex + age + BMI + MVPA x BMI interaction.
# Finding: the main effect isn't significant but the interaction is (look this up).
# TODO: break into BMI tertiles and test within each tertile.
# TODO: check whether the association differs -> run age, sex, mvpa -> sleep
#       regularity index separately within each BMI tertile.
# NOTE(review): this rebinds `model2_col3_sm`, replacing the MVPA x gender
# model fitted in the preceding cell -- confirm that is intended.
features_model1 = ['mvpa_bouts_1min_grouped', 'gender', 'age', 'bmi']
interactions = ['mvpa*BMI']

col1 = 'SRISleep'

# Work on a copy: a plain `df_int = df_scaled` only aliases the shared frame,
# so adding the interaction column would silently modify `df_scaled` for every
# other cell and make the notebook depend on execution order.
df_int = df_scaled.copy()
df_int['mvpa*BMI'] = df_int['mvpa_bouts_1min_grouped'] * df_int['bmi']

# Predictors (with a constant column added for the intercept) and the response.
X_col1 = sm.add_constant(df_int[features_model1 + interactions])
y_col1 = df_int[col1]

# 80/20 split with a fixed seed so the fit is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used to obtain p-values.
# Mind the argument order: the response comes first, then the predictors.
model2_col3_sm = sm.OLS(endog=y_train, exog=X_train).fit()
predictions = model2_col3_sm.predict(X_test)

# Text summary of the fitted model.
print(model2_col3_sm.summary())

                            OLS Regression Results                            
Dep. Variable:               SRISleep   R-squared:                       0.059
Model:                            OLS   Adj. R-squared:                  0.053
Method:                 Least Squares   F-statistic:                     9.925
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           3.19e-09
Time:                        11:24:50   Log-Likelihood:                -1110.9
No. Observations:                 801   AIC:                             2234.
Df Residuals:                     795   BIC:                             2262.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [34]:
# Model 3: MVPA + sex + age, plus other anthropometric covariates.
features_model1 = ['mvpa_bouts_1min_grouped', 'gender', 'age']
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']

col1 = 'SRISleep'

# Predictors (with a constant column added for the intercept) and the response.
X_col1 = sm.add_constant(df_scaled[features_model1 + model3_add])
y_col1 = df_scaled[col1]

# 80/20 split with a fixed seed so the fit is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used to obtain p-values.
# Mind the argument order: the response comes first, then the predictors.
model3_col3_sm = sm.OLS(endog=y_train, exog=X_train).fit()
predictions = model3_col3_sm.predict(X_test)

# Text summary of the fitted model.
print(model3_col3_sm.summary())

                            OLS Regression Results                            
Dep. Variable:               SRISleep   R-squared:                       0.070
Model:                            OLS   Adj. R-squared:                  0.061
Method:                 Least Squares   F-statistic:                     7.499
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           1.13e-09
Time:                        11:28:05   Log-Likelihood:                -1105.9
No. Observations:                 801   AIC:                             2230.
Df Residuals:                     792   BIC:                             2272.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [35]:
# Model 4: MVPA + sex + age, plus anthropometric and diet/lifestyle covariates.
features_model1 = ['mvpa_bouts_1min_grouped', 'gender', 'age']
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']
model4_add = [
    'cafe', 'tea',
    'cafe_wake_1', 'cafe_wake_2', 'cafe_wake_3', 'cafe_wake_4',
    'employ_2.0', 'employ_3.0', 'employ_4.0',
]

col1 = 'SRISleep'

# Predictors (with a constant column added for the intercept) and the response.
X_col1 = sm.add_constant(df_scaled[features_model1 + model3_add + model4_add])
y_col1 = df_scaled[col1]

# 80/20 split with a fixed seed so the fit is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used to obtain p-values.
# Mind the argument order: the response comes first, then the predictors.
model4_col3_sm = sm.OLS(endog=y_train, exog=X_train).fit()
predictions = model4_col3_sm.predict(X_test)

# Text summary of the fitted model.
print(model4_col3_sm.summary())

                            OLS Regression Results                            
Dep. Variable:               SRISleep   R-squared:                       0.084
Model:                            OLS   Adj. R-squared:                  0.064
Method:                 Least Squares   F-statistic:                     4.208
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           3.07e-08
Time:                        11:28:23   Log-Likelihood:                -1100.2
No. Observations:                 801   AIC:                             2236.
Df Residuals:                     783   BIC:                             2321.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [36]:
# Model 5: MVPA + sex + age, plus anthropometric, diet/lifestyle and disease covariates.
features_model1 = ['mvpa_bouts_1min_grouped', 'gender', 'age']
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']
model4_add = [
    'cafe', 'tea',
    'cafe_wake_1', 'cafe_wake_2', 'cafe_wake_3', 'cafe_wake_4',
    'employ_2.0', 'employ_3.0', 'employ_4.0',
]
model5_add = ['cancer', 'diabetes_2.0', 'diabetes_3.0']

col1 = 'SRISleep'

# Predictors (with a constant column added for the intercept) and the response.
X_col1 = sm.add_constant(
    df_scaled[features_model1 + model3_add + model4_add + model5_add]
)
y_col1 = df_scaled[col1]

# 80/20 split with a fixed seed so the fit is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used to obtain p-values.
# Mind the argument order: the response comes first, then the predictors.
model5_col3_sm = sm.OLS(endog=y_train, exog=X_train).fit()
predictions = model5_col3_sm.predict(X_test)

# Text summary of the fitted model.
print(model5_col3_sm.summary())

                            OLS Regression Results                            
Dep. Variable:               SRISleep   R-squared:                       0.088
Model:                            OLS   Adj. R-squared:                  0.065
Method:                 Least Squares   F-statistic:                     3.766
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           6.52e-08
Time:                        11:28:41   Log-Likelihood:                -1098.3
No. Observations:                 801   AIC:                             2239.
Df Residuals:                     780   BIC:                             2337.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [37]:
# Model 6: MVPA + sex + age, plus anthropometric, diet/lifestyle, disease
# and sleep-questionnaire covariates.
features_model1 = ['mvpa_bouts_1min_grouped', 'gender', 'age']
model3_add = ['bmi', 'alcohol_2.0', 'alcohol_3.0', 'cigar_2.0', 'cigar_3.0']
model4_add = [
    'cafe', 'tea',
    'cafe_wake_1', 'cafe_wake_2', 'cafe_wake_3', 'cafe_wake_4',
    'employ_2.0', 'employ_3.0', 'employ_4.0',
]
model5_add = ['cancer', 'diabetes_2.0', 'diabetes_3.0']
model6_add = [
    'ssleepd', 'epworth', 'whiirs',
    'insomnia_sev', 'insomnia_sev_grp_2.0', 'insomnia_sev_grp_3.0', 'insomnia_sev_grp_4.0',
    'persontype_1', 'persontype_2', 'persontype_3', 'persontype_4',
]

col1 = 'SRISleep'

# Predictors (with a constant column added for the intercept) and the response.
X_col1 = sm.add_constant(
    df_scaled[features_model1 + model3_add + model4_add + model5_add + model6_add]
)
y_col1 = df_scaled[col1]

# 80/20 split with a fixed seed so the fit is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_col1, y_col1, test_size=0.2, random_state=111
)

# statsmodels is used to obtain p-values.
# Mind the argument order: the response comes first, then the predictors.
model6_col3_sm = sm.OLS(endog=y_train, exog=X_train).fit()
predictions = model6_col3_sm.predict(X_test)

# Text summary of the fitted model.
print(model6_col3_sm.summary())

                            OLS Regression Results                            
Dep. Variable:               SRISleep   R-squared:                       0.148
Model:                            OLS   Adj. R-squared:                  0.113
Method:                 Least Squares   F-statistic:                     4.293
Date:                Tue, 23 Jun 2020   Prob (F-statistic):           3.04e-13
Time:                        11:29:01   Log-Likelihood:                -1071.3
No. Observations:                 801   AIC:                             2207.
Df Residuals:                     769   BIC:                             2356.
Df Model:                          31                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     