In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels.api as sm
import numpy as np

In [None]:
# Load the NHANES 2015-2016 extract from the working directory.
da = pd.read_csv("nhanes_2015_2016.csv")

# Keep only the columns used in this notebook, then drop every row that has a
# missing value in any of them.  The list is deliberately not called `vars`:
# that name would shadow Python's built-in `vars()` for the rest of the session.
keep_cols = ["BPXSY1", "RIDAGEYR", "RIAGENDR", "RIDRETH1", "DMDEDUC2", "BMXBMI",
             "SMQ020", "SDMVSTRA", "SDMVPSU"]
da = da[keep_cols].dropna()

In [None]:
# Combine the SDMVSTRA and SDMVPSU codes into a single numeric label
# (10 * SDMVSTRA + SDMVPSU); the models below use it as the grouping variable.
da["group"] = da["SDMVSTRA"] * 10 + da["SDMVPSU"]

In [None]:
# Intercept-only GEE for BPXSY1 with an exchangeable working covariance
# structure within each "group"; print the fitted covariance-structure summary.
model = sm.GEE.from_formula(
    "BPXSY1 ~ 1",
    data=da,
    groups="group",
    cov_struct=sm.cov_struct.Exchangeable(),
)
result = model.fit()
print(result.cov_struct.summary())

In [None]:
# Map SMQ020 onto a binary indicator: code 2 becomes 0, codes 7 and 9 become
# missing, and every other value is left as it is.
da["smq"] = da["SMQ020"].replace({2: 0, 7: np.nan, 9: np.nan})

# Fit the same intercept-only exchangeable GEE to each variable in turn and
# print its covariance-structure summary next to the variable name.
for v in ("BPXSY1", "RIDAGEYR", "BMXBMI", "smq", "SDMVSTRA"):
    model = sm.GEE.from_formula(
        "%s ~ 1" % v,
        data=da,
        groups="group",
        cov_struct=sm.cov_struct.Exchangeable(),
    )
    result = model.fit()
    print(v, result.cov_struct.summary())

In [None]:
# Repeat the intercept-only exchangeable GEE on simulated standard-normal
# noise that is generated without any reference to the grouping variable.
# A locally seeded generator makes the ten printed summaries reproducible
# under "Restart & Run All" without touching NumPy's global random state.
noise_rng = np.random.RandomState(0)
for k in range(10):
    da["noise"] = noise_rng.normal(size=da.shape[0])
    model = sm.GEE.from_formula("noise ~ 1", groups="group",
               cov_struct=sm.cov_struct.Exchangeable(), data=da)
    result = model.fit()
    # Label the output with the variable actually modeled here.  The previous
    # version printed `v`, a loop variable left over from an earlier cell,
    # which mislabeled every line and raised NameError on a fresh kernel
    # where that cell had not been run.
    print("noise", result.cov_struct.summary())

In [None]:
# Same exchangeable GEE as before, now with RIDAGEYR as a covariate; print the
# covariance-structure summary of the fitted model.
model = sm.GEE.from_formula(
    "BPXSY1 ~ RIDAGEYR",
    data=da,
    groups="group",
    cov_struct=sm.cov_struct.Exchangeable(),
)
result = model.fit()
print(result.cov_struct.summary())

In [None]:
# Add a text-labeled copy of RIAGENDR (1 -> "Male", 2 -> "Female").
da["RIAGENDRx"] = da["RIAGENDR"].replace({1: "Male", 2: "Female"})

# Exchangeable GEE for BPXSY1 on age, labeled gender, BMI and categorical
# RIDRETH1; print the covariance-structure summary of the fit.
model = sm.GEE.from_formula(
    "BPXSY1 ~ RIDAGEYR + RIAGENDRx + BMXBMI + C(RIDRETH1)",
    data=da,
    groups="group",
    cov_struct=sm.cov_struct.Exchangeable(),
)
result = model.fit()
print(result.cov_struct.summary())

In [None]:
# Ordinary least squares fit; it makes no use of the "group" column.
model1 = sm.OLS.from_formula(
    "BPXSY1 ~ RIDAGEYR + RIAGENDRx + BMXBMI + C(RIDRETH1)",
    data=da,
)
result1 = model1.fit()

# Marginal linear model with the same mean structure, fitted by GEE with an
# exchangeable working covariance structure within each "group".
model2 = sm.GEE.from_formula(
    "BPXSY1 ~ RIDAGEYR + RIAGENDRx + BMXBMI + C(RIDRETH1)",
    data=da,
    groups="group",
    cov_struct=sm.cov_struct.Exchangeable(),
)
result2 = model2.fit()

# Side-by-side table of coefficients and standard errors from the two fits,
# with the columns in a fixed order.
x = pd.DataFrame(
    {
        "OLS_params": result1.params,
        "OLS_SE": result1.bse,
        "GEE_params": result2.params,
        "GEE_SE": result2.bse,
    },
    columns=["OLS_params", "OLS_SE", "GEE_params", "GEE_SE"],
)
print(x)

In [None]:
# Relabel the DMDEDUC2 levels; codes 7 and 9 are converted to missing.
da["DMDEDUC2x"] = da.DMDEDUC2.replace({1: "lt9", 2: "x9_11", 3: "HS", 4: "SomeCollege",
                                       5: "College", 7: np.nan, 9: np.nan})

# Fit a basic binomial GLM; it makes no use of the "group" column.
model1 = sm.GLM.from_formula("smq ~ RIDAGEYR + RIAGENDRx + C(DMDEDUC2x)",
           family=sm.families.Binomial(), data=da)
result1 = model1.fit()
# A bare `result1.summary()` in the middle of a cell is never displayed (only
# a cell's last expression is), so print it explicitly.
print(result1.summary())

# Fit a marginal GLM using GEE with an exchangeable working covariance
# structure, starting the iterations from the GLM estimates.
model2 = sm.GEE.from_formula("smq ~ RIDAGEYR + RIAGENDRx + C(DMDEDUC2x)",
           groups="group", family=sm.families.Binomial(),
           cov_struct=sm.cov_struct.Exchangeable(), data=da)
result2 = model2.fit(start_params=result1.params)

# Side-by-side coefficients and standard errors.  The first two columns come
# from the binomial GLM above, so they are labeled "GLM_*" (they were
# previously mislabeled "OLS_*").
x = pd.DataFrame({"GLM_params": result1.params, "GLM_SE": result1.bse,
                  "GEE_params": result2.params, "GEE_SE": result2.bse})
x = x[["GLM_params", "GLM_SE", "GEE_params", "GEE_SE"]]
print(x)

In [None]:
# Mixed-effects (multilevel) linear model for BPXSY1 with observations grouped
# by the "group" column; the summary is shown as the cell's output.
model = sm.MixedLM.from_formula(
    "BPXSY1 ~ RIDAGEYR + RIAGENDRx + BMXBMI + C(RIDRETH1)",
    data=da,
    groups="group",
)
result = model.fit()
result.summary()

In [None]:
# Display the random effects attribute of the mixed model fitted in the
# previous cell (`result`).
result.random_effects

In [None]:
# Center age within each group: subtract the group's mean RIDAGEYR from every
# observation in that group.
da["age_cen"] = da.groupby("group")["RIDAGEYR"].transform(
    lambda ages: ages - ages.mean()
)

# Mixed model using the centered age as a fixed effect and, through
# vc_formula, as a variance component specified by "0+age_cen".
model = sm.MixedLM.from_formula(
    "BPXSY1 ~ age_cen + RIAGENDRx + BMXBMI + C(RIDRETH1)",
    data=da,
    groups="group",
    vc_formula={"age_cen": "0+age_cen"},
)
result = model.fit()
result.summary()

In [None]:
# Same fixed effects as the previous model, but the random-effects structure
# is given through re_formula="1+age_cen" instead of vc_formula.
model = sm.MixedLM.from_formula(
    "BPXSY1 ~ age_cen + RIAGENDRx + BMXBMI + C(RIDRETH1)",
    data=da,
    groups="group",
    re_formula="1+age_cen",
)
result = model.fit()
result.summary()