In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import spearmanr, mannwhitneyu
import statsmodels.api as sm

from IPython.display import display

%matplotlib inline

plt.style.use('ggplot')

# Data simulation

### Causal model:

* sugar intake &rarr; BMI
* smoking &rarr; lung cancer
* BMI, alcohol intake, lung cancer &rarr; death
* sex, age &rarr; everything else

<img src = "https://drive.google.com/uc?id=1eJhUFTLO5DaWvktB-yEP7tr47p6N8RHg" alt = "Causal diagram" width = 500px/>

In [None]:
# Number of simulated individuals (the simulated DataFrame gets one row per individual).
N = 100000

def translate_to_range(values, new_min_value, new_max_value):
    """Linearly rescale `values` so they span [new_min_value, new_max_value].

    The smallest input is mapped to `new_min_value`, the largest to
    `new_max_value`, and everything in between is interpolated linearly.

    Parameters
    ----------
    values : pandas.Series or numpy.ndarray
        Numeric values to rescale (anything exposing `.min()` / `.max()` and
        element-wise arithmetic).
    new_min_value, new_max_value : float
        Bounds of the target range.

    Returns
    -------
    Same container type as `values`, holding the rescaled floats. If all
    inputs are equal there is no spread to stretch, so every element is
    mapped to `new_min_value` (instead of the NaN that 0 / 0 would produce).
    """
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # Constant input: dividing by the zero span would yield NaN everywhere,
        # which in turn breaks later integer casts. Use a unit span instead so
        # that every element lands on the lower bound.
        span = 1
    values = (values - lowest) / span
    return new_min_value + (new_max_value - new_min_value) * values
    
def top_quantile_to_one(values, top_quantile):
    """Binarise a Series: 1 for entries in the top `top_quantile` fraction, else 0.

    The cutoff is the (1 - top_quantile) quantile of `values`; entries equal to
    the cutoff are counted as belonging to the top group.
    """
    cutoff = values.quantile(1 - top_quantile)
    in_top_group = values.ge(cutoff)
    return in_top_group.astype(int)
    
# Seed NumPy's global RNG so that the simulated cohort is identical on every run.
# NOTE: the random draws below must stay in this order to reproduce the same data.
np.random.seed(100)

data = pd.DataFrame(index = np.arange(N))

# Root variables: sex and age are drawn uniformly at random.
data['sex'] = np.random.randint(0, 2, N) # 0 - female, 1 - male
data['age'] = np.random.randint(21, 100, N) # integer ages 21..99

# Lifestyle variables: driven by sex, age and Gaussian noise only.
# Both intakes are rescaled to 1..5 and rounded to integer levels.
data['sugar_intake'] = translate_to_range(
    -data['age'] - 25 * data['sex'] + 300 * np.random.randn(N),
    1, 5,
).round().astype(int)
data['alcohol_intake'] = translate_to_range(
    -data['age'] + 50 * data['sex'] + 100 * np.random.randn(N),
    1, 5,
).round().astype(int)
# Smokers are the 25% of individuals with the highest latent score.
data['smoking'] = top_quantile_to_one(
    data['age'] + 5 * data['sex'] + 200 * np.random.randn(N),
    0.25,
)

# BMI additionally depends on sugar intake; rescaled to 15..45 with one decimal.
data['BMI'] = translate_to_range(
    data['age'] - 3 * data['sex'] + 50 * data['sugar_intake'] + 200 * np.random.randn(N),
    15, 45,
).round(1)

# Lung cancer additionally depends on smoking; the top 5% of scores are cases.
data['lung_cancer'] = top_quantile_to_one(
    data['age'] + 2 * data['sex'] + 200 * data['smoking'] + 200 * np.random.randn(N),
    0.05,
)

# Death depends on BMI, alcohol intake and lung cancer; the top 20% of scores die.
data['death'] = top_quantile_to_one(
    data['age'] + 5 * data['sex'] + 0.5 * data['BMI'] + 1.5 * data['alcohol_intake']
    + 30 * data['lung_cancer'] + 10 * np.random.randn(N),
    0.2,
)

display(data)

# Multivariate linear regression

In [None]:
# Split the BMI values by alcohol-intake level: iterating the groupby yields
# (level, BMI values) pairs, which zip(*) transposes into two parallel tuples.
alcohol_intake_groups, BMI_values_in_groups = zip(*data.groupby('alcohol_intake')['BMI'])

# One box per alcohol-intake level.
fig, ax = plt.subplots()
ax.boxplot(BMI_values_in_groups)
ax.set_xlabel('Alcohol intake')
ax.set_xticklabels(alcohol_intake_groups)
ax.set_ylabel('BMI')
ax.set_title('BMI distribution as a function of alcohol intake')
plt.show()

# Unpack the (correlation, p-value) pair explicitly instead of feeding the result
# object straight into the % operator. The Greek rho is written as an escape so
# that it cannot be garbled by a wrong file encoding again.
rho, spearman_pval = spearmanr(data['alcohol_intake'], data['BMI'])
print('Spearman\'s rank correlation between alcohol intake and BMI: \u03c1 = %.2f, p-value = %.2g.' % \
        (rho, spearman_pval))

There is a weak (but highly significant) association between alcohol intake and BMI. If we are not careful, we might jump to the conclusion that drinking alcohol affects weight. But from the way the data was simulated, we know there is no causal relationship between the two variables. This association is the result of confounders (sex and age), which affect both of them.

In [None]:
# Regress BMI on alcohol intake, with sex and age included as covariates.
# add_constant() supplies the intercept column, which OLS does not add by itself.
X = sm.add_constant(data.loc[:, ['sex', 'age', 'alcohol_intake']])
y = data.loc[:, 'BMI']
model = sm.OLS(endog = y, exog = X)
model_results = model.fit()

# P-value of the alcohol-intake coefficient in the adjusted model.
regression_pval = model_results.pvalues.loc['alcohol_intake']
print(
    'The effect of alcohol intake on BMI (after adjusting for sex and age as covariates): p-value = %.2g.'
    % (regression_pval,)
)

In [None]:
# P-values of every term in the most recently fitted model (assuming the cells are run
# in order: the OLS regression of BMI on sex, age and alcohol intake). Left as the bare
# last expression so the notebook displays it.
model_results.pvalues

In [None]:
# Full statsmodels report for the most recently fitted model, printed as plain text.
print(model_results.summary())

In [None]:
# Regress BMI on sugar intake, with sex and age included as covariates.
model_results = sm.OLS(
    endog = data['BMI'],
    exog = sm.add_constant(data[['sex', 'age', 'sugar_intake']]),
).fit()
regression_pval = model_results.pvalues['sugar_intake']
print(
    'The effect of sugar intake on BMI (after adjusting for sex and age as covariates): p-value = %.2g.'
    % (regression_pval,)
)

# Box plot of BMI per sugar-intake level: iterating the groupby yields
# (level, BMI values) pairs, which zip(*) transposes into two parallel tuples.
sugar_intake_groups, BMI_values_in_groups = zip(*data.groupby('sugar_intake')['BMI'])
fig, ax = plt.subplots()
ax.boxplot(BMI_values_in_groups)
ax.set_xticklabels(sugar_intake_groups)
ax.set(
    xlabel = 'Sugar intake',
    ylabel = 'BMI',
    title = 'BMI distribution as a function of sugar intake',
)
plt.show()

Sugar intake is very significantly associated with BMI, even after controlling for sex and age. This is not surprising, as we modeled sugar intake to have a very strong causal effect on BMI.

In [None]:
# We can also include smoking and alcohol intake as additional covariates in this case,
# although it's not really necessary (according to our causal model, sex and age are the
# only confounders we need to account for).
# Of course, we cannot always be certain which variables are the relevant confounders.
# In many cases it is safe to include more covariates, but in some cases it can be
# dangerous (we will see an example later).
(
    sm.OLS(data['BMI'], sm.add_constant(data[['sex', 'age', 'alcohol_intake', 'smoking', 'sugar_intake']]))
    .fit()
    .pvalues['sugar_intake']
)

# Logistic regression

In [None]:
# Contingency table of sample counts: one row per lung-cancer status, one column per
# alcohol-intake level. Combinations that never occur are filled with 0.
n_samples_per_alcohol_intake_and_lung_cancer_group = (
    data.groupby('lung_cancer')['alcohol_intake']
    .value_counts()
    .sort_index()
    .unstack()
    .fillna(0)
)

# Normalise each row to percentages, i.e. the distribution of alcohol intake
# given lung-cancer status.
alcohol_intake_dist_given_lung_cancer = 100 * n_samples_per_alcohol_intake_and_lung_cancer_group.divide(
    n_samples_per_alcohol_intake_and_lung_cancer_group.sum(axis = 1),
    axis = 0,
)

# Format cell by cell through Series.map on each column. DataFrame.applymap is
# deprecated in recent pandas releases, whereas this form works on old and new versions.
display(alcohol_intake_dist_given_lung_cancer.apply(
    lambda column: column.map(lambda pctg: '%.2f%%' % pctg)
))

alcohol_intake_given_lung_cancer = data.loc[data['lung_cancer'] == 1, 'alcohol_intake']
alcohol_intake_given_no_lung_cancer = data.loc[data['lung_cancer'] == 0, 'alcohol_intake']
# Request the two-sided test explicitly: the default `alternative` differs between
# SciPy versions, which would silently change the reported p-value.
_, utest_pval = mannwhitneyu(
    alcohol_intake_given_lung_cancer,
    alcohol_intake_given_no_lung_cancer,
    alternative = 'two-sided',
)
print('U-test p-value = %.2g' % utest_pval)

Alcohol intake is associated with lung cancer, even though we know it is a spurious correlation.

In [None]:
# Logistic regression of lung cancer on alcohol intake, with sex and age as covariates.
# add_constant() supplies the intercept column.
X = sm.add_constant(data.loc[:, ['sex', 'age', 'alcohol_intake']])
y = data.loc[:, 'lung_cancer']
model = sm.Logit(endog = y, exog = X)
model_results = model.fit()

# P-value of the alcohol-intake coefficient in the adjusted model.
regression_pval = model_results.pvalues.loc['alcohol_intake']
# Row of asterisks separating our message from any output printed during fitting.
print('*' * 50)
print(
    'The effect of alcohol intake on lung cancer (after adjusting for sex and age as covariates): p-value = %.2g.'
    % (regression_pval,)
)

In [None]:
# P-values of every term in the most recently fitted model (assuming the cells are run
# in order: the logistic regression of lung cancer on sex, age and alcohol intake).
# Left as the bare last expression so the notebook displays it.
model_results.pvalues

In [None]:
# Full statsmodels report for the most recently fitted model, printed as plain text.
print(model_results.summary())

In [None]:
# Logistic regression of lung cancer on smoking, with sex and age as covariates.
model_results = sm.Logit(data['lung_cancer'], sm.add_constant(data[['sex', 'age', 'smoking']])).fit()
regression_pval = model_results.pvalues['smoking']
# Row of asterisks separating our message from any output printed during fitting.
print(50 * '*')
# Smoking is a binary flag, not an "intake" level, so the message just says "smoking".
print('The effect of smoking on lung cancer (after adjusting for sex and age as covariates): p-value = %.2g.' % \
        regression_pval)

In [None]:
# Here too we may include more covariates than really necessary: alcohol intake and
# sugar intake are added on top of sex and age, and we read off the p-value of smoking.
(
    sm.Logit(data['lung_cancer'], sm.add_constant(data[['sex', 'age', 'smoking', 'alcohol_intake', 'sugar_intake']]))
    .fit()
    .pvalues['smoking']
)

# Colliders

In [None]:
# P-value of lung cancer in a regression of BMI on sex, age and lung cancer.
(
    sm.OLS(
        endog = data['BMI'],
        exog = sm.add_constant(data[['sex', 'age', 'lung_cancer']]),
    )
    .fit()
    .pvalues['lung_cancer']
)

BMI is not associated with lung cancer (conditional on sex and age), as expected.

<img src = "https://drive.google.com/uc?id=1eJhUFTLO5DaWvktB-yEP7tr47p6N8RHg" alt = "Causal diagram" width = 500px/>

But what happens if we also condition on death?

In [None]:
# Same regression of BMI on sex, age and lung cancer, but with death added as a covariate.
(
    sm.OLS(
        endog = data['BMI'],
        exog = sm.add_constant(data[['sex', 'age', 'lung_cancer', 'death']]),
    )
    .fit()
    .pvalues['lung_cancer']
)

Death is what is known as a _collider_ here: since both BMI and lung cancer affect it, __we must NOT control for it!__

Intuitively, if a dead person didn't have lung cancer, his death might be explained by his BMI; but if he did have lung cancer, that may already explain away his death. Therefore, we expect to find a negative correlation between BMI and lung cancer when conditioning on death (as opposed to no correlation at all when we don't condition on a collider).

A commonly used toy example: __Hollywood actors__

<img src = "https://drive.google.com/uc?id=1XelfNNoR6ZIO-7tzXGVGWhqERFRelWXl" alt = "Causal diagram" width = 300px/>

__"Controlling for everything" is deeply misguided!__ (Recommended reading: "The Book of Why" by Judea Pearl)