In [None]:
# Optional environment setup: uncomment and run once on a fresh machine,
# then restart the kernel.
# NOTE: the second line pins dowhy to 0.8, the version reported by
# `dowhy.__version__` further down; the unpinned install on the first line
# is redundant when the pinned one is used.
# !pip install dowhy
# !pip install dowhy==0.8
# !pip install dask[dataframe]
# System packages needed to build pygraphviz.
# !sudo apt-get install graphviz graphviz-dev
# !sudo apt-get install build-essential
# !pip install pygraphviz

In [None]:
from copy import deepcopy

import numpy as np
import pandas as pd
from scipy import stats

from sklearn.metrics import mean_absolute_percentage_error

import dowhy
from dowhy import CausalModel

from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from lightgbm import LGBMRegressor, LGBMClassifier

import networkx as nx

from tqdm import tqdm

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import graphviz

In [None]:
# Show the installed dowhy version so readers can reproduce the environment.
dowhy.__version__

'0.8'

In [None]:
# Load the earnings dataset from the local data folder and preview the
# first five rows (columns: age, took_a_course, earnings).
earnings_data = pd.read_csv('./data/ml_earnings.csv')
earnings_data.head(5)

Unnamed: 0,age,took_a_course,earnings
0,19,False,110579.0
1,28,False,142577.0
2,22,True,130520.0
3,25,True,142687.0
4,24,False,127832.0


In [None]:
# (number of rows, number of columns)
earnings_data.shape

(200, 3)

# Biased (naive) estimate of the effect of taking a course on earnings

In [None]:
# Mean earnings for every (age, took_a_course) combination present in the data.
earnings_data.groupby(['age', 'took_a_course']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,earnings
age,took_a_course,Unnamed: 2_level_1
19,False,111110.875
19,True,123488.0
20,False,115227.142857
20,True,125633.285714
21,False,117114.9
21,True,130155.0
22,False,120226.363636
22,True,131437.222222
23,False,124460.9
23,True,134654.333333


You can see that for some age groups (for instance, 36 or 38), there are
observations for only one of the two treatment values. This means that we
won’t be able to compute the exact effects for these groups. We’ll leave it
to our matching estimator to handle this.



Let’s compute the naïve estimate of the causal effect of training
on earnings as the difference between the treatment and control group means.

In [None]:
# Average earnings within each treatment group (course taken vs. not taken).
took_course_mean = earnings_data.groupby('took_a_course')['earnings'].mean()
took_course_mean

Unnamed: 0_level_0,earnings
took_a_course,Unnamed: 1_level_1
False,140680.508065
True,147376.078947


In [None]:
# Naive effect estimate: mean earnings of those who took the course minus
# mean earnings of those who did not, with no adjustment for age.
# The group means are indexed by the boolean treatment flag, so a plain
# scalar label is the right key (a trailing comma would build a 1-tuple).
treatment_avg = took_course_mean.loc[True]
control_avg = took_course_mean.loc[False]
naive_att = treatment_avg - control_avg
naive_att

6695.570882852306

The naive estimate suggests that taking a course increases earnings by about 6,696 USD. This figure ignores the fact that age may influence both course-taking and earnings.

# Unbiased estimate of the effect of taking a course on earnings using approximate matching

### Step #1: representing the problem as a graph

In [None]:
# Causal graph for the problem, as a node list plus directed
# (cause, effect) pairs: the course affects earnings, and age affects both
# the decision to take the course and earnings.
nodes = ['took_a_course', 'earnings', 'age']
edges = [('took_a_course', 'earnings'), ('age', 'took_a_course'), ('age', 'earnings')]

In [None]:
# Serialise the graph to GML text: a header line, one line per node,
# one line per directed edge, then the closing bracket.
node_lines = [f'\tnode [id "{name}" label "{name}"]\n' for name in nodes]
edge_lines = [f'\tedge [source "{pair[0]}" target "{pair[1]}"]\n' for pair in edges]
gml_string = ''.join(['graph [directed 1\n', *node_lines, *edge_lines, ']'])

In [None]:
# Instantiate the CausalModel: bind the data to the causal graph and declare
# which variable is the treatment and which is the outcome.
model = CausalModel(
    data=earnings_data,
    # variable whose effect we want to measure
    treatment='took_a_course',
    # variable the treatment is expected to change
    outcome='earnings',
    # causal graph as the GML string assembled in the previous cell
    graph=gml_string
)

In [None]:
# Visualise the causal graph the model was built from, as a sanity check
# on the nodes and edges declared above.
model.view_model()

### Step #2: getting the estimand

In [None]:
# Get the estimand: ask dowhy which identification strategies the graph
# supports. The printed summary lists the backdoor, instrumental-variable
# (iv) and frontdoor candidates; here only the backdoor one (adjusting for
# age) is available.
estimand = model.identify_effect()

print(estimand)

Estimand type: nonparametric-ate

### Estimand : 1
Estimand name: backdoor
Estimand expression:
       d                         
────────────────(E[earnings|age])
d[took_a_course]                 
Estimand assumption 1, Unconfoundedness: If U→{took_a_course} and U→earnings then P(earnings|took_a_course,age,U) = P(earnings|took_a_course,age)

### Estimand : 2
Estimand name: iv
No such variable(s) found!

### Estimand : 3
Estimand name: frontdoor
No such variable(s) found!



### Step #3: computing the effect

In [None]:
# Get estimate (Matching): estimate the average treatment effect via the
# backdoor estimand, pairing treated and control units by distance.
estimate = model.estimate_effect(
    identified_estimand=estimand,
    method_name='backdoor.distance_matching',
    target_units='ate',
    # Minkowski distance with p=2 is the Euclidean distance.
    method_params={'distance_metric': 'minkowski', 'p': 2})

distance_matching


Let's break down the components:
- `estimate = model.estimate_effect(...)`: This calls the `estimate_effect` method of the `CausalModel` object, which is responsible for computing the causal effect.
- `identified_estimand=estimand`: This specifies the estimand that was previously identified using the `identify_effect` method. The estimand defines the target causal quantity we want to estimate (e.g., average treatment effect).
- `method_name='backdoor.distance_matching'`: This specifies the estimation method to be used. In this case, it's a backdoor adjustment using distance matching. This means we're trying to estimate the causal effect by comparing treated and control units that are similar in terms of their observed covariates (confounders).
- `target_units='ate'`: This indicates that we want to estimate the average treatment effect (ATE), which is the overall causal effect of the treatment on the outcome across all units in the population.

- `method_params={'distance_metric': 'minkowski', 'p': 2}`: This provides specific parameters for the distance matching method.
  - `distance_metric='minkowski'` specifies the metric used to calculate the distance between units. Minkowski distance is a generalization of Euclidean distance.
  - `p=2` specifies the order of the Minkowski distance, which in this case corresponds to Euclidean distance.


In [None]:
# Point estimate of the effect from distance matching.
estimate.value

10464.5

### Step #4: refute the estimate

In [None]:
# Refutation test: add a randomly generated common cause to the data and
# re-estimate. If the original estimate is sound, the effect should barely
# change, because the added variable is pure noise.
# The test is stochastic, so a fixed seed is passed to make the reported
# "New effect" and p value reproducible across Restart & Run All.
refutation = model.refute_estimate(
    estimand=estimand,
    estimate=estimate,
    method_name='random_common_cause',
    random_seed=42)

In [None]:
# Print the refutation summary: original effect, effect after adding the
# random common cause, and the p value of the comparison.
print(refutation)

Refute: Add a random common cause
Estimated effect:10464.5
New effect:10321.8755
p value:0.5



We see that the new effect is slightly lower than the estimated one. Nonetheless, a high p value
indicates that the change is not statistically significant, so this refutation test gives us no reason to doubt the estimate.

# Inverse probability weighting (IPW)


In [None]:
# Re-estimate the same backdoor estimand with propensity score weighting
# instead of matching.
# NOTE: this rebinds `estimate`, so the distance-matching result from the
# earlier cell is no longer available under that name after this cell runs.
estimate = model.estimate_effect(
    identified_estimand=estimand,
    method_name='backdoor.propensity_score_weighting',
    target_units='ate')

propensity_score_weighting


  y = column_or_1d(y, warn=True)


In [None]:
# Point estimate of the effect from propensity score weighting.
estimate.value

10313.566831120319