In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

import ylearn
from ylearn.causal_discovery import CausalDiscovery

In [2]:
# Seed every RNG library this notebook imports so that Restart & Run All is reproducible.
# scikit-learn estimators created without `random_state` draw from NumPy's global RNG,
# so the NumPy seed is what pins the random forests below.
SEED = 2022
torch.manual_seed(SEED)
np.random.seed(SEED)

In [3]:
# Load the raw training and test splits (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/data/train.csv', './dataset/data/test.csv')
)

In [4]:
# replace nan
def build_data(train):
    """Impute missing values and integer-encode low-cardinality numeric columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order, where

        * missing values in numeric columns are replaced by the column mean,
        * missing values in non-numeric columns are replaced by the most
          frequent value (a mean is undefined for text),
        * numeric columns with at most 20 distinct values are cast to ``int``
          so that later steps can treat them as discrete.
    """
    cleaned = {}
    for col in train.columns:
        values = train[col]
        is_numeric = pd.api.types.is_numeric_dtype(values)

        if values.isna().any():
            if is_numeric:
                values = values.fillna(values.mean())
            else:
                # Text/categorical column: fall back to the mode; an all-missing
                # column has no mode and is left untouched.
                modes = values.mode(dropna=True)
                if len(modes) > 0:
                    values = values.fillna(modes.iloc[0])

        # The cast truncates the imputed mean of a discrete column to an integer.
        # Skip it when NaN is still present (all-missing column): NaN cannot be cast to int.
        if is_numeric and values.nunique() <= 20 and values.notna().all():
            values = values.astype(int)
        cleaned[col] = values

    return pd.DataFrame(cleaned)

# Clean both splits with the same routine; each split is imputed from its own statistics.
train, test = build_data(train), build_data(test)

In [5]:
# Every column name of the training frame, in file order.
all_cov = train.columns.tolist()
# save data and their corresponding transformers
class TransData:
    """Hold one column's transformed values together with the transformer fitted on it.

    Calling the instance with a DataFrame fits a transformer chosen by the
    column dtype:

    * object / string columns -> ``OrdinalEncoder`` (codes cast to ``int``),
      and ``is_obj`` is set to ``True``;
    * integer columns of any width -> left unchanged, ``transformer`` is ``None``;
    * everything else -> ``StandardScaler``.

    Attributes set by the call: ``df`` (the original Series), ``data``
    (an ``(n, 1)`` array of transformed values) and ``transformer``.
    """

    def __init__(self, name, is_obj=False):
        self.is_obj = is_obj
        self.name = name
        self.transformer = None
        # Populated by __call__; defined here so the attributes always exist.
        self.df = None
        self.data = None

    def __call__(self, data):
        """Fit on ``data[self.name]`` and store the transformed column in ``self.data``."""
        self.df = data[self.name]
        series = self.df.to_numpy().reshape(-1, 1)
        dtype = self.df.dtype
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            self.is_obj = True
            self.transformer = OrdinalEncoder()
            self.data = self.transformer.fit_transform(series).astype(int)
        elif pd.api.types.is_integer_dtype(dtype):
            # `dtype != int` only matches the platform default width, so e.g. int32
            # columns were standardised by mistake; test integer-ness explicitly.
            self.transformer = None
            self.data = series
        else:
            self.transformer = StandardScaler()
            self.data = self.transformer.fit_transform(series)

In [6]:
# data preprocessing
# Fit one transformer per training column and reuse that fitted transformer on the
# matching test column, so both splits live on the same scale / encoding.
data_dict = {}
cat_name = []   # names of the columns that were ordinal-encoded
test_dict = {}
non_covariates = ('treatment', 'outcome')   # not transformed for the test split

for name in all_cov:
    t = TransData(name=name)
    t(train)
    data_dict[name] = t.data.reshape(-1)
    if t.is_obj:
        cat_name.append(name)
    if name in non_covariates:
        continue

    if t.transformer is None:
        # Column was passed through untouched on train, so do the same on test.
        test_i = test[name]
    else:
        # No blanket `except`: a failing transform (e.g. a category never seen in
        # training) must surface instead of silently leaking raw values into test_data.
        test_i = t.transformer.transform(test[name].to_numpy().reshape(-1, 1)).reshape(-1)
        if t.is_obj:
            # Train codes are cast to int inside TransData; keep test consistent.
            test_i = test_i.astype(int)
    test_dict[name] = test_i

train_transformed = pd.DataFrame(data_dict)
test_data = pd.DataFrame(test_dict)

In [7]:
# Sanity check: preview the first rows of the transformed training frame.
train_transformed.head()

Unnamed: 0,V_0,V_1,V_2,V_3,V_4,V_5,V_6,V_7,V_8,V_9,...,V_32,V_33,V_34,V_35,V_36,V_37,V_38,V_39,treatment,outcome
0,1.723577,-0.305753,-0.713223,-1.621706,-0.110603,0,1.967215,-1.605903,0,3,...,0.983957,1.170614,-0.043524,1.491432,53,-2,0,3,2,0.965484
1,-0.620006,1.144513,-0.713223,-0.836881,-0.329293,0,-0.32116,0.287543,0,999,...,0.935753,0.229336,0.849727,0.005753,50,0,2,4,0,1.110879
2,-0.844489,0.105237,1.23968,-1.558425,-0.300993,1,-0.277983,0.717924,0,999,...,-2.043339,-0.713962,-0.861334,0.631476,37,1,1,2,2,-2.25886
3,0.218723,-0.367827,-0.713223,-1.575069,-0.870663,1,0.952558,0.775616,0,999,...,-0.358267,0.035055,0.84504,0.112702,35,1,0,3,0,-0.267371
4,0.18364,0.928402,-0.713223,-0.134138,0.654154,1,-0.472279,0.77677,0,999,...,-0.07876,-0.046988,-0.110786,0.682046,58,1,0,2,2,-0.166405


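Find relations between the variables: fit one model that predicts the treatment from the covariates and one that predicts the outcome from the covariates plus the treatment, then use their feature importances to pick the likely confounders.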

In [8]:
# Split the transformed training frame into outcome vector, treatment vector
# and the matrix of all remaining covariates.
y = train_transformed['outcome'].to_numpy()
x = train_transformed['treatment'].to_numpy()
V = train_transformed.drop(columns=['treatment', 'outcome']).to_numpy()

In [9]:
# Forests used only to rank covariates by feature importance:
# x_model predicts the treatment, y_model (fitted in the next cell) predicts the outcome.
x_model = RandomForestClassifier(
    n_estimators=150,
    criterion='entropy',
    max_features=0.5,
    max_depth=50,
)
y_model = RandomForestRegressor(
    n_estimators=150,
    max_features=0.5,
    max_depth=100,
)
x_importance = x_model.fit(V, x).feature_importances_

In [10]:
# The outcome model sees every covariate plus the treatment as the last input column.
y_model_input = np.column_stack((V, x))
y_importance = y_model.fit(y_model_input, y).feature_importances_

In [11]:
# A covariate counts as a confounder when it is informative for BOTH the treatment
# model and the outcome model (importance thresholds below).
# Pair each importance with an explicit covariate name instead of indexing `all_cov`
# by position: `all_cov` also holds 'treatment' and 'outcome', so positional lookup is
# only correct when those two happen to be the last columns.
x_importance_min = 1e-3
y_importance_min = 1e-5
covariate_names = [c for c in all_cov if c not in ('treatment', 'outcome')]
assert len(covariate_names) == len(x_importance), "covariate names and importances are misaligned"

# y_importance has one extra trailing entry (the treatment column appended to the
# outcome model's input); zip stops before it.
confounder_list = [
    cov_name
    for cov_name, x_, y_ in zip(covariate_names, x_importance, y_importance)
    if x_ >= x_importance_min and y_ >= y_importance_min
]

In [12]:
# Training frame restricted to the selected confounders plus treatment and outcome.
V_new = train_transformed[[*confounder_list, 'treatment', 'outcome']]

In [13]:
from ylearn.estimator_model import TLearner, XLearner

# One T-learner per non-control treatment level: tl1 contrasts treatment 1 with 0,
# tl2 contrasts treatment 2 with 0. Everything except `treat` is shared.
tl_fit_kwargs = dict(
    data=V_new,
    treatment='treatment',
    outcome='outcome',
    control=0,
    covariate=confounder_list,
)
tl1 = TLearner(model=RandomForestRegressor(n_estimators=150, max_features=0.6))
tl2 = TLearner(model=RandomForestRegressor(n_estimators=150, max_features=0.6))
tl1.fit(treat=1, **tl_fit_kwargs)
tl2.fit(treat=2, **tl_fit_kwargs)

TLearner(model=RandomForestRegressor(max_features=0.6, n_estimators=150), kwargs=None)

In [14]:
def get_ce(data, x1_model, x2_model):
    """Estimate effects with two fitted models and return them side by side.

    Parameters
    ----------
    data : object
        Passed unchanged to each model's ``estimate`` method.
    x1_model, x2_model : objects exposing ``estimate(data)``
        Their estimates fill column 0 and column 1 of the result, respectively.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, 2)``; each model's estimate is flattened into one column.
    """
    flattened = [model.estimate(data).reshape(-1) for model in (x1_model, x2_model)]
    return np.stack(flattened, axis=1)
# T-learner effect estimates on the training rows (column 0: tl1, column 1: tl2).
ce = get_ce(data=V_new, x1_model=tl1, x2_model=tl2)

`ce`是训练集（train.csv）上的因果效应，另外需要估计测试集（test.csv）上的因果效应`ce_test`。最后需要把`ce_test`拼接在`ce`之后，存储到一个csv文件中上传到平台取得得分。得分为一个数值，越小说明结果越接近真实值，得分最小值为0。

In [15]:
# Effect estimates on the test rows, stacked below the training estimates.
# The training part is recomputed here instead of reusing `ce`: the original
# `ce = concatenate((ce, ce_test))` appended the test rows again on every re-run
# of this cell.
ce_test = get_ce(test_data, x1_model=tl1, x2_model=tl2)
ce_train = get_ce(V_new, x1_model=tl1, x2_model=tl2)
ce = np.concatenate((ce_train, ce_test), axis=0)

In [16]:
from ylearn.estimator_model import DML4CATE

# Double machine learning estimator with random-forest models for treatment and outcome.
dml = DML4CATE(
    cf_fold=1,
    x_model=RandomForestClassifier(
        n_estimators=250,
        criterion="entropy",
        max_depth=150,
        min_samples_leaf=2,
        min_samples_split=3,
        max_features=3,
    ),
    y_model=RandomForestRegressor(
        n_estimators=250,
        max_depth=150,
        min_samples_leaf=2,
        min_samples_split=2,
        max_features=3,
    ),
    is_discrete_treatment=True,
)
dml.fit(data=V_new, outcome='outcome', treatment='treatment', covariate=confounder_list)

ce_dml = dml.effect_nji(data=V_new, control=0)
ce_dml_test = dml.effect_nji(data=test_data, control=0)

# Drop index 0 of the last axis and flatten to two columns per row.
# NOTE(review): presumably the last axis indexes treatment levels and index 0 is the
# control-vs-control entry; confirm against the ylearn documentation.
ce_dml_train = ce_dml[:, :, 1:].reshape(-1, 2)
ce_dml_all = np.vstack([ce_dml_train, ce_dml_test[:, :, 1:].reshape(-1, 2)])

In [17]:
from ylearn.estimator_model import CausalTree

# One causal tree per non-control treatment level (1 vs 0 and 2 vs 0);
# everything except `treat` is shared between the two fits.
ct_fit_kwargs = dict(
    data=V_new,
    outcome='outcome',
    treatment='treatment',
    covariate=confounder_list,
    control=[0],
)
ct1 = CausalTree(max_depth=100, min_samples_split=2, max_features=20)
ct2 = CausalTree(max_depth=100, max_features=20)
ct1.fit(treat=[1], **ct_fit_kwargs)
ct2.fit(treat=[2], **ct_fit_kwargs)

CausalTree(max_depth=100, max_features=20)

In [18]:
# Causal-tree effect estimates: training rows first, test rows stacked below.
ce_ct = get_ce(data=V_new, x1_model=ct1, x2_model=ct2)
ce_ct_test = get_ce(data=test_data, x1_model=ct1, x2_model=ct2)
ce_ct_all = np.vstack((ce_ct, ce_ct_test))