In [1]:
from utils import *



# Read the merged-subjects estimation data set (Stata .dta file).
# The directory is written with forward slashes: Python accepts "/" as a path
# separator on Windows as well as on POSIX systems, whereas the previous "\\"
# separators only resolved correctly on Windows.
file_name = "Estimation Data by Subject - Last Two Days Binary - Merged Subjects.dta"
file_dir = "../data/"
file_dir_name = file_dir + file_name
data = pd.read_stata(file_dir_name)


Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
# Restrict the sample to rows whose advertiser rank is either 1 or 3.
data = data[data['advertiser_rank'].isin([1, 3])]

In [9]:

# Unpack the three objects returned by define_xyt for the current data.
X, Y, T = define_xyt(data)


In [4]:

class PropensityModel(BaseEstimator):
    """Propensity-score model: a logistic regression fitted on a subset of the columns of X.

    The column selection used in ``fit`` is remembered and reused by
    ``predict_proba``, so both steps always see the same features even when the
    caller (for example a framework that only calls ``predict_proba(X)``) does
    not pass ``X_indices`` again.
    """

    # Default column selection. For the column listing printed by `X.columns`
    # in this notebook (39 columns, the last one being 'mobile'), this picks
    # the 16 `sub_*` columns and leaves out the trailing 'mobile' column.
    DEFAULT_X_INDICES = slice(-17, -1)

    def __init__(self):
        self.lr = LogisticRegression(max_iter=2000)

    @staticmethod
    def _select_columns(X, X_indices):
        """Return the selected columns of X as an array-like for the regression."""
        if hasattr(X, "iloc"):
            # pandas objects do not support X[:, idx]; select by position and
            # hand a plain array to the underlying estimator.
            return X.iloc[:, X_indices].to_numpy()
        return X[:, X_indices]

    def predict_proba(self, X, X_indices=None):
        """Return the class probabilities predicted by the fitted logistic regression.

        Parameters
        ----------
        X : 2-D array or DataFrame
            Feature matrix with the same column layout as the one used in ``fit``.
        X_indices : slice or sequence of int, optional
            Columns to use. When omitted, the selection stored by ``fit`` is
            used (``DEFAULT_X_INDICES`` if ``fit`` has not stored one).

        Returns
        -------
        The output of ``LogisticRegression.predict_proba`` on the selected columns.
        """
        if X_indices is None:
            X_indices = getattr(self, "X_indices_", self.DEFAULT_X_INDICES)
        return self.lr.predict_proba(self._select_columns(X, X_indices))

    def fit(self, X, y, X_indices=DEFAULT_X_INDICES):
        """Fit the logistic regression on the selected columns of X.

        Parameters
        ----------
        X : 2-D array or DataFrame
            Full feature matrix.
        y : array-like
            Target passed to ``LogisticRegression.fit``.
        X_indices : slice or sequence of int
            Columns of X used to estimate the propensity score.

        Returns
        -------
        self
        """
        # Remember the selection so predict_proba uses exactly the same columns.
        self.X_indices_ = X_indices
        self.lr.fit(self._select_columns(X, X_indices), y)
        return self


# Instantiate propensity_model from the PropensityModel class
propensity_model = PropensityModel()




# Hyperparameter search grid: every key maps to its list of candidate values.
# n_estimators is pinned to a single value (the wider [50, 100, 200] sweep is disabled).
param_grid = dict(
    n_estimators=[200],
    max_depth=[10, 20],
    min_samples_split=[5000, 10000, 15000],
)


In [10]:
# Show the column labels of X (the cell's last expression is displayed as output).
X.columns

Index(['impression_repeat', 'previous_clicks', 'previous_clicks_all_ads',
       'impression_repeat_base_ad', 'previous_clicks_base_ad', 'total_visits',
       'visit_s2', 'visit_s4', 'visit_s5', 'visit_s6', 'visit_s8', 'visit_s9',
       'visit_s11', 'visit_s12', 'visit_s14', 'visit_s16', 'visit_s17',
       'visit_s18', 'visit_s19', 'visit_s23', 'visit_s24', 'visit_s25',
       'sub_2', 'sub_4', 'sub_5', 'sub_6', 'sub_8', 'sub_9', 'sub_11',
       'sub_12', 'sub_14', 'sub_16', 'sub_17', 'sub_18', 'sub_19', 'sub_23',
       'sub_24', 'sub_25', 'mobile'],
      dtype='object')

In [12]:
# Display X; the recorded output is a truncated preview (first/last rows, middle columns elided).
X

Unnamed: 0,impression_repeat,previous_clicks,previous_clicks_all_ads,impression_repeat_base_ad,previous_clicks_base_ad,total_visits,visit_s2,visit_s4,visit_s5,visit_s6,...,sub_12,sub_14,sub_16,sub_17,sub_18,sub_19,sub_23,sub_24,sub_25,mobile
0,2,0,0,0,0,11,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,6,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,23,0,0,0,0,51,1,0,1,1,...,0,0,0,0,0,0,0,0,0,1
3,39,0,0,0,0,101,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27303034,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
27303035,17,0,0,0,0,21,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
27303036,8,0,0,0,0,21,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
27303037,8,0,0,0,0,21,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
# Rebuild X, Y and T from the (filtered) data so this cell does not depend on
# whichever earlier cell last assigned them.
(X, Y, T) = define_xyt(data)
# find best parameters for the m model
best_params = m_model_best_estimator(X, Y)
# define the causal forest model (it is tuned and then fitted below)
cf = CausalForestDML(
    model_y=RandomForestRegressor(**best_params),
    model_t=propensity_model,
    # Pass a real boolean. The previous string 'True' only behaved as intended
    # because any non-empty string is truthy (the string 'False' would have
    # enabled discrete treatment as well).
    discrete_treatment=True,
    criterion='het',
    n_jobs=n_jobs,
    random_state=42,
    verbose=0,
)

# NOTE(review): the output recorded for this cell ends with
# "ValueError: X has 26 features, but LogisticRegression is expecting 16 features"
# and contains neither timing message printed below, so the run apparently
# failed before tuning finished. The cause is not visible in the code shown
# here; re-run from a fresh kernel to confirm that the propensity model and X agree.

# tune the model:
start_time = time.perf_counter()

# Define the hyperparameters to search over
cf_param_grid = {
    # 'n_estimators': [100, 200, 300],
    'n_estimators': [300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [5000, 8000, 10000, 15000],
    'max_samples': [0.05, 0.1, 0.15],
}

# NOTE: per the EconML documentation, tune() sets the selected hyperparameters
# on `cf` and returns that same estimator, so `tune_params` is an alias of `cf`
# rather than a dict of parameters. The name is kept in case it is used elsewhere.
tune_params = cf.tune(
    Y=Y,
    T=T,
    X=X,
    params=cf_param_grid,
)

finish_time = time.perf_counter()

print(f"finished tuning the model in {finish_time - start_time} seconds")

# fit the model using tuned parameters:
start_time = time.perf_counter()

cf.fit(Y=Y, T=T, X=X, inference="blb", cache_values=True)

finish_time = time.perf_counter()
print(f"finished fitting the model in {finish_time - start_time} seconds")

# save the model
# Plain string (the f-prefix had no placeholders) with forward slashes, which
# work on Windows and POSIX alike; "\\" separators only resolved on Windows.
# Note that this reassigns `file_name`, which earlier held the data file name.
file_name = "../results/main_scenario/CF - Rank 1 and 3 - Merged Subjects.pkl"
joblib.dump(cf, file_name)

finished tuning the M model in 10402.496100199998 seconds


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


ValueError: X has 26 features, but LogisticRegression is expecting 16 features as input.

In [7]:
# Display X again (same expression as the earlier display cell; the recorded output is a truncated preview).
X

Unnamed: 0,impression_repeat,previous_clicks,previous_clicks_all_ads,impression_repeat_base_ad,previous_clicks_base_ad,total_visits,visit_s2,visit_s4,visit_s5,visit_s6,...,sub_12,sub_14,sub_16,sub_17,sub_18,sub_19,sub_23,sub_24,sub_25,mobile
0,2,0,0,0,0,11,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,6,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,23,0,0,0,0,51,1,0,1,1,...,0,0,0,0,0,0,0,0,0,1
3,39,0,0,0,0,101,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27303034,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
27303035,17,0,0,0,0,21,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
27303036,8,0,0,0,0,21,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
27303037,8,0,0,0,0,21,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
