In [36]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from causalml.inference.meta import XGBTRegressor
from evidently.metrics import DataDriftTable, DatasetDriftMetric
from evidently.report import Report
from scipy.stats import ks_2samp
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [37]:
# Load the final preprocessed data CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path — presumably it has
# to be adjusted before the notebook can run anywhere else; confirm.
df = pd.read_csv(
    filepath_or_buffer='/Users/qianlou/Documents/GitHub/Customer-Personality-Analysis-2.0/Data/Preprocessed Data/Final Preprocessed Data.csv'
)


In [38]:
# Columns that are each evaluated as a treatment variable by the pipeline.
treatments = [
    'Income_Category_High',
    'Is_Parent',
    'Cmp_Attitude',
    'Complain',
]

# Candidate covariate columns; the pipeline removes the current treatment and
# outcome from this list before fitting.
covariates_list = [
    'Income_Category_High',
    'Income_Category_Low',
    'Income_Category_Medium',
    'Complain',
    'Is_Parent',
    'Cmp_Attitude',
    'Family_Size',
    'Age',
    'Member_Year',
    'Total_amount',
    'Total_purchase',
    'NumWebVisitsMonth',
    'NumDealsPurchases',
    'Recency',
]

In [39]:
# Causal inference pipeline function with drift detection
def causal_inference_pipeline_with_drift(df, X_cols, treatments):
    """Fit candidate models per (outcome, treatment) pair and save a drift report.

    For each of the outcomes 'Recency', 'Total_purchase' and 'Total_amount' and
    each treatment column, the covariates are the entries of ``X_cols`` minus
    the treatment, the outcome and, for the two ``Total_*`` outcomes, the other
    ``Total_*`` column. An ``XGBRegressor`` and an ``XGBTRegressor`` are fitted
    on an 80/20 train/test split (``random_state=42``); the model with the
    lowest test RMSE is kept, together with the ATE that ``cross_validate_ate``
    returned for it.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding every column named in ``X_cols`` and ``treatments`` plus
        the three outcome columns.
    X_cols : list of str
        Candidate covariate column names.
    treatments : list of str
        Column names to evaluate as treatments, one at a time.

    Returns
    -------
    pandas.DataFrame
        One row per (outcome, treatment) pair with the columns 'Outcome',
        'Treatment', 'Model Type', 'ATE' and 'RMSE'. When neither model could
        be fitted and scored, 'Model Type' and 'ATE' are None and 'RMSE' is
        ``inf``.

    Side effects
    ------------
    Writes ``overall_drift_report.html`` to the current working directory and
    prints a confirmation line; per-model failures are printed, not raised.

    Notes
    -----
    ``cross_validate_ate`` and ``mean_squared_error`` are not defined in this
    function; they must already exist in the notebook namespace.
    """
    outcomes = ['Recency', 'Total_purchase', 'Total_amount']
    # The two Total_* columns are removed from each other's covariate set.
    paired_exclusion = {
        'Total_purchase': 'Total_amount',
        'Total_amount': 'Total_purchase',
    }
    test_size, seed = 0.2, 42
    results = []

    # Drift report to analyze the overall drift in the dataset
    drift_report = Report(metrics=[
        DatasetDriftMetric(),  # Overall dataset drift analysis
        DataDriftTable()  # Feature-wise drift analysis table
    ])

    for outcome in outcomes:
        for treatment in treatments:
            # Filtering instead of list.remove() so a missing paired column in
            # X_cols does not raise ValueError.
            excluded = {treatment, outcome, paired_exclusion.get(outcome)}
            covariates = [col for col in X_cols if col not in excluded]
            X = df[covariates]
            y = df[outcome]
            t = df[treatment]

            # Split data into training and testing sets
            X_train, X_test, y_train, y_test, t_train, t_test = train_test_split(
                X, y, t, test_size=test_size, random_state=seed)

            # Track model performance and ATE metrics
            lowest_rmse = float('inf')
            best_model = None
            best_ate = None

            # Iterate over model types
            for model_type in [XGBRegressor, XGBTRegressor]:
                try:
                    model = model_type()
                    if model_type is XGBRegressor:
                        # Plain outcome regression: fit(X, y).
                        model.fit(X_train, y_train)
                    else:
                        # Meta-learner signature: fit(X, treatment, y).
                        model.fit(X_train, t_train, y_train)

                    # Estimate ATE.
                    # NOTE(review): cross_validate_ate is defined outside this
                    # cell and receives the full (unsplit) data; the recorded
                    # run emits "Pass `sample_weight` as keyword args", which
                    # may indicate a three-positional-argument fit reaching
                    # XGBRegressor inside that helper — confirm.
                    ate_mean, ate_std = cross_validate_ate(model, X, t, y)

                    # Predictions for RMSE calculation.
                    # NOTE(review): causalml meta-learners document predict()
                    # as returning treatment-effect estimates rather than
                    # outcome predictions, so this RMSE is presumably not
                    # comparable between the two model types — confirm.
                    y_pred = model.predict(X_test)
                    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

                    if rmse < lowest_rmse:
                        lowest_rmse = rmse
                        best_model = model
                        best_ate = ate_mean

                except Exception as e:
                    # Best-effort: report the failure and continue with the
                    # remaining model types / pairs.
                    print(f"Error training model for treatment {treatment} and outcome {outcome}:", e)

            results.append({
                'Outcome': outcome,
                'Treatment': treatment,
                # Explicit None check; do not rely on estimator truthiness.
                'Model Type': type(best_model).__name__ if best_model is not None else None,
                'ATE': best_ate,
                'RMSE': lowest_rmse
            })

    # Generate drift report comparing the overall dataset's training and testing splits.
    # A dedicated split over all X_cols is used instead of the X_train/X_test
    # left over from the last loop iteration, which only held that pair's
    # covariate subset and did not exist at all when `treatments` was empty.
    # The split parameters match the per-pair splits above.
    drift_reference, drift_current = train_test_split(
        df[list(X_cols)], test_size=test_size, random_state=seed)
    drift_report.run(reference_data=drift_reference, current_data=drift_current)
    drift_report.save_html("overall_drift_report.html")
    print("Overall drift detection dashboard generated.")

    results_df = pd.DataFrame(results)
    return results_df



In [40]:
# Execute the pipeline for every outcome/treatment pair, then display the
# resulting summary table as the cell output.
results_df = causal_inference_pipeline_with_drift(
    df=df,
    X_cols=covariates_list,
    treatments=treatments,
)
results_df


Pass `sample_weight` as keyword args.


Pass `sample_weight` as keyword args.


Pass `sample_weight` as keyword args.


Pass `sample_weight` as keyword args.


Pass `sample_weight` as keyword args.


Pass `sample_weight` as keyword args.


Pass `sample_weight` as keyword args.


Pass `sample_weight` as keyword args.


Pass `sample_weight` as keyword args.


Pass `sample_weight` as keyword args.


Pass `sample_weight` as keyword args.


Pass `sample_weight` as keyword args.



Overall drift detection dashboard generated.


Unnamed: 0,Outcome,Treatment,Model Type,ATE,RMSE
0,Recency,Income_Category_High,XGBRegressor,0.984333,54.66827
1,Recency,Is_Parent,XGBRegressor,0.714969,54.865256
2,Recency,Cmp_Attitude,XGBRegressor,0.217046,55.342698
3,Recency,Complain,XGBRegressor,0.012355,55.493829
4,Total_purchase,Income_Category_High,XGBRegressor,0.984334,13.986251
5,Total_purchase,Is_Parent,XGBRegressor,0.714974,14.285645
6,Total_purchase,Cmp_Attitude,XGBTRegressor,3.572356,14.27061
7,Total_purchase,Complain,XGBRegressor,0.010959,14.830237
8,Total_amount,Income_Category_High,XGBTRegressor,346.352068,248.611976
9,Total_amount,Is_Parent,XGBRegressor,0.715721,857.820259
