In [None]:
## MERGE_EVALUATION METHOD

In [4]:
import pandas as pd
from sklearn.mixture import GaussianMixture
from hmmlearn import hmm
from sklearn.mixture import BayesianGaussianMixture
from sklearn.mixture import DirichletProcess
from sklearn.mixture import CategoricalMixture
from sklearn.mixture import _BaseMixture
import numpy as np

class ATE_clustering:
    """Cluster covariates and summarise treated/untreated samples per cluster.

    The class fits a mixture-style clustering model on the covariates ``x``,
    assigns every sample to a cluster (or to a "null" cluster when the model
    scores it below ``likelihood_threshold``) and produces one-row-per-cluster
    summaries that ``compute_info_gain`` combines into a variance ratio.

    Only ``GaussianMixture``, ``BayesianGaussianMixture`` and ``hmmlearn.hmm``
    are used from the module-level imports.

    Parameters:
    - algorithm (str): One of 'GMM', 'HMM', 'MoE', 'LDA', 'DPMM', 'BGMM',
      'FMM', 'EM', 'CMM'. 'LDA', 'FMM' and 'CMM' are recognised but have no
      backend (see ``fit``).
    - likelihood_threshold (float): Samples whose ``model.score_samples``
      value is strictly below this threshold go to the null cluster. The
      value is compared as-is; for scikit-learn mixtures ``score_samples``
      is documented to return log-likelihoods, so choose the threshold on
      that scale.
    - null_cluster_label (int): Label used for the null cluster.
    - random_state: Forwarded to the underlying model.
    - n_components (int): Number of mixture components / hidden states,
      forwarded to the underlying model. Defaults to 1, which is the default
      of the underlying estimators.
    """

    def __init__(self, algorithm='GMM', likelihood_threshold=0.5, null_cluster_label=-1, random_state=None,
                 n_components=1):
        self.algorithm = algorithm
        self.likelihood_threshold = likelihood_threshold
        self.null_cluster_label = null_cluster_label
        self.model = None
        self.random_state = random_state
        self.n_components = n_components

    def _build_model(self):
        """Instantiate the (unfitted) clustering model for ``self.algorithm``."""
        algorithm = self.algorithm
        shared = {'n_components': self.n_components, 'random_state': self.random_state}

        if algorithm in ('GMM', 'EM'):
            # 'EM' has no dedicated estimator; GaussianMixture is the placeholder.
            return GaussianMixture(**shared)
        if algorithm in ('MoE', 'BGMM'):
            # 'MoE' has no dedicated estimator; BayesianGaussianMixture is the placeholder.
            return BayesianGaussianMixture(**shared)
        if algorithm == 'DPMM':
            # scikit-learn exposes the Dirichlet-process mixture through
            # BayesianGaussianMixture rather than a separate class.
            return BayesianGaussianMixture(weight_concentration_prior_type='dirichlet_process', **shared)
        if algorithm == 'HMM':
            # NOTE(review): hmmlearn's score_samples is believed to return
            # (log_prob, posteriors) rather than one score per sample, so the
            # summary methods may not work with 'HMM' -- confirm before use.
            return hmm.GaussianHMM(**shared)
        if algorithm in ('LDA', 'FMM', 'CMM'):
            raise NotImplementedError(
                f"Algorithm '{algorithm}' needs a categorical mixture model, which scikit-learn "
                "does not provide; supply a custom implementation."
            )
        raise ValueError("Unsupported clustering algorithm. Supported algorithms: 'GMM', 'HMM', 'MoE', 'LDA', 'DPMM', 'BGMM', 'FMM', 'EM', 'CMM'.")

    def fit(self, x, t):
        """
        Fit the clustering model on the covariates.

        Parameters:
        - x (DataFrame or array-like): Covariates, one row per sample.
        - t: Treatment indicator. Accepted for interface symmetry with the
          evaluation methods; clustering uses ``x`` only.

        Returns:
        - The fitted model (also stored on ``self.model``).

        Raises:
        - NotImplementedError: for 'LDA', 'FMM' and 'CMM'.
        - ValueError: for any other unknown algorithm name.
        """
        if not isinstance(x, pd.DataFrame):
            x = pd.DataFrame(x)

        model = self._build_model()
        model.fit(x)
        self.model = model
        return self.model

    def _assign_clusters(self, x):
        """Return one cluster label per row of ``x``, using the null label for low scores."""
        if self.model is None:
            raise RuntimeError("The model is not fitted yet; call fit() first.")

        scores = np.asarray(self.model.score_samples(x), dtype=float)
        labels = np.asarray(self.model.predict(x))
        return np.where(scores < self.likelihood_threshold, self.null_cluster_label, labels)

    def _cluster_summary(self, x, t, y=None):
        """Build the one-row-per-cluster summary shared by the public evaluation methods."""
        if not isinstance(x, pd.DataFrame):
            x = pd.DataFrame(x)
        n_samples = len(x)

        # Treatment is interpreted as binary: any non-zero value counts as treated.
        treated = np.asarray(t).astype(bool).astype(int)
        if treated.ndim != 1 or len(treated) != n_samples:
            raise ValueError("t must be one-dimensional with one entry per row of x.")

        per_sample = pd.DataFrame({
            'Cluster_Index': self._assign_clusters(x),
            'Treated_Size': treated,
            'Untreated_Size': 1 - treated,
        })
        summary = per_sample.groupby('Cluster_Index')[['Treated_Size', 'Untreated_Size']].sum()

        if y is not None:
            outcome = pd.Series(np.asarray(y, dtype=float))
            if len(outcome) != n_samples:
                raise ValueError("y must have one entry per row of x.")
            is_treated = per_sample['Treated_Size'] == 1
            cluster_key = per_sample['Cluster_Index']
            # Masked-out outcomes become NaN and are ignored by var(); an arm
            # with fewer than two samples yields NaN (sample variance, ddof=1).
            summary['Treated_Var_Y'] = outcome.where(is_treated).groupby(cluster_key).var()
            summary['Untreated_Var_Y'] = outcome.where(~is_treated).groupby(cluster_key).var()

        summary['Probability_Cluster'] = (summary['Treated_Size'] + summary['Untreated_Size']) / n_samples
        return summary.reset_index()

    def evaluate_private(self, x, t):
        """
        Summarise a dataset without looking at its outcomes.

        Parameters:
        - x (DataFrame or array-like): Covariates, one row per sample.
        - t (Series or array-like): Binary treatment indicator, one per sample.

        Returns:
        - DataFrame: One row per cluster (sorted by cluster label) with columns
          'Cluster_Index', 'Treated_Size', 'Untreated_Size' and
          'Probability_Cluster' (share of the samples falling in the cluster).
        """
        return self._cluster_summary(x, t)

    def compute_summary_stats_non_private(self, x, t, y):
        """
        Summarise a dataset including the per-cluster outcome variances.

        Parameters:
        - x (DataFrame or array-like): Covariates, one row per sample.
        - t (Series or array-like): Binary treatment indicator, one per sample.
        - y (Series or array-like): Outcome, one per sample.

        Returns:
        - DataFrame: The columns of ``evaluate_private`` plus 'Treated_Var_Y'
          and 'Untreated_Var_Y' (sample variance of ``y`` within each
          cluster/arm; NaN when the arm has fewer than two samples).
        """
        if y is None:
            raise ValueError("y is required for the non-private summary.")
        return self._cluster_summary(x, t, y)

    @staticmethod
    def _index_by_cluster(summary):
        """Coerce a summary to a DataFrame indexed by 'Cluster_Index' when that column exists."""
        if not isinstance(summary, pd.DataFrame):
            summary = pd.DataFrame(summary)
        if 'Cluster_Index' in summary.columns:
            summary = summary.set_index('Cluster_Index')
        return summary

    def compute_info_gain(self, evaluation_df_host, evaluation_df_candidate):
        """
        Compute Information Gain between host and candidate datasets.

        Both inputs are expected to hold one row per cluster. Rows are matched
        on 'Cluster_Index' when that column is present (otherwise on the frame
        index). Clusters missing from the candidate contribute zero extra
        samples; clusters that only exist in the candidate are ignored because
        the host has no variance estimate for them.

        Parameters:
        - evaluation_df_host (DataFrame): Complete non-private summary for the host dataset.
        - evaluation_df_candidate (DataFrame): Partial summary for the candidate dataset.
          Sizes are read from 'Treated_Size' / 'Untreated_Size', falling back to
          'Treated_Size_Host' / 'Untreated_Size_Host' if only those exist.

        Returns:
        - float: Information Gain ratio (merged-estimator variance divided by
          host-estimator variance). NaN when the host variance is zero.
          Clusters whose variance term is NaN are skipped by the sums.
        """
        host = self._index_by_cluster(evaluation_df_host)
        candidate = self._index_by_cluster(evaluation_df_candidate)

        def candidate_sizes(column):
            source = column if column in candidate.columns else f"{column}_Host"
            return candidate[source].reindex(host.index, fill_value=0)

        # Compute var_host_estimator
        weight = host['Probability_Cluster'] ** 2 / 2
        sigma_treated_host = host['Treated_Var_Y']
        sigma_untreated_host = host['Untreated_Var_Y']
        n_treated_host = host['Treated_Size']
        n_untreated_host = host['Untreated_Size']

        var_host_estimator = (weight * (sigma_treated_host / n_treated_host
                                        + sigma_untreated_host / n_untreated_host)).sum()

        # Compute var_merged_estimator: same variances, pooled sample sizes
        n_treated_merged = n_treated_host + candidate_sizes('Treated_Size')
        n_untreated_merged = n_untreated_host + candidate_sizes('Untreated_Size')

        var_merged_estimator = (weight * (sigma_treated_host / n_treated_merged
                                          + sigma_untreated_host / n_untreated_merged)).sum()

        # Compute Information Gain ratio
        if var_host_estimator == 0:
            return float('nan')
        return float(var_merged_estimator / var_host_estimator)


ImportError: cannot import name '_BaseMixture' from 'sklearn.mixture' (/usr/local/lib/python3.9/site-packages/sklearn/mixture/__init__.py)

In [None]:
# Example usage
# -------------
# One ATE_clustering instance per algorithm name, all with the same likelihood threshold.
ate_gmm = ATE_clustering(algorithm='GMM', likelihood_threshold=0.5)
ate_hmm = ATE_clustering(algorithm='HMM', likelihood_threshold=0.5)
ate_moe = ATE_clustering(algorithm='MoE', likelihood_threshold=0.5)
ate_lda = ATE_clustering(algorithm='LDA', likelihood_threshold=0.5)
ate_dpmm = ATE_clustering(algorithm='DPMM', likelihood_threshold=0.5)
ate_bgmm = ATE_clustering(algorithm='BGMM', likelihood_threshold=0.5)
ate_fmm = ATE_clustering(algorithm='FMM', likelihood_threshold=0.5)
ate_em = ATE_clustering(algorithm='EM', likelihood_threshold=0.5)
ate_cmm = ATE_clustering(algorithm='CMM', likelihood_threshold=0.5)

# Host dataset: ten samples, two covariates, alternating treatment, evenly spaced outcome.
example_x_host = pd.DataFrame({
    'Feature1': list(range(1, 11)),
    'Feature2': list(range(11, 21)),
})
example_t_host = pd.Series([0, 1] * 5, name='Treatment')
example_y_host = pd.Series(list(range(25, 75, 5)), name='Outcome')

# Candidate dataset: covariates shifted by one, treatment pattern flipped.
example_x_candidate = pd.DataFrame({
    'Feature1': list(range(2, 12)),
    'Feature2': list(range(12, 22)),
})
example_t_candidate = pd.Series([1, 0] * 5, name='Treatment')
example_y_candidate = pd.Series([32, 36, 41, 38, 48, 52, 55, 60, 68, 70], name='Outcome')

# GMM walk-through: fit on the host, summarise the host with outcomes and the
# candidate without them, then compare the two summaries.
ate_gmm.fit(example_x_host, example_t_host)
evaluation_gmm_host = ate_gmm.compute_summary_stats_non_private(example_x_host, example_t_host, example_y_host)
evaluation_gmm_candidate = ate_gmm.evaluate_private(example_x_candidate, example_t_candidate)
info_gain_gmm = ate_gmm.compute_info_gain(evaluation_gmm_host, evaluation_gmm_candidate)
print("Information Gain (GMM):", info_gain_gmm)

# The same four steps apply to the other instances created above.


In [None]:
## Clustering class 

# parameters: 'clustering algorithm' 
# inputs: x, t

# fit function : fits the clustering algorithm and returns the fitted model

# evaluate_private function : 
# takes in as an input (x,t) 
# returns a vector of cluster index and treated/untreated sample size

# for later
# describe function: 
# input would be the fitted model
# returns a description of each cluster in terms of its covariate partition, with the corresponding cluster index



## Computation class

# we just take the variance of the outcome in each cluster for treated and untreated, looking only at our dataset


In [None]:
## EXPERIMENTS

In [None]:
## Data class 

# each object has a dataframe attribute
# some way to declare datasets and merge them easily
# function to merge datasets (could use add)


In [None]:
## Simulated examples

# 1 class = 1 experiment
# returns a set of datasets (doesn't have to be 2)

# experiment 1
# input parameters: (sigma, sample size, eps)
# the ideal match or some noisy version of that (have the noise as a parameter)

# this setting = in the regions where we have a lot of data is where overlap will be good 
# reinforcing the regions of good overlap
# we would like to have 

# dataset A: pi_A(x) = sigmoid(x) 
# X_A : 1 dimensional normal covariate with mean 0 

# dataset B: pi_B(x) = sigmoid(-x) 
# X_B : 1 dimensional normal covariate with mean eps
# eps being 0 would be the ideal match 
# the further eps is from 0, the more samples we will have in a region where there used to be high uncertainty

# experiment 2
# input parameters: (sigma, sample size, eps)
# same thing but play with sigma instead of eps

