In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from joblib import Parallel, delayed
import warnings

# Step 1: Generating Embeddings
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Disable tokenizers parallelism
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load the tokenizer and model for BioClinicalBERT
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT").to(device)

# Load the CSV file
file_path = 'cleaned_predictions_2ndjune.csv'
df = pd.read_csv(file_path)

# Drop the determinant_pr_ab column
df = df.drop(columns=['determinant_pr_ab'])

# Take a sample of 10000 rows
df = df.sample(n=10000, random_state=42)

# Function to generate embeddings
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length').to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.cpu().numpy()

# Generate embeddings for the 'text' column
df['embeddings'] = df['text'].apply(lambda x: get_embeddings(x).squeeze())

# Extract embeddings to a separate variable and save as a numpy file
embeddings = np.vstack(df['embeddings'].values)
np.save('embeddings_sampled.npy', embeddings)

# Optionally, save the dataframe with embeddings to a new CSV file
df.to_csv('predictions_with_embeddings_sampled.csv', index=False)

# Display the head of the dataframe
print(df.head())

# Step 2: Causal Inference with Propensity Score Matching
# Set random seed for reproducibility
np.random.seed(42)

# Load the dataframe with embeddings
df = pd.read_csv('predictions_with_embeddings_sampled.csv')

# Load the embeddings
embeddings = np.load('embeddings_sampled.npy')

# Flatten the embeddings to ensure they are in the correct format
flattened_embeddings = [embedding.flatten() for embedding in embeddings]

# Add flattened embeddings to the dataframe
df = df.drop(columns=['embeddings'])  # Remove the old embeddings column if it exists
flattened_embeddings_df = pd.DataFrame(flattened_embeddings, index=df.index)
df = pd.concat([df.reset_index(drop=True), flattened_embeddings_df.reset_index(drop=True)], axis=1)

# Define the outcome and confounders
outcome = 'opioid_pr_ab'
embedding_columns = flattened_embeddings_df.columns.tolist()
confounders = embedding_columns

# Ensure that the outcome column is numeric
df[outcome] = pd.to_numeric(df[outcome], errors='coerce')

# Function to calculate propensity scores
def calculate_propensity_scores(df, treatment, confounders):
    X = df[confounders].values
    y = df[treatment]
    
    # Ensure the target variable is binary for logistic regression
    if len(y.unique()) == 2:
        model = LogisticRegression(max_iter=5000)
        model.fit(X, y)
        propensity_scores = model.predict_proba(X)[:, 1]
        return propensity_scores
    else:
        raise ValueError(f"The target variable '{treatment}' is not binary.")

# Function to run propensity score matching and calculate ATE
def run_ps(df, X_data, T, y):
    ps = LogisticRegression(max_iter=5000, C=1e6, n_jobs=-1).fit(X_data, df[T]).predict_proba(X_data)[:, 1]
    weight = (df[T] - ps) / (ps * (1 - ps))  # define the weights
    return np.mean(weight * df[y])  # compute the ATE

# Initialize lists to store the results
determinants = []
original_sample_sizes = []
treated_sample_sizes = []
untreated_sample_sizes = []
Y1_values = []
Y0_values = []
ATE_values = []
p_values = []
ci_lowers = []
ci_uppers = []

# Suppress specific warnings
warnings.filterwarnings("ignore", message="lbfgs failed to converge")
warnings.filterwarnings("ignore", message="Pandas requires version")

# Loop through each determinant
for determinant in df.columns[2:15]:  # Skipping 'text' and 'opioid_pr_ab', and only taking the determinant columns
    print(f"Calculating for {determinant}")
    
    # Ensure the determinant column is numeric
    df[determinant] = pd.to_numeric(df[determinant], errors='coerce')
    
    # Calculate propensity scores
    try:
        df['propensity_score'] = calculate_propensity_scores(df, determinant, confounders)
        
        # Perform matching and estimate ATE using bootstrap sampling
        X_data = df[confounders].values
        y_data = df[outcome]
        
        # Original sample sizes
        original_sample_size = len(df)
        treated_sample_size = df[df[determinant] == 1].shape[0]
        untreated_sample_size = df[df[determinant] == 0].shape[0]
        
        # Run 1000 bootstrap samples
        bootstrap_sample = 1000
        ates = Parallel(n_jobs=-1)(delayed(run_ps)(df.sample(frac=1, replace=True, random_state=42).reset_index(drop=True), X_data, determinant, y_data)
                                   for _ in range(bootstrap_sample))
        ates = np.array(ates)
        ci_lower = np.percentile(ates, 2.5)
        ci_upper = np.percentile(ates, 97.5)
        ATE = np.mean(ates)
        
        # Fit OLS model for p-value
        model = sm.OLS(df[outcome], sm.add_constant(df[[determinant, 'propensity_score']].astype(float)))
        result = model.fit()
        p_value = result.pvalues[determinant]
        
        # Y1 and Y0 values
        Y1 = df.loc[df[determinant] == 1, outcome].mean()
        Y0 = df.loc[df[determinant] == 0, outcome].mean()
        
        determinants.append(determinant)
        original_sample_sizes.append(original_sample_size)
        treated_sample_sizes.append(treated_sample_size)
        untreated_sample_sizes.append(untreated_sample_size)
        Y1_values.append(Y1)
        Y0_values.append(Y0)
        ATE_values.append(ATE)
        p_values.append(p_value)
        ci_lowers.append(ci_lower)
        ci_uppers.append(ci_upper)
    
    except ValueError as e:
        print(f"Skipping {determinant}: {e}")

# Create a DataFrame with the results
results_df = pd.DataFrame({
    'Determinant': determinants,
    'Original Sample Size': original_sample_sizes,
    'Treated Sample Size': treated_sample_sizes,
    'Untreated Sample Size': untreated_sample_sizes,
    'Y1': Y1_values,
    'Y0': Y0_values,
    'ATE': ATE_values,
    'p-value': p_values,
    '95% CI Lower': ci_lowers,
    '95% CI Upper': ci_uppers
})

# Display the results DataFrame
print(results_df)


    Found GPU%d %s which is of cuda capability %d.%d.
    PyTorch no longer supports this GPU because it is too old.
    The minimum cuda capability supported by this library is %d.%d.
    


                                                     text  opioid_pr_ab  \
6292     \nname                      unit no   \n \nad...         False   
92111    \nname                   unit no   \n \nadmis...          True   
209235   \nname                     unit no   \n \nadm...         False   
225051   \nname                  unit no   \n \nadmiss...          True   
143620   \nname                   unit no   \n \nadmis...         False   

        determinant_1  determinant_2  determinant_3  determinant_4  \
6292            False           True          False          False   
92111           False           True          False          False   
209235          False           True          False          False   
225051          False           True          False          False   
143620          False           True          False          False   

        determinant_5  determinant_6  determinant_7  determinant_8  \
6292            False          False          False       

In [2]:
# Save the results to a CSV file
results_df.to_csv('causal_inference_results_3k.csv', index=False)

In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from joblib import Parallel, delayed
import statsmodels.api as sm

# Load the dataframe with embeddings
df = pd.read_csv('predictions_with_embeddings_sampled.csv')

# Load the embeddings
embeddings = np.load('embeddings_sampled.npy')

# Flatten the embeddings to ensure they are in the correct format
flattened_embeddings = [embedding.flatten() for embedding in embeddings]

# Add flattened embeddings to the dataframe
df = df.drop(columns=['embeddings'])  # Remove the old embeddings column if it exists
flattened_embeddings_df = pd.DataFrame(flattened_embeddings, index=df.index)
df = pd.concat([df.reset_index(drop=True), flattened_embeddings_df.reset_index(drop=True)], axis=1)

# Take a sample of 10,000 rows to maintain consistency
df = df.sample(n=10000, random_state=42).reset_index(drop=True)

# Define the outcome and confounders
outcome = 'opioid_pr_ab'
embedding_columns = flattened_embeddings_df.columns.tolist()
confounders = embedding_columns

# Ensure that the outcome column is numeric
df[outcome] = pd.to_numeric(df[outcome], errors='coerce')

# Handling class imbalance for determinant 5 by manual oversampling
determinant = 'determinant_5'
X = df[confounders].values
y = df[determinant]

# Manually oversample the minority class
if len(np.unique(y)) < 2:
    print(f"Skipping manual oversampling for {determinant} due to insufficient samples in one class.")
else:
    class_counts = np.bincount(y)
    majority_class = np.argmax(class_counts)
    minority_class = 1 - majority_class
    minority_class_count = class_counts[minority_class]
    majority_class_count = class_counts[majority_class]

    # Duplicate the minority class samples until they match the majority class
    minority_class_indices = np.where(y == minority_class)[0]
    oversample_indices = np.random.choice(minority_class_indices, majority_class_count, replace=True)
    
    X_oversampled = np.vstack((X, X[oversample_indices]))
    y_oversampled = np.hstack((y, y[oversample_indices]))

    # Update the dataframe with the oversampled data
    df_resampled = pd.DataFrame(X_oversampled, columns=confounders)
    df_resampled[determinant] = y_oversampled
    df_resampled[outcome] = df[outcome].sample(n=len(df_resampled), replace=True, random_state=42).reset_index(drop=True)

    # Ensure consistent sample size of 10,000 after oversampling
    df_resampled = df_resampled.sample(n=10000, random_state=42).reset_index(drop=True)

    # Function to calculate propensity scores
    def calculate_propensity_scores(df, treatment, confounders):
        X = df[confounders].values
        y = df[treatment]
        
        # Ensure the target variable is binary for logistic regression
        if len(np.unique(y)) == 2:
            model = LogisticRegression(max_iter=5000)
            model.fit(X, y)
            propensity_scores = model.predict_proba(X)[:, 1]
            return propensity_scores
        else:
            raise ValueError(f"The target variable '{treatment}' is not binary.")

    # Function to run propensity score matching and calculate ATE
    def run_ps(df, X_data, T, y):
        ps = LogisticRegression(max_iter=5000, C=1e6, n_jobs=-1).fit(X_data, df[T]).predict_proba(X_data)[:, 1]
        weight = (df[T] - ps) / (ps * (1 - ps))  # define the weights
        return np.mean(weight * df[y])  # compute the ATE

    # Calculate propensity scores for determinant 5
    df_resampled['propensity_score'] = calculate_propensity_scores(df_resampled, determinant, confounders)

    # Perform matching and estimate ATE using bootstrap sampling
    X_data = df_resampled[confounders].values
    y_data = df_resampled[outcome]

    # Original sample sizes
    original_sample_size = len(df_resampled)
    treated_sample_size = df_resampled[df_resampled[determinant] == 1].shape[0]
    untreated_sample_size = df_resampled[df_resampled[determinant] == 0].shape[0]

    # Run 1000 bootstrap samples
    bootstrap_sample = 1000
    ates = Parallel(n_jobs=-1)(delayed(run_ps)(df_resampled.sample(frac=1, replace=True).reset_index(drop=True), X_data, determinant, y_data)
                               for _ in range(bootstrap_sample))
    ates = np.array(ates)
    ci_lower = np.percentile(ates, 2.5)
    ci_upper = np.percentile(ates, 97.5)
    ATE = np.mean(ates)

    # Fit OLS model for p-value
    model = sm.OLS(df_resampled[outcome], sm.add_constant(df_resampled[[determinant, 'propensity_score']].astype(float)))
    result = model.fit()
    p_value = result.pvalues[determinant]

    # Y1 and Y0 values
    Y1 = df_resampled.loc[df_resampled[determinant] == 1, outcome].mean()
    Y0 = df_resampled.loc[df_resampled[determinant] == 0, outcome].mean()

    # Print the results for determinant 5
    print(f"Results for determinant 5:")
    print(f"Original Sample Size: {original_sample_size}")
    print(f"Treated Sample Size: {treated_sample_size}")
    print(f"Untreated Sample Size: {untreated_sample_size}")
    print(f"Y1: {Y1}")
    print(f"Y0: {Y0}")
    print(f"ATE: {ATE}")
    print(f"p-value: {p_value}")
    print(f"95% CI Lower: {ci_lower}")
    print(f"95% CI Upper: {ci_upper}")
    
    # Calculate Conditional Average Treatment Effect (CATE)
    def calculate_cate(df, treatment, outcome):
        cate_result = {}
        treated = df[df[treatment] == 1]
        untreated = df[df[treatment] == 0]
        cate_result['Treated'] = treated[outcome].mean()
        cate_result['Untreated'] = untreated[outcome].mean()
        cate_result['CATE'] = treated[outcome].mean() - untreated[outcome].mean()
        return cate_result

    # Calculate CATE for determinant 5
    cate_result = calculate_cate(df_resampled, determinant, outcome)

    # Print CATE results
    print("CATE Results:")
    for key, value in cate_result.items():
        print(f"{key}: {value}")



Results for determinant 5:
Original Sample Size: 10000
Treated Sample Size: 4966
Untreated Sample Size: 5034
Y1: 0.03201772049939589
Y0: 0.037544696066746125
ATE: 0.00016230458592381326
p-value: 0.39561241867817076
95% CI Lower: -0.06638903469416536
95% CI Upper: 0.058431122968938376
CATE Results:
Treated: 0.03201772049939589
Untreated: 0.037544696066746125
CATE: -0.005526975567350233


In [5]:
# Given values
Y1 = 0.03201772049939589
Y0 = 0.037544696066746125

# Calculate CATE
CATE = Y1 - Y0

# Print CATE results
print("CATE Results:")
print(f"Treated (Y1): {Y1}")
print(f"Untreated (Y0): {Y0}")
print(f"CATE: {CATE}")


CATE Results:
Treated (Y1): 0.03201772049939589
Untreated (Y0): 0.037544696066746125
CATE: -0.005526975567350233
