In [None]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
from scipy.optimize import linear_sum_assignment
from scipy.spatial.distance import cdist

# Simulated Data: Patients with Treatment Time and Symptoms
def generate_sample_data(n=200, seed=42):
    """Build a reproducible synthetic patient table.

    Parameters
    ----------
    n : int, default 200
        Number of patients (rows) to generate.
    seed : int or None, default 42
        Seed for the private random generator. The default yields the same
        values as seeding NumPy's global generator with 42.

    Returns
    -------
    pandas.DataFrame
        One row per patient with the columns ``patient_id`` (0..n-1),
        ``treatment_time`` (3, 6, 9, 12 or NaN, each drawn with
        probability 0.2) and the symptom scores ``pain``, ``urgency`` and
        ``frequency`` (integers drawn from 0..9).
    """
    # A private RandomState produces the same stream as np.random.seed(seed)
    # followed by np.random.* calls, but does not reseed the global generator
    # behind the caller's back.
    rng = np.random.RandomState(seed)

    # NaN marks a patient without a recorded treatment time.
    treatment_options = [3, 6, 9, 12, np.nan]
    treatment_probs = [0.2, 0.2, 0.2, 0.2, 0.2]

    # Draw order (treatment_time, pain, urgency, frequency) is kept fixed so
    # that the generated values stay stable for a given seed.
    data = pd.DataFrame({
        'patient_id': np.arange(n),
        'treatment_time': rng.choice(treatment_options, size=n, p=treatment_probs),
        'pain': rng.randint(0, 10, size=n),
        'urgency': rng.randint(0, 10, size=n),
        'frequency': rng.randint(0, 10, size=n),
    })
    return data

# Build the simulated patient table and preview its first rows.
data = generate_sample_data()
print(data.head())

# Risk Set Matching: Finding Controls for Treated Patients
def risk_set_matching(data):
    """Pair treated patients with untreated controls by symptom similarity.

    Rows with a non-missing ``treatment_time`` are treated patients; rows
    where it is missing form the control pool. The treatment time itself is
    only used for this split, not for the distance.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``treatment_time``, ``pain``, ``urgency`` and
        ``frequency`` columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(matched_treated, matched_controls)``, both re-indexed from 0 so
        that row ``i`` of one frame is paired with row ``i`` of the other.
    """
    symptom_cols = ['pain', 'urgency', 'frequency']

    # Split the cohort on whether a treatment time was recorded.
    untreated_mask = data['treatment_time'].isna()
    control_pool = data[untreated_mask]
    treated_pool = data[~untreated_mask]

    # cost[i, j] = Euclidean distance between treated i and control j
    # in symptom space.
    cost = cdist(
        treated_pool[symptom_cols],
        control_pool[symptom_cols],
        metric='euclidean',
    )

    # One-to-one assignment that minimises the total distance.
    treated_pos, control_pos = linear_sum_assignment(cost)

    paired_treated = treated_pool.iloc[treated_pos].reset_index(drop=True)
    paired_controls = control_pool.iloc[control_pos].reset_index(drop=True)
    return paired_treated, paired_controls

# Match the simulated cohort; the two returned frames are aligned row by row.
matched_treated, matched_controls = risk_set_matching(data)

# Performing a Simple Paired t-Test
def analyze_results(matched_treated, matched_controls):
    """Run a paired t-test on each symptom across the matched pairs.

    Parameters
    ----------
    matched_treated, matched_controls : pandas.DataFrame
        Row-aligned frames of equal length containing the ``pain``,
        ``urgency`` and ``frequency`` columns; row ``i`` of each frame forms
        one matched pair.

    Returns
    -------
    tuple
        ``(t_stat, p_value)``; each holds one entry per symptom, in the
        order pain, urgency, frequency.
    """
    symptom_cols = ['pain', 'urgency', 'frequency']

    # Treated-minus-control difference per pair (rows) and symptom (columns).
    diff = matched_treated[symptom_cols].values - matched_controls[symptom_cols].values

    # A paired t-test is a one-sample t-test of the differences against 0.
    # statsmodels' `sm.stats` namespace has no `ttest_1samp`, so the SciPy
    # implementation is used; it tests each column separately.
    t_stat, p_value = stats.ttest_1samp(diff, 0)
    return t_stat, p_value

# Test the matched pairs and print the returned (t_stat, p_value) tuple.
results = analyze_results(matched_treated, matched_controls)
print("T-test statistics:", results)
