In [161]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.stats import pearsonr

# Add two synthetic "resection mask connectivity" columns to the example
# dataset and write the result back to the same CSV file.
DATA_PATH = '/Users/jt041/repos/brain_mapping_intro/example_data/Synthetic_Epilepsy_Patient_Data.csv'
NOISE_SEED = 0  # fixed seed so that re-running this cell regenerates identical columns
rng = np.random.default_rng(NOISE_SEED)

df = pd.read_csv(DATA_PATH)
# The file is saved below with index=False, so 'Unnamed: 0' can only be present
# before the first run; errors='ignore' keeps the cell re-runnable afterwards.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Temporary helper column: drop in seizure frequency after surgery.
df['seiz_reduction'] = df['Seizure_Frequency_Before'] - df['Seizure_Frequency_After']

# Pulvinar connectivity: standardised seizure reduction minus standard-normal
# noise, so the column correlates positively (but imperfectly) with the reduction.
resection_mask_connectivity_to_pulvinar = StandardScaler().fit_transform(df[['seiz_reduction']]).flatten()
resection_mask_connectivity_to_pulvinar = resection_mask_connectivity_to_pulvinar - rng.standard_normal(df.shape[0])
# Shift so the smallest value is exactly 0. Subtracting the minimum matches the
# previous `+ abs(min)` whenever the minimum is negative and, unlike it, does
# not push the values further up when the minimum happens to be positive.
resection_mask_connectivity_to_pulvinar = resection_mask_connectivity_to_pulvinar - np.min(resection_mask_connectivity_to_pulvinar)
df['resection_mask_connectivity_to_pulvinar'] = resection_mask_connectivity_to_pulvinar

# DLPFC connectivity: min-max scaled *negated* seizure reduction plus
# standard-normal noise, so the column correlates negatively with the reduction.
resection_mask_connectivity_to_dlpfc = MinMaxScaler().fit_transform(-df[['seiz_reduction']]).flatten()
resection_mask_connectivity_to_dlpfc = resection_mask_connectivity_to_dlpfc + rng.standard_normal(df.shape[0])
resection_mask_connectivity_to_dlpfc = resection_mask_connectivity_to_dlpfc - np.min(resection_mask_connectivity_to_dlpfc)
df['resection_mask_connectivity_to_dlpfc'] = resection_mask_connectivity_to_dlpfc

# Optional sanity checks of the induced correlations (uncomment to inspect):
# pearsonr(df['seiz_reduction'], resection_mask_connectivity_to_pulvinar)
# pearsonr(df['seiz_reduction'], resection_mask_connectivity_to_dlpfc)
# pearsonr(df['seiz_reduction'], df['EEG_Variance'])

# The helper column is not part of the published dataset.
df = df.drop(columns=['seiz_reduction'])
df.to_csv(DATA_PATH, index=False)

In [12]:
import pandas as pd
import numpy as np
from scipy import stats
import random

# Load the current patient table; the summaries below feed the generators.
df = pd.read_csv('/Users/jt041/repos/brain_mapping_intro/example_data/Synthetic_Epilepsy_Patient_Data.csv')

# Age: mean and sample standard deviation of the existing patients
age_mean, age_std = df['Age'].mean(), df['Age'].std()

# Relative frequency (value -> share) of each categorical column
gender_dist, surgery_loc_dist, outcome_dist = (
    df[column].value_counts(normalize=True).to_dict()
    for column in ('Gender', 'Surgery_Location', 'Outcome_Score')
)

# Outcome shares conditioned on surgery location: {location: {outcome: share}}
loc_outcome_map = {
    loc: df.loc[df['Surgery_Location'] == loc, 'Outcome_Score']
    .value_counts(normalize=True)
    .to_dict()
    for loc in df['Surgery_Location'].unique()
}

# Helper functions to generate realistic data
def generate_seizure_before(mean=35, std=15, minimum=1):
    """Draw a synthetic pre-surgery seizure frequency.

    One value is sampled with ``np.random.normal(mean, std)``, truncated
    toward zero with ``int`` and then floored at ``minimum``.

    Args:
        mean: Centre of the normal distribution. The default of 35 is the
            value the original author chose to resemble the original data.
        std: Standard deviation of the normal distribution (default 15).
        minimum: Smallest value that may be returned (default 1).

    Returns:
        int: The sampled frequency, never below ``minimum``.
    """
    return max(minimum, int(np.random.normal(mean, std)))

def generate_seizure_after(before_val, outcome):
    """Derive a post-surgery seizure frequency from the pre-surgery one.

    A reduction fraction is drawn uniformly from a band selected by
    ``outcome``; better outcomes use bands with larger reductions. Any
    outcome other than "Engel I", "Engel II" or "Engel III" is handled as
    Engel IV, whose band also allows a worsening of up to 20%.

    Args:
        before_val: Seizure frequency before surgery.
        outcome: Outcome label, e.g. "Engel I".

    Returns:
        ``before_val`` scaled by ``1 - fraction`` and rounded. For the
        Engel I and Engel IV bands the result is additionally floored at 0.
    """
    # (label, smallest fraction, largest fraction, floor result at zero?)
    reduction_bands = (
        ("Engel I", 0.7, 1.0, True),     # very good outcome: 70-100% reduction
        ("Engel II", 0.5, 0.7, False),   # good outcome: 50-70% reduction
        ("Engel III", 0.3, 0.5, False),  # worthwhile improvement: 30-50% reduction
    )

    # Fallback band (Engel IV): 0-30% reduction, or up to 20% worse.
    low, high, floor_at_zero = -0.2, 0.3, True
    for label, band_low, band_high, band_floor in reduction_bands:
        if outcome == label:
            low, high, floor_at_zero = band_low, band_high, band_floor
            break

    remaining = round(before_val * (1 - random.uniform(low, high)))
    if floor_at_zero:
        return max(0, remaining)
    return remaining

def generate_eeg_variance(outcome, surgery_location):
    """Sample a synthetic EEG variance for one patient.

    A base value is drawn uniformly from a range chosen by ``outcome`` and
    then multiplied by a uniformly drawn factor whose spread is chosen by
    ``surgery_location``.

    Args:
        outcome: Outcome label; anything other than "Engel I", "Engel II"
            or "Engel III" uses the wide Engel IV range.
        surgery_location: Location label; anything other than
            "Temporal Lobe" or "Frontal Lobe" uses the widest factor range
            (the original author labelled this case "Occipital Lobe").

    Returns:
        float: The scaled value rounded to two decimal places.
    """
    def pick(key, table, fallback):
        # First entry whose label equals ``key``; ``fallback`` if none does.
        for label, bounds in table:
            if key == label:
                return bounds
        return fallback

    # Base range per outcome; the fallback (Engel IV) is deliberately wider.
    base_low, base_high = pick(
        outcome,
        (("Engel I", (35, 60)), ("Engel II", (30, 50)), ("Engel III", (25, 45))),
        (20, 65),
    )
    # Multiplicative adjustment per location.
    factor_low, factor_high = pick(
        surgery_location,
        (("Temporal Lobe", (0.95, 1.05)), ("Frontal Lobe", (0.9, 1.1))),
        (0.85, 1.15),
    )

    # Draw order matters for seeded runs: base first, then the factor.
    base_value = random.uniform(base_low, base_high)
    adjusted = base_value * random.uniform(factor_low, factor_high)
    return round(adjusted, 2)

# Generate synthetic data
def generate_synthetic_data(num_samples=50):
    """Create synthetic patient rows that continue the IDs found in ``df``.

    Reads the module-level ``df``, ``age_mean``, ``age_std``, ``gender_dist``
    and ``surgery_loc_dist`` and calls ``generate_seizure_before``,
    ``generate_seizure_after`` and ``generate_eeg_variance``.

    Args:
        num_samples: Number of rows to generate (default 50).

    Returns:
        pandas.DataFrame: One row per synthetic patient with the columns
        Patient_ID, Age, Gender, Seizure_Frequency_Before,
        Seizure_Frequency_After, Surgery_Location, EEG_Variance and
        Outcome_Score.
    """
    # Compare the numeric parts as integers: a max() over strings is
    # lexicographic, so "999" would outrank "1000" and new IDs would collide
    # with existing ones as soon as the numbers exceed three digits.
    existing_numbers = (
        df['Patient_ID'].dropna().str.replace('P', '', regex=False).astype(int)
    )
    # Start numbering at 1 when there are no existing IDs.
    last_id = int(existing_numbers.max()) if len(existing_numbers) else 0

    # Sampling tables do not change between rows, so build them once.
    genders = list(gender_dist.keys())
    gender_weights = list(gender_dist.values())
    locations = list(surgery_loc_dist.keys())
    location_weights = list(surgery_loc_dist.values())

    # Add Engel II which was missing from original data (rare)
    full_outcomes = ["Engel I", "Engel II", "Engel III", "Engel IV"]
    outcome_weights = [0.4, 0.1, 0.2, 0.3]  # Estimated distribution
    # Location-specific overrides; any other location keeps outcome_weights.
    outcome_weights_by_location = {
        "Temporal Lobe": [0.5, 0.15, 0.15, 0.2],  # better outcomes
        "Frontal Lobe": [0.2, 0.1, 0.3, 0.4],     # worse outcomes
    }

    synthetic_data = []
    for i in range(num_samples):
        patient_id = f"P{last_id + i + 1:03d}"
        # Age is drawn around the observed mean and clamped to [18, 85].
        age = max(18, min(85, round(np.random.normal(age_mean, age_std))))
        gender = random.choices(genders, weights=gender_weights)[0]
        surgery_loc = random.choices(locations, weights=location_weights)[0]

        outcome_weights_adj = outcome_weights_by_location.get(surgery_loc, outcome_weights)
        outcome = random.choices(full_outcomes, weights=outcome_weights_adj)[0]

        seizure_before = generate_seizure_before()
        seizure_after = generate_seizure_after(seizure_before, outcome)
        eeg_variance = generate_eeg_variance(outcome, surgery_loc)

        synthetic_data.append({
            'Patient_ID': patient_id,
            'Age': float(age),
            'Gender': gender,
            'Seizure_Frequency_Before': seizure_before,
            'Seizure_Frequency_After': float(seizure_after),
            'Surgery_Location': surgery_loc,
            'EEG_Variance': eeg_variance,
            'Outcome_Score': outcome
        })

    return pd.DataFrame(synthetic_data)

# Build 285 synthetic patient rows
new_data = generate_synthetic_data(num_samples=285)

# Stack the synthetic rows beneath the existing ones and renumber the index.
# Columns that exist in only one of the two frames are filled with NaN by concat.
combined_df = pd.concat(objs=(df, new_data), axis=0, ignore_index=True)

# Save the enlarged table over the source CSV, without the index column
# (to preview instead of saving: print(combined_df.to_csv(index=False)))
combined_df.to_csv(
    '/Users/jt041/repos/brain_mapping_intro/example_data/Synthetic_Epilepsy_Patient_Data.csv',
    index=False,
)