In [3]:
import XGBoost as xgb
import pandas as pd

In [4]:
df = pd.read_csv('data.csv')

In [5]:
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

def evaluate_model(X_train, X_test, y_train, y_test):
    """Helper function to train and evaluate model"""
    model = xgb.XGBClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred)
    }

# Initialize results storage
results = []

# Get unique subjects (assuming 'id' column contains subject identifiers)
subjects = df['id'].unique()

# For reproducibility
np.random.seed(42)

# Different sample sizes to compare
sample_sizes = [10, 100]

for n_samples in sample_sizes:
    # Repeat experiment multiple times for robust results
    for iteration in range(10):  # 10 iterations for each sample size
        
        # Initialize data for this iteration
        iteration_data = []
        
        # For each subject
        for subject in subjects:
            # Get subject's data
            subject_data = df[df['id'] == subject]
            
            # If subject has enough samples
            if len(subject_data) >= n_samples:
                # Randomly select n_samples
                selected_data = subject_data.sample(n=n_samples)
                iteration_data.append(selected_data)
        
        # Combine all selected data
        iteration_df = pd.concat(iteration_data)
        
        # Prepare features and target
        X = iteration_df.drop(['id', 'diagnosis'], axis=1)
        y = iteration_df['diagnosis'].map({'M': 1, 'B': 0})
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=iteration
        )
        
        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        # Evaluate model
        metrics = evaluate_model(X_train_scaled, X_test_scaled, y_train, y_test)
        
        # Store results
        results.append({
            'samples_per_subject': n_samples,
            'iteration': iteration,
            'n_subjects': len(iteration_data),
            **metrics
        })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Calculate summary statistics
summary = results_df.groupby

ValueError: No objects to concatenate