In [None]:

import numpy as np
from sklearn.model_selection import train_test_split # Used here for comparison

def _take_rows(data, positions):
    """Select rows of *data* by integer position, whatever container it is."""
    if hasattr(data, "iloc"):
        # pandas objects: plain [] would select columns (DataFrame) or use
        # index labels (Series), so positional access must go through iloc.
        return data.iloc[positions]
    if isinstance(data, (list, tuple)):
        # Built-in sequences do not support fancy indexing.
        data = np.asarray(data)
    return data[positions]


def stratified_split_from_scratch(X, y, test_size=0.2, random_state=None):
    """
    Performs a stratified train-test split manually.

    Each class found in ``y`` is split separately, so the test set receives
    ``int(n_class * test_size)`` samples of every class (rounded down) and
    the remaining samples of that class go to the training set.

    Args:
        X (pd.DataFrame, np.array or sequence): Features, one row per sample.
        y (pd.Series, np.array or sequence): Target labels, one per sample.
        test_size (float): Proportion of each class to put in the test
            split. Must lie in [0, 1].
        random_state (int): Seed for reproducibility. Any integer, including
            0, gives a repeatable split and leaves NumPy's global random
            state untouched. With None the global ``np.random`` state is
            used.

    Returns:
        tuple: (X_train, X_test, y_train, y_test). pandas inputs come back
        as pandas objects (selected by position); everything else comes back
        as the result of NumPy-style row indexing.

    Raises:
        ValueError: If ``test_size`` is outside [0, 1] or if ``X`` and ``y``
            do not contain the same number of samples.
    """
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(
            f"test_size must be between 0 and 1 inclusive, got {test_size!r}"
        )

    labels = np.asarray(y)
    n_rows = X.shape[0] if hasattr(X, "shape") else len(X)
    if n_rows != labels.shape[0]:
        raise ValueError(
            f"X and y have inconsistent numbers of samples: "
            f"{n_rows} != {labels.shape[0]}"
        )

    # Compare against None so that a seed of 0 is honoured, and draw from a
    # private generator so the caller's global RNG state is not reseeded.
    if random_state is None:
        rng = np.random
    else:
        rng = np.random.RandomState(random_state)

    # Start from an empty integer array so concatenation also works when
    # there are no classes at all (empty input).
    train_parts = [np.empty(0, dtype=np.intp)]
    test_parts = [np.empty(0, dtype=np.intp)]

    # Sample within each class (stratum) independently.
    for class_label in np.unique(labels):
        class_indices = np.where(labels == class_label)[0]
        n_test = int(len(class_indices) * test_size)

        test_subset = rng.choice(class_indices, n_test, replace=False)
        # setdiff1d yields a deterministic (sorted) remainder, unlike a
        # set difference whose iteration order is an implementation detail.
        train_subset = np.setdiff1d(class_indices, test_subset)

        train_parts.append(train_subset)
        test_parts.append(test_subset)

    train_indices = np.concatenate(train_parts)
    test_indices = np.concatenate(test_parts)

    # Shuffle so the classes are interleaved instead of grouped by label.
    rng.shuffle(train_indices)
    rng.shuffle(test_indices)

    X_train = _take_rows(X, train_indices)
    X_test = _take_rows(X, test_indices)
    y_train = _take_rows(y, train_indices)
    y_test = _take_rows(y, test_indices)

    return X_train, X_test, y_train, y_test

# --- Example Usage ---

# Step 1: build a tiny toy dataset (six samples of class 1, four of class 0)
from sklearn.datasets import make_classification
X = np.arange(100, 110)
y = np.array([1, 1, 1, 1, 0, 0, 0, 0, 1, 1])

# Step 2: split it with the hand-written stratified splitter
(
    X_train_scratch,
    X_test_scratch,
    y_train_scratch,
    y_test_scratch,
) = stratified_split_from_scratch(X, y, test_size=0.5, random_state=42)

# Step 3: run scikit-learn's stratified split on the same data for comparison
(
    X_train_sklearn,
    X_test_sklearn,
    y_train_sklearn,
    y_test_sklearn,
) = train_test_split(X, y, test_size=0.5, stratify=y, random_state=42)




AttributeError: 'numpy.ndarray' object has no attribute 'index'