In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer

from zadania import \
    Dataset, \
    RandomSplitter, \
    RandomLabelSplitter, \
    TimeSplitter, \
    CVSplitter

In [2]:
def breast_cancer():
    """Load the scikit-learn breast cancer dataset as a pair of arrays.

    Returns:
        tuple: ``(data, target)``, i.e. the ``data`` and ``target``
        attributes of the object returned by ``load_breast_cancer()``.
    """
    bunch = load_breast_cancer()
    features, targets = bunch.data, bunch.target
    return features, targets

def show_dataset_and_splits(dataset, splitter, seed=43):
    """Print a dataset in full, followed by the contents of each split.

    Args:
        dataset: Object exposing ``X``, ``y`` and ``labels`` attributes.
        splitter: Object exposing ``get_splits(dataset, seed=...)``, which
            yields splits that each have ``train`` and ``test`` attributes
            (themselves exposing ``X``, ``y`` and ``labels``).
        seed: Value forwarded to ``splitter.get_splits``.

    Returns:
        None. Everything is written to standard output.
    """
    # (caption, attribute) pairs for the whole-dataset header.
    dataset_rows = (
        ("X:\n", "X"),
        ("y:\n", "y"),
        ("labels:\n", "labels"),
    )
    # (caption, split side, attribute) triples, in print order.
    split_rows = (
        ("X_train:\n", "train", "X"),
        ("y_train:\n", "train", "y"),
        ("labels_train:\n", "train", "labels"),
        ("X_test:\n", "test", "X"),
        ("y_test:\n", "test", "y"),
        ("labels_test:\n", "test", "labels"),
    )

    for caption, attr in dataset_rows:
        print(caption, getattr(dataset, attr))

    all_splits = splitter.get_splits(dataset, seed=seed)
    for number, current in enumerate(all_splits, start=1):
        print()
        print("Split:", number)
        for caption, side, attr in split_rows:
            print(caption, getattr(getattr(current, side), attr))

def show_dataset_and_splits_shapes(dataset, splitter, seed=43):
    """Print only the array shapes of a dataset and of each of its splits.

    Label shapes are printed only when ``dataset.labels`` is not ``None``.

    Args:
        dataset: Object exposing ``X``, ``y`` and ``labels``; the first two
            (and ``labels`` when present) must have a ``shape`` attribute.
        splitter: Object exposing ``get_splits(dataset, seed=...)``, which
            yields splits that each have ``train`` and ``test`` attributes.
        seed: Value forwarded to ``splitter.get_splits``.

    Returns:
        None. Everything is written to standard output.
    """
    # (caption, attribute) pairs for the whole-dataset header.
    dataset_rows = (
        ("X:", "X"),
        ("y:", "y"),
        ("labels:", "labels"),
    )
    # (caption, split side, attribute) triples, in print order.
    split_rows = (
        ("X_train:", "train", "X"),
        ("y_train:", "train", "y"),
        ("labels_train:", "train", "labels"),
        ("X_test:", "test", "X"),
        ("y_test:", "test", "y"),
        ("labels_test:", "test", "labels"),
    )

    for caption, attr in dataset_rows:
        # Label rows are skipped entirely for unlabeled datasets.
        if attr == "labels" and dataset.labels is None:
            continue
        print(caption, getattr(dataset, attr).shape)

    all_splits = splitter.get_splits(dataset, seed=seed)
    for number, current in enumerate(all_splits, start=1):
        print()
        print("Split:", number)
        for caption, side, attr in split_rows:
            if attr == "labels" and dataset.labels is None:
                continue
            print(caption, getattr(getattr(current, side), attr).shape)

In [3]:
# BREAST CANCER DATASET
# Features and targets are loaded from scikit-learn via breast_cancer().
breast_cancer_dataset = Dataset(*breast_cancer())

# DUMMY DATASET
# Five single-feature samples; targets are three 0s followed by two 1s.
X = np.arange(5).reshape(-1, 1)
y = np.repeat([0.0, 1.0], [3, 2])
dummy_dataset = Dataset(X, y)

# DUMMY LABELED DATASET
# labels represent indivisible groups
X = np.arange(10).reshape(-1, 1)
y = np.repeat([0.0, 1.0], [5, 5])
labels = np.array([0, 2, 3, 0, 6, 2, 7, 7, 0, 3])
dummy_labeled_dataset = Dataset(X, y, labels)

# DUMMY TIME DATASET
# labels represent the timestamp used by the Time Splitter
X = np.arange(5).reshape(-1, 1)
y = np.repeat([0.0, 1.0], [3, 2])
labels = np.arange(4, -1, -1)  # descending timestamps: 4, 3, 2, 1, 0
dummy_time_dataset = Dataset(X, y, labels)

### Random Splitter

In [4]:
# Three entries -> three splits. Judging by the recorded shapes below, each
# entry is the fraction of samples held out as that split's test set.
random_splitter = RandomSplitter(test_percentages=[0.2, 0.6, 0.4])

In [5]:
# Shapes only: the breast cancer dataset (569 samples x 30 features) is too
# large to print in full.
show_dataset_and_splits_shapes(breast_cancer_dataset, random_splitter, seed=43)

X: (569, 30)
y: (569,)

Split: 1
X_train: (456, 30)
y_train: (456,)
X_test: (113, 30)
y_test: (113,)

Split: 2
X_train: (228, 30)
y_train: (228,)
X_test: (341, 30)
y_test: (341,)

Split: 3
X_train: (342, 30)
y_train: (342,)
X_test: (227, 30)
y_test: (227,)


In [6]:
# Full dump on the 5-sample dummy dataset so individual assignments are visible.
show_dataset_and_splits(dummy_dataset, random_splitter, seed=43)

X:
 [[0]
 [1]
 [2]
 [3]
 [4]]
y:
 [ 0.  0.  0.  1.  1.]
labels:
 None

Split: 1
X_train:
 [[2]
 [1]
 [0]
 [4]]
y_train:
 [ 0.  0.  0.  1.]
labels_train:
 None
X_test:
 [[3]]
y_test:
 [ 1.]
labels_test:
 None

Split: 2
X_train:
 [[0]
 [2]]
y_train:
 [ 0.  0.]
labels_train:
 None
X_test:
 [[3]
 [4]
 [1]]
y_test:
 [ 1.  1.  0.]
labels_test:
 None

Split: 3
X_train:
 [[4]
 [2]
 [3]]
y_train:
 [ 1.  0.  1.]
labels_train:
 None
X_test:
 [[1]
 [0]]
y_test:
 [ 0.  0.]
labels_test:
 None


In [7]:
# Same dataset and splitter with a different seed: the recorded partitions
# differ from the seed=43 run.
show_dataset_and_splits(dummy_dataset, random_splitter, seed=7)

X:
 [[0]
 [1]
 [2]
 [3]
 [4]]
y:
 [ 0.  0.  0.  1.  1.]
labels:
 None

Split: 1
X_train:
 [[3]
 [2]
 [1]
 [4]]
y_train:
 [ 1.  0.  0.  1.]
labels_train:
 None
X_test:
 [[0]]
y_test:
 [ 0.]
labels_test:
 None

Split: 2
X_train:
 [[4]
 [3]]
y_train:
 [ 1.  1.]
labels_train:
 None
X_test:
 [[2]
 [1]
 [0]]
y_test:
 [ 0.  0.  0.]
labels_test:
 None

Split: 3
X_train:
 [[2]
 [1]
 [0]]
y_train:
 [ 0.  0.  0.]
labels_train:
 None
X_test:
 [[3]
 [4]]
y_test:
 [ 1.  1.]
labels_test:
 None


### Random Label Splitter

In [8]:
# Judging by the recorded output below, the fractions apply to distinct label
# groups rather than to individual samples (1, 3 and 2 of the 5 labels are
# held out for testing) -- TODO confirm against the implementation.
random_label_splitter = RandomLabelSplitter(test_percentages=[0.2, 0.6, 0.4])

In [9]:
# In the recorded output no label value appears in both the train and the test
# part of the same split.
show_dataset_and_splits(dummy_labeled_dataset, random_label_splitter, seed=43)

X:
 [[0]
 [1]
 [2]
 [3]
 [4]
 [5]
 [6]
 [7]
 [8]
 [9]]
y:
 [ 0.  0.  0.  0.  0.  1.  1.  1.  1.  1.]
labels:
 [0 2 3 0 6 2 7 7 0 3]

Split: 1
X_train:
 [[0]
 [1]
 [2]
 [3]
 [5]
 [6]
 [7]
 [8]
 [9]]
y_train:
 [ 0.  0.  0.  0.  1.  1.  1.  1.  1.]
labels_train:
 [0 2 3 0 2 7 7 0 3]
X_test:
 [[4]]
y_test:
 [ 0.]
labels_test:
 [6]

Split: 2
X_train:
 [[0]
 [2]
 [3]
 [8]
 [9]]
y_train:
 [ 0.  0.  0.  1.  1.]
labels_train:
 [0 3 0 0 3]
X_test:
 [[1]
 [4]
 [5]
 [6]
 [7]]
y_test:
 [ 0.  0.  1.  1.  1.]
labels_test:
 [2 6 2 7 7]

Split: 3
X_train:
 [[2]
 [4]
 [6]
 [7]
 [9]]
y_train:
 [ 0.  0.  1.  1.  1.]
labels_train:
 [3 6 7 7 3]
X_test:
 [[0]
 [1]
 [3]
 [5]
 [8]]
y_test:
 [ 0.  0.  0.  1.  1.]
labels_test:
 [0 2 0 2 0]


### Time Splitter

In [10]:
# Judging by the recorded output below, each fraction selects the samples with
# the largest labels (timestamps) as the test set -- TODO confirm against the
# implementation.
time_splitter = TimeSplitter(test_percentages=[0.2, 0.4, 0.6])

In [11]:
# In the recorded output every test label (timestamp) is greater than every
# train label of the same split.
show_dataset_and_splits(dummy_time_dataset, time_splitter, seed=43)

X:
 [[0]
 [1]
 [2]
 [3]
 [4]]
y:
 [ 0.  0.  0.  1.  1.]
labels:
 [4 3 2 1 0]

Split: 1
X_train:
 [[2]
 [3]
 [1]
 [4]]
y_train:
 [ 0.  1.  0.  1.]
labels_train:
 [2 1 3 0]
X_test:
 [[0]]
y_test:
 [ 0.]
labels_test:
 [4]

Split: 2
X_train:
 [[4]
 [2]
 [3]]
y_train:
 [ 1.  0.  1.]
labels_train:
 [0 2 1]
X_test:
 [[0]
 [1]]
y_test:
 [ 0.  0.]
labels_test:
 [4 3]

Split: 3
X_train:
 [[3]
 [4]]
y_train:
 [ 1.  1.]
labels_train:
 [1 0]
X_test:
 [[2]
 [0]
 [1]]
y_test:
 [ 0.  0.  0.]
labels_test:
 [2 4 3]


### CV Splitter

In [12]:
# Three folds: in the recorded output below, each sample appears in exactly one
# test set across the three splits.
cv_splitter = CVSplitter(n_splits=3)

In [13]:
# NOTE(review): labels are carried through to each split, but in the recorded
# output label 0 lands in both train and test of split 1, so the CV splitter
# does not appear to keep label groups together -- confirm this is intended.
show_dataset_and_splits(dummy_labeled_dataset, cv_splitter, seed=43)

X:
 [[0]
 [1]
 [2]
 [3]
 [4]
 [5]
 [6]
 [7]
 [8]
 [9]]
y:
 [ 0.  0.  0.  0.  0.  1.  1.  1.  1.  1.]
labels:
 [0 2 3 0 6 2 7 7 0 3]

Split: 1
X_train:
 [[2]
 [5]
 [1]
 [7]
 [0]
 [4]]
y_train:
 [ 0.  1.  0.  1.  0.  0.]
labels_train:
 [3 2 2 7 0 6]
X_test:
 [[3]
 [9]
 [6]
 [8]]
y_test:
 [ 0.  1.  1.  1.]
labels_test:
 [0 3 7 0]

Split: 2
X_train:
 [[3]
 [9]
 [6]
 [8]
 [7]
 [0]
 [4]]
y_train:
 [ 0.  1.  1.  1.  1.  0.  0.]
labels_train:
 [0 3 7 0 7 0 6]
X_test:
 [[2]
 [5]
 [1]]
y_test:
 [ 0.  1.  0.]
labels_test:
 [3 2 2]

Split: 3
X_train:
 [[3]
 [9]
 [6]
 [8]
 [2]
 [5]
 [1]]
y_train:
 [ 0.  1.  1.  1.  0.  1.  0.]
labels_train:
 [0 3 7 0 3 2 2]
X_test:
 [[7]
 [0]
 [4]]
y_test:
 [ 1.  0.  0.]
labels_test:
 [7 0 6]
