In [1]:
import numpy as np
from sklearn.model_selection import train_test_split

# NumPy provides functions for working with arrays and matrices;
# using it leads to faster, clearer and better-quality code

# sklearn is a library used for machine learning and statistical modeling.
# it includes many tools for classification, regression, clustering and dimensionality reduction
# train_test_split is a function (from sklearn.model_selection) used for splitting data into training and testing subsets

### Generating data for splitting

In [2]:
# np.arange returns evenly spaced values over the half-open interval
# [start, stop) as an ndarray, so this gives the integers 1 through 100.
a = np.arange(1, 101)
a

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100])

In [3]:
# Second array, the same length as `a` (100 values): the integers 501 through 600.
# Element i of `b` is paired with element i of `a` in the splits below.
b = np.arange(501, 601)
b

array([501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513,
       514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526,
       527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539,
       540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552,
       553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565,
       566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578,
       579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591,
       592, 593, 594, 595, 596, 597, 598, 599, 600])

### Splitting the Data

In [4]:
train_test_split(a)
# with a single input array the result is a list of two arrays:
# the first is the training subset of `a`, the second is the testing subset
# (default split: 75% training / 25% testing)
# no random_state is set here, so the shuffle differs each time the cell is run

[array([ 60,  85,   2,  86,  33,  70,  20,  61,  89,  78,  98,  79,  53,
         35,  52,  21,  17,  47,  41,  32, 100,  73,  45,  25,  49,  92,
         19,  15,  38,  39,  29,  80,  14,  69,  13,  64,  31,  94,   4,
         37,  55,   6,   1,  76,  16,  75,  84,  12,  90,  40,  72,  83,
         26,   9,   3,  22,  43,  99,  59,  82,  97,  58,  77,   7,  66,
         28,  91,  74,  67,  50,  46,  88,  56,  18,  23]),
 array([34, 54, 93, 63, 44, 42, 57, 27, 95, 87, 51, 71, 36, 81, 62, 65, 11,
        24, 30, 68,  5, 48, 96,  8, 10])]

In [5]:
# Split `a` and `b` in a single call so both are shuffled the same way.
#   test_size=0.2   -> 20% of the data goes to the test sets
#   random_state=42 -> fixes the shuffle so every run gives the same split;
#                      a different shuffle on each run would prevent an
#                      objective assessment of changes (any fixed integer
#                      works, it doesn't need to be 42 specifically)
a_train, a_test, b_train, b_test = train_test_split(
    a,
    b,
    test_size=0.2,
    random_state=42,
)

### Exploring the Result

In [6]:
a_train.shape, a_test.shape
# train_test_split defaults to 75% training / 25% testing,
# but test_size=0.2 was passed above, so the 100 values split into 80 and 20

((80,), (20,))

In [7]:
# the 80 shuffled values of `a` assigned to the training set
a_train

array([ 56,  89,  27,  43,  70,  16,  41,  97,  10,  73,  12,  48,  86,
        29,  94,   6,  67,  66,  36,  17,  50,  35,   8,  96,  28,  20,
        82,  26,  63,  14,  25,   4,  18,  39,   9,  79,   7,  65,  37,
        90,  57, 100,  55,  44,  51,  68,  47,  69,  62,  98,  80,  42,
        59,  49,  99,  58,  76,  33,  95,  60,  64,  85,  38,  30,   2,
        53,  22,   3,  24,  88,  92,  75,  87,  83,  21,  61,  72,  15,
        93,  52])

In [8]:
# the remaining 20 values of `a`, held out for testing
a_test

array([84, 54, 71, 46, 45, 40, 23, 81, 11,  1, 19, 31, 74, 34, 91,  5, 77,
       78, 13, 32])

In [9]:
# `b` was split in the same call as `a`, so its pieces have the same sizes
b_train.shape, b_test.shape

((80,), (20,))

In [10]:
b_train
# the elements of a_train and b_train have been shuffled in the same way
# e.g. the 9th element of b_train (510) lines up with the 9th element of a_train (10),
# because those two values were at the same position before shuffling
# this is important as we want each observation's input to stay matched with its target

array([556, 589, 527, 543, 570, 516, 541, 597, 510, 573, 512, 548, 586,
       529, 594, 506, 567, 566, 536, 517, 550, 535, 508, 596, 528, 520,
       582, 526, 563, 514, 525, 504, 518, 539, 509, 579, 507, 565, 537,
       590, 557, 600, 555, 544, 551, 568, 547, 569, 562, 598, 580, 542,
       559, 549, 599, 558, 576, 533, 595, 560, 564, 585, 538, 530, 502,
       553, 522, 503, 524, 588, 592, 575, 587, 583, 521, 561, 572, 515,
       593, 552])

In [11]:
# b_test lines up element-by-element with a_test (e.g. 584 with 84)
b_test

array([584, 554, 571, 546, 545, 540, 523, 581, 511, 501, 519, 531, 574,
       534, 591, 505, 577, 578, 513, 532])