# Train Test Split

## Import the relevant libraries

In [3]:
# In this lesson we will explore the train_test_split function
# Therefore we need nothing more than that function and NumPy
import numpy as np
from sklearn.model_selection import train_test_split

## Generate some data we are going to split

In [4]:
# Build a NumPy array 'a' holding every integer from 1 to 100.
# np.arange behaves like the built-in 'range' (the stop value is excluded),
# but it returns an ndarray instead of a range object.
a = np.arange(1, 101)

In [5]:
# Display the array to confirm it holds the integers 1 through 100, in order
a

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100])

In [6]:
# Build a second ndarray 'b' holding the integers from 501 to 600.
# The values are chosen on purpose: every element of 'b' is exactly 500 larger
# than the element of 'a' at the same position, so after splitting it is easy
# to see by eye which elements of the two arrays belong together.
b = np.arange(501, 601)
b

array([501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513,
       514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526,
       527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539,
       540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552,
       553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565,
       566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578,
       579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591,
       592, 593, 594, 595, 596, 597, 598, 599, 600])

## Split the data
Full documentation: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [7]:
# Let's check out how this works with a single array and the default settings
# The result is a list of two arrays: the training part first, then the test part
# In the output below the 100 elements are shuffled and split 75 / 25
# NOTE: no 'random_state' is passed here, so re-running this cell gives a different shuffle
train_test_split(a)

[array([ 24,  28,  96,  99,  34,  10,  22,  86,  11,  43,  83,  79,  71,
         33,  14,  72,  61,  37,  50,  91,  77,   2,  40,  21,   7,  67,
         66,  47,  74,  36,  58,  75,  55,  59, 100,  65,   1,  54,  46,
         78,  64,  23,  26,  29,  20,  35,  51,  93,   6,  73,  45,  85,
         87,  48,  49,  76,  38,  92,  39,  69,  97,  56,  15,  52,  70,
         68,  89,  41,  19,  95,  13,  12,  53,  32,   8]),
 array([82, 17, 44, 88, 84, 98, 80, 90, 18, 62, 42, 27,  3, 81, 30, 16,  5,
        63, 31, 94, 57,  4,  9, 60, 25])]

In [33]:
# train_test_split accepts several arguments besides the arrays themselves.
# In practice we usually have inputs and targets, i.e. two arrays that have to
# be split together; here 'a' and 'b' play those two roles.

# The proportions are controlled with 'test_size' and/or 'train_size'.
# When only one of them is given, the other is taken as its complement,
# so test_size=0.2 below means an 80-20 split.
# Common splits are 75-25, 80-20, 85-15, 90-10.

# Always pass a 'random_state': with a fixed seed the data is shuffled the
# SAME way on every run, which makes the split reproducible.

# Two input arrays are split into four outputs, returned in the order
# train1, test1, train2, test2.
# We unpack them into four variables so that we can use them later.
a_train, a_test, b_train, b_test = train_test_split(
    a,
    b,
    test_size=0.2,
    random_state=12,
)

## Explore the result

In [34]:
# Let's check the shapes of the two parts of 'a'
# This shows how 'test_size' works: with 100 elements and test_size=0.2
# we expect 80 training samples and 20 test samples
a_train.shape, a_test.shape

((80,), (20,))

In [35]:
# Explore manually: the training part of 'a'
# Note that the values are shuffled rather than kept in their original order
a_train

array([ 84,  56,  27,  55,  20,  58,  47,  24,  37,  92,  82,  66,  85,
        91,  89,  30,  39,  78,  41,  79,  21,  11,  29,  97,  96,  72,
        74,  73,  43,  80,  48,  67,  65,  62,  52,  54,  12,  38,  45,
        44,   2,  86,  70,   5,  57,  26,  95,  60,  99,  33,  83,  46,
        98,  51,  19,  64,  31,  34,  81,  63,  36,  88,  94,   1,  75,
       100,  35,  14,   6,  53,  50,  23,  49,  77,  68,   4,   3,   7,
        28,  76])

In [36]:
# Explore manually: the test part of 'a'
# These are the 20 values that were held out of a_train
a_test

array([18, 42, 93, 15, 69, 32, 90, 16, 22, 61, 13,  9, 40, 10,  8, 71, 59,
       25, 87, 17])

In [26]:
# 'b' was split in the same call as 'a', so its parts have the same shapes: 80 and 20
b_train.shape, b_test.shape

((80,), (20,))

In [27]:
# Explore manually: the training part of 'b'
# Both arrays are split with the same shuffled indices, so b_train[i] is expected
# to equal a_train[i] + 500.
# NOTE(review): the stored output of this cell does not line up with the stored
# a_train output, and this cell's execution count is lower than that of the split
# cell - the saved output looks stale. Restart the kernel and run all cells to
# refresh it - TODO confirm.
b_train

array([556, 589, 527, 543, 570, 516, 541, 597, 510, 573, 512, 548, 586,
       529, 594, 506, 567, 566, 536, 517, 550, 535, 508, 596, 528, 520,
       582, 526, 563, 514, 525, 504, 518, 539, 509, 579, 507, 565, 537,
       590, 557, 600, 555, 544, 551, 568, 547, 569, 562, 598, 580, 542,
       559, 549, 599, 558, 576, 533, 595, 560, 564, 585, 538, 530, 502,
       553, 522, 503, 524, 588, 592, 575, 587, 583, 521, 561, 572, 515,
       593, 552])

In [28]:
# Explore manually: the test part of 'b'
# Each value is expected to be the matching a_test value + 500.
# NOTE(review): the stored output of this cell does not line up with the stored
# a_test output, so it appears to come from an earlier run - re-run all cells
# in order to refresh it - TODO confirm.
b_test

array([584, 554, 571, 546, 545, 540, 523, 581, 511, 501, 519, 531, 574,
       534, 591, 505, 577, 578, 513, 532])