# Train Test Split

## Import the relevant libraries

In [None]:
# In this lesson we will explore the train_test_split function
# Therefore we need nothing more than that function and NumPy
import numpy as np
from sklearn.model_selection import train_test_split

## Generate some data we are going to split

In [None]:
# Build an ndarray 'a' holding every integer from 1 to 100 (inclusive)
# np.arange behaves like the built-in 'range' (the stop value is excluded), but it returns a NumPy array
a = np.arange(1, 101)

In [None]:
# Display 'a' to confirm it holds the integers 1 through 100 in ascending order
a

In [None]:
# In the same way, build a second ndarray 'b' with the integers from 501 to 600
# These values are chosen on purpose so the two arrays are easy to compare:
# each element of 'b' is exactly 500 larger than the element of 'a' at the same position
b = np.arange(501, 601)
b

## Split the data
Full documentation: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [None]:
# Simplest possible call: pass a single array and rely on the defaults
# The result is a list with two arrays: the train part followed by the test part
# (per the scikit-learn documentation the default test share is 25%)
# No 'random_state' is set here, so the shuffle can differ on every run
train_test_split(a)

In [None]:
# train_test_split accepts several optional arguments
# Most often we have inputs and targets, so two arrays must be split in the same way
# We simulate that situation by splitting 'a' and 'b' together

# You can specify 'test_size' or 'train_size' (or both)
# The two are complementary: if only one is given, the other is set to the remainder
# Common train-test splits are 75-25, 80-20, 85-15, 90-10

# Finally, you should always set a 'random_state'
# It fixes the seed of the shuffle, so every run produces the SAME split

# Note that 2 arrays are split into 4
# The order of the results is train1, test1, train2, test2
# Storing them in 4 variables lets us use them later on
a_train, a_test, b_train, b_test = train_test_split(
    a,
    b,
    test_size=0.2,
    random_state=365,
)

## Explore the result

In [None]:
# Let's check the shapes
# This shows how 'test_size' works: 'a' has 100 elements and test_size=0.2,
# so we expect 80 elements in the train part and 20 in the test part
a_train.shape, a_test.shape

In [None]:
# Inspect the train part of 'a' manually
# The values come out shuffled rather than in ascending order
a_train

In [None]:
# Inspect the test part of 'a' manually
# It holds the elements of 'a' that did not end up in 'a_train'
a_test

In [None]:
# 'b' was split in the same call as 'a', so its train and test shapes should match those of 'a'
b_train.shape, b_test.shape

In [None]:
# Compare with 'a_train': each element here should be exactly 500 larger than its counterpart
b_train

In [None]:
# Compare with 'a_test': each element here should be exactly 500 larger than its counterpart
b_test