# Train Test Split

In [1]:
# Feature values: the ten integers 0 through 9.
X = [*range(10)]
print(X)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [4]:
# Target values: the square of each feature value, in the same order as X.
y = [value ** 2 for value in X]
print(y)

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]


### The Old Way

In [7]:
from sklearn import model_selection

# Request an explicit 75/25 split; the fixed random_state keeps the shuffle repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    train_size=0.75,
    test_size=0.25,
    random_state=101,
)

# Training portion (75% requested)
print('X train:', X_train)
print('y train', y_train)
# Testing portion (25% requested)
print('X test:', X_test)
print('y test', y_test)

X train: [4, 9, 3, 5, 7, 6, 1]
y train [16, 81, 9, 25, 49, 36, 1]
X test: [8, 2, 0]
y test [64, 4, 0]


Because the dataset contains only 10 samples, the requested 75:25 ratio cannot be met exactly, so `train_test_split` rounded it to 7:3. Also, the numbers in the lists after splitting do not follow the same ascending order as before. By default, `train_test_split` shuffles the data first, ignoring the original order and picking samples at random to form the training and test sets. If the original order must be preserved, shuffling can be disabled by passing `shuffle=False`.

### The New Way

In [8]:
from sklearn import model_selection as cross_validation

# Only train_size is given here; per the scikit-learn documentation, an omitted
# test_size is set to the complement of train_size.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X,
    y,
    random_state=101,
    train_size=0.75,
)

## Using k-Fold

In [11]:
from sklearn.model_selection import KFold
import numpy as np

# n_splits=5 implies the split ratio: each fold holds out 1/5 of the samples for testing.
# random_state pins the shuffle so the folds are identical on every Restart & Run All;
# fold contents recorded from an earlier unseeded run will differ from this seeded one.
kf = KFold(n_splits=5, shuffle=True, random_state=101)

# Convert to arrays so the index arrays yielded by kf.split can select several rows at once.
X = np.array(X)
y = np.array(y)

# kf.split yields (train_index, test_index) pairs; each test fold's remainder is the training set
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print('X test:', X_test)

X test: [4 6]
X test: [1 2]
X test: [5 8]
X test: [0 7]
X test: [3 9]


In [33]:
from sklearn.model_selection import cross_val_score

# Usage template, intentionally left commented out: `model` is not defined in any
# cell shown in this notebook, so the call cannot run as-is. Supply an estimator
# before enabling it.
# score = cross_val_score(model, X_train, y_train, cv=None, scoring='neg_mean_absolute_error')