In [1]:
# https://towardsdatascience.com/how-to-split-a-dataset-into-training-and-testing-sets-b146b1649830
import pandas as pd
from sklearn.datasets import load_iris

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                  5.1               3.5                1.4               0.2
1                  4.9               3.0                1.4               0.2
2                  4.7               3.2                1.3               0.2
3                  4.6               3.1                1.5               0.2
4                  5.0               3.6                1.4               0.2
..                 ...               ...                ...               ...
145                6.7               3.0                5.2               2.3
146                6.3               2.5                5.0               1.9
147                6.5               3.0                5.2               2.0
148                6.2               3.4                5.4               2.3
149                5.9               3.0                5.1               1.8

[150 rows x 4 columns]


In [None]:
iris_data = load_iris()
df = pd.DataFrame(iris_data.data, columns=iris_data.feature_names)
print(df)

In [2]:
# The first option is to use pandas DataFrames’ method sample():

training_data = df.sample(frac=0.8, random_state=25)
testing_data = df.drop(training_data.index)

print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")

No. of training examples: 120
No. of testing examples: 30


In [3]:
# The second option — and probably the most commonly used —
# is the use of sklearn ‘s method called train_test_split():
from sklearn.model_selection import train_test_split

training_data, testing_data = train_test_split(df, test_size=0.2, random_state=25)

print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")

No. of training examples: 120
No. of testing examples: 30


In [6]:
# Finally, a less commonly used way of creating testing and training
# samples is with numpy ‘s method randn():

import numpy as np

mask = np.random.rand(len(df)) <= 0.8
training_data = df[mask]
testing_data = df[~mask]

print(f"mask : {mask} :: No. of training examples: {training_data.shape[0]}")
print(f"mask : {mask} :: No. of testing examples: {testing_data.shape[0]}")

mask : [ True False  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True False  True
  True False  True  True  True False  True  True False  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True False  True  True  True  True  True  True False  True False
  True  True  True  True  True  True  True  True  True  True  True  True
 False  True False False False  True  True  True  True False  True False
 False  True  True  True  True  True  True  True  True False  True  True
  True  True  True  True  True  True  True  True False  True  True  True
 False  True  True False  True  True  True  True False  True False  True
 False  True False  True  True  True] :: No. of training examples: 127
mask : [ True False  True  True  True  True  T