In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Titanic CSV (path is relative to the notebook's working directory)
# and display its (rows, columns) shape as the cell output.
df = pd.read_csv('../data/titanic.csv')
df.shape

(846, 12)

In [2]:
def sampler(df, n, col=None, random_state=1337):
    """
    Draw a random or stratified sample of rows from a DataFrame.

    :param df: a DataFrame
    :param n: sample size; a value strictly between 0 and 1 is treated as a
        fraction of the rows, a whole number >= 1 as an absolute row count
    :param col: column name for stratified sampling; None for plain random
        sampling
    :param random_state: random state forwarded to the underlying sampler
    :return: a new DataFrame holding the sampled rows
    :raises ValueError: if n is not positive, or is >= 1 but not a whole number
    """
    # Previously n <= 0 and n == 1 matched no branch and silently returned None.
    if n <= 0:
        raise ValueError(f"n must be a positive fraction or row count, got {n!r}")

    if n < 1:
        # Fractional request.
        if col is None:
            return df.sample(frac=n, random_state=random_state)
        train_size = n
    else:
        # Absolute row count; must be a whole number.
        if n != int(n):
            raise ValueError(f"n must be a whole number when >= 1, got {n!r}")
        train_size = int(n)
        if col is None:
            return df.sample(n=train_size, random_state=random_state)

    # Stratified sampling: keep the "train" part of a stratified split.
    # The requested size is passed directly as train_size instead of deriving a
    # complementary float test_size: `1 - n` or `(len(df) - n) / len(df)` can
    # pick up floating-point error that, once multiplied back and rounded up,
    # makes the sample one row too small.
    sampled, _ = train_test_split(
        df,
        train_size=train_size,
        stratify=df[[col]],
        random_state=random_state,
    )
    return sampled

In [3]:
# Plain random sample: n=0.1 is a fraction, so 10% of the rows are drawn
# (no stratification column is given).
df2 = sampler(df, 0.1)
df2.shape

(85, 12)

In [4]:
# Plain random sample: n=100 is an absolute count, so exactly 100 rows are drawn.
df2 = sampler(df, 100)
df2.shape


(100, 12)

In [5]:
# Stratified sample of 100 rows using the 'Sex' column as the stratification
# key; value_counts() shows how the sampled rows split across its classes.
df2 = sampler(df, 100, 'Sex')
df2.Sex.value_counts()


male      65
female    35
Name: Sex, dtype: int64

In [6]:
# Stratified sample of a 10% fraction of the rows, stratified on 'Sex';
# value_counts() shows the per-class row counts of the sample.
df2 = sampler(df, 0.1, 'Sex')
df2.Sex.value_counts()


male      55
female    29
Name: Sex, dtype: int64