In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Titanic dataset from a path relative to the notebook's working directory.
df = pd.read_csv('../data/titanic.csv')
# Show (rows, columns) as the cell output to sanity-check the load.
df.shape

(846, 12)

In [10]:

def _shuffle_three_way(df, test_ratio, val_ratio, random_state):
    """Shuffle ``df`` and slice it into ``(train, val, test)`` by row count."""
    # Negative ratios or ratios summing past 1 would make the positional
    # slices below overlap or silently truncate, so reject them up front.
    if val_ratio < 0 or test_ratio < 0:
        raise ValueError(
            "val_ratio and test_ratio must be non-negative, got "
            "val_ratio={!r}, test_ratio={!r}".format(val_ratio, test_ratio)
        )
    if val_ratio + test_ratio > 1:
        raise ValueError(
            "val_ratio + test_ratio must not exceed 1, got {!r}".format(
                val_ratio + test_ratio
            )
        )

    count = df.shape[0]
    # int() truncates, so any rounding remainder ends up in the training set.
    val_count = int(count * val_ratio)
    test_count = int(count * test_ratio)

    # Full shuffle; the original index is discarded so partitions are 0..n-1.
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    val = shuffled.iloc[:val_count]
    test = shuffled.iloc[val_count:val_count + test_count]
    train = shuffled.iloc[val_count + test_count:]
    return train, val, test


def split_df(df, test_ratio=0.2, val_ratio=None, target=None, random_state=1337):
    """
    Split a dataset into training / (validation) / test sets.

    Return shape depends on ``val_ratio`` and ``target``:
    df -> (train, test)
       -> (train, val, test)
       -> (X_train, X_test, y_train, y_test)
       -> (X_train, X_val, X_test, y_train, y_val, y_test)

    The two-way split delegates to ``train_test_split``. The three-way split
    shuffles the frame with ``DataFrame.sample``, resets the index, and slices
    by truncated row counts (validation rows first, then test, rest is train).

    :param df: a DataFrame to be split
    :param test_ratio: ratio of test set, 0-1
    :param val_ratio: ratio of validation set, 0-1
        split into (train, test) if not specified (None or 0)
        split into (train, val, test) if specified
    :param target: label of the target column (or list of labels)
        split into whole frames if None
        split into features (X_*) and target (y_*) if specified
    :param random_state: random state
    :raises ValueError: in the three-way split, if a ratio is negative or
        ``val_ratio + test_ratio`` exceeds 1
    """
    # Compare against None explicitly: a falsy-but-valid column label such as
    # the integer 0 must still be treated as a target.
    has_target = target is not None

    if val_ratio:
        train, val, test = _shuffle_three_way(df, test_ratio, val_ratio, random_state)
        if not has_target:
            return train, val, test
        X_train = train.drop(target, axis=1)
        X_val = val.drop(target, axis=1)
        X_test = test.drop(target, axis=1)
        return X_train, X_val, X_test, train[target], val[target], test[target]

    if not has_target:
        train, test = train_test_split(df, test_size=test_ratio, random_state=random_state)
        return train, test

    X = df.drop(target, axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_ratio, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [13]:
# Three-way split: 30% validation, 20% test, the remaining rows for training.
train, val, test = split_df(
    df,
    test_ratio=0.2,
    val_ratio=0.3,
    target=None,
    random_state=1337,
)

In [16]:
# Peek at the last five rows of the training partition.
train.tail(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
841,688,3,"Dakic, Mr. Branko",male,19.0,0,0,349228,10.1708,,S,0
842,790,1,"Guggenheim, Mr. Benjamin",male,46.0,0,0,PC 17593,79.2,B82 B84,C,0
843,162,2,"Watt, Mrs. James (Elizabeth ""Bessie"" Inglis Mi...",female,40.0,0,0,C.A. 33595,15.75,,S,1
844,758,2,"Bailey, Mr. Percy Andrew",male,18.0,0,0,29108,11.5,,S,0
845,732,3,"Hassan, Mr. Houssein G N",male,11.0,0,0,2699,18.7875,,C,0


In [2]:
def sampler(df, n, col=None, random_state=1337):
    """
    Random/Stratified sampling

    ``n`` is interpreted as a fraction when ``0 < n < 1`` and as a row count
    when ``n >= 1``. Without ``col`` the sample is drawn with
    ``DataFrame.sample``; with ``col`` it is the training part of a stratified
    ``train_test_split`` on that column.

    :param df: a DataFrame
    :param n: samples (n >= 1) or fraction (0 < n < 1)
    :param col: column name for stratified sampling
    :param random_state: random state
    :return: the sampled DataFrame
    :raises ValueError: if ``n`` is not positive
    """
    # Without this guard, n <= 0 matched no branch and returned None silently.
    if n <= 0:
        raise ValueError("n must be positive, got {!r}".format(n))

    as_fraction = n < 1

    if col is None:
        if as_fraction:
            return df.sample(frac=n, random_state=random_state)
        # n == 1 is a valid request for a single row and is handled here too.
        return df.sample(n=n, random_state=random_state)

    strata = df[[col]]
    if as_fraction:
        sampled, _ = train_test_split(
            df, test_size=1 - n, stratify=strata, random_state=random_state
        )
    else:
        # Pass the count as an integer train_size so the sample size is exact;
        # a float fraction is rounded up for the discarded part and can leave
        # the sample one row short.
        sampled, _ = train_test_split(
            df, train_size=int(n), stratify=strata, random_state=random_state
        )
    return sampled

In [3]:
# Simple random sample: n < 1 is interpreted as a fraction of the rows.
df2 = sampler(df, n=0.1)
df2.shape

(85, 12)

In [4]:
# Simple random sample: n > 1 is interpreted as a row count.
df2 = sampler(df, n=100)
df2.shape


(100, 12)

In [5]:
# Stratified sample of 100 rows, stratifying on the 'Sex' column.
df2 = sampler(df, n=100, col='Sex')
df2['Sex'].value_counts()


male      65
female    35
Name: Sex, dtype: int64

In [6]:
# Stratified sample of 10% of the rows, stratifying on the 'Sex' column.
df2 = sampler(df, n=0.1, col='Sex')
df2['Sex'].value_counts()


male      55
female    29
Name: Sex, dtype: int64