In [4]:
from argparse import Namespace

params = Namespace(
    raw_dataset="surnames/raw_data.csv",
    ramdom_seed = 102,
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_dataset="surnames/dataset.csv",
)

In [23]:
import pandas as pd

df = pd.read_csv(params.raw_dataset, header=0)

In [24]:
df.head()

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian


In [26]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
# Assigning numerical values and storing in another column
df['target'] = labelencoder.fit_transform(df['nationality'])

In [11]:
import numpy as np
import collections

def split_and_suffle(data):
    # Create split data
    final_list = []
    np.random.seed(params.ramdom_seed)

    by_nationality = collections.defaultdict(list)
    for _, row in data.iterrows():
        by_nationality[row.nationality].append(row.to_dict())

    for _, item_list in sorted(by_nationality.items()):
        np.random.shuffle(item_list)
        n = len(item_list)
        n_train = int(params.train_proportion * n)
        n_val = int(params.val_proportion * n)
        n_test = int(params.test_proportion * n)
        
        # Give data point a split attribute
        for item in item_list[:n_train]:
            item['split'] = 'train'
        for item in item_list[n_train: n_train+n_val]:
            item['split'] = 'val'
        for item in item_list[n_train + n_val:]:
            item['split'] = 'test'  
        
        # Add to final list
        final_list.extend(item_list)

        return pd.DataFrame(final_list)

In [12]:
df_splited = split_and_suffle(df)

array([ 4,  5,  4, ..., 12,  4,  6])

In [22]:
df_splited['nationality'].value_counts()

Arabic    1603
Name: nationality, dtype: int64

In [21]:
import matplotlib.pyplot as plt
df_splited['split'].value_counts()

train    1122
test      241
val       240
Name: split, dtype: int64

In [21]:
df_splited.to_csv(params.output_dataset, index=False)

In [36]:
df_splited['target'].unique()

array([0], dtype=int8)