# Data Preprocessing Notebook

In [1]:
import pandas as pd
# import modin.pandas as pd
import numpy as np
import os
import re
import warnings

warnings.filterwarnings("ignore")
# os.environ["MODIN_ENGINE"] = "ray"  # Modin will use Ray
# os.environ["MODIN_ENGINE"] = "dask"  # Modin will use Dask

from sklearn.model_selection import train_test_split
from argparse import Namespace
from tqdm.notebook import tqdm as tqdm_notebook

# Register tqdm's progress-bar helpers (e.g. `progress_apply`) on pandas
# objects; `desc` is the label displayed next to the bar.
tqdm_notebook.pandas(desc="Preprocessing Data")

# Define Args

In [2]:
# Notebook-wide settings: input/output CSV locations, the three split
# proportions, and the seed value. Kept in one place so they are easy to tune.
args = Namespace(**{
    "raw_dataset_csv": "data/surnames/surnames.csv",
    "train_proportion": 0.7,
    "val_proportion": 0.15,
    "test_proportion": 0.15,
    "output_munged_csv": "data/surnames/surnames_with_splits.csv",
    "seed": 1337,
})

# Load Data using pandas (Modin/Ray backend currently disabled)

In [3]:
# Read the raw surnames CSV; the first row of the file holds the column names.
surnames = pd.read_csv(
    filepath_or_buffer=args.raw_dataset_csv,
    header=0,
)

In [4]:
# Peek at the first five rows to sanity-check the load.
surnames.head(n=5)

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian


# Data Insights

In [5]:
# Per-column summary statistics (count / unique / top / freq in the output below).
surnames.describe()

Unnamed: 0,surname,nationality
count,10980,10980
unique,9041,18
top,Koury,English
freq,24,2972


In [6]:
# Distinct nationality labels present in the raw data.
set(surnames["nationality"].unique())

{'Arabic',
 'Chinese',
 'Czech',
 'Dutch',
 'English',
 'French',
 'German',
 'Greek',
 'Irish',
 'Italian',
 'Japanese',
 'Korean',
 'Polish',
 'Portuguese',
 'Russian',
 'Scottish',
 'Spanish',
 'Vietnamese'}

# Split data Train/Test/Val

In [7]:
# First split: carve off the training portion; the remainder (temporarily
# named `val_surnames`) is divided into validation and test sets in the next cell.
# `stratify` keeps the nationality mix approximately equal in both parts, and
# `random_state=args.seed` makes the split reproducible across kernel restarts.
train_surnames, val_surnames = train_test_split(
    surnames,
    train_size=args.train_proportion,
    stratify=surnames.nationality.values,
    random_state=args.seed,
)

In [8]:
# Second split: divide the held-out remainder into validation and test sets.
# The validation share is derived from the configured proportions instead of a
# hard-coded 0.5 (0.15 / (0.15 + 0.15) == 0.5 with the current settings), and
# `random_state=args.seed` makes the split reproducible.
val_share_of_holdout = args.val_proportion / (args.val_proportion + args.test_proportion)
val_surnames, test_surnames = train_test_split(
    val_surnames,
    train_size=val_share_of_holdout,
    stratify=val_surnames.nationality.values,
    random_state=args.seed,
)

In [13]:
# Number of distinct nationalities represented in the training split.
len(train_surnames["nationality"].value_counts())

18

In [14]:
# Number of distinct nationalities represented in the validation split.
len(val_surnames["nationality"].value_counts())

18

In [15]:
# Number of distinct nationalities represented in the test split.
len(test_surnames["nationality"].value_counts())

18

In [16]:
# Give each split a fresh 0..n-1 index, discarding the shuffled original labels.
# Rebinding to the returned frame (rather than `inplace=True`) keeps the cell
# idempotent and yields new frames instead of mutating the row subsets that
# `train_test_split` returned.
train_surnames = train_surnames.reset_index(drop=True)
val_surnames = val_surnames.reset_index(drop=True)
test_surnames = test_surnames.reset_index(drop=True)

In [17]:
# Tag every row with the split it belongs to. `.assign` returns a new frame, so
# no column is written into a frame that pandas may still treat as a copy of a
# slice (which can raise SettingWithCopyWarning, hidden here by the global
# warnings filter).
train_surnames = train_surnames.assign(split="train")
val_surnames = val_surnames.assign(split="val")
test_surnames = test_surnames.assign(split="test")

In [18]:
# Stack the three labelled splits row-wise into one frame.
# `ignore_index=True` gives the result a unique 0..N-1 index instead of repeating
# each split's own 0-based labels; the `copy` keyword is dropped because it is
# deprecated in recent pandas releases.
final_surnames = pd.concat(
    [train_surnames, val_surnames, test_surnames],
    axis=0,
    ignore_index=True,
)

In [19]:
# Row count per split, as a quick check on the proportions.
final_surnames["split"].value_counts()

train    7685
test     1648
val      1647
Name: split, dtype: int64

# Save Data

In [20]:
# Persist the labelled dataset; the index is left out of the CSV.
final_surnames.to_csv(
    path_or_buf=args.output_munged_csv,
    index=False,
)