In [1]:
import pandas as pd
import numpy as np
import re

### Pre-processing

Before splitting the dataset, it is necessary to do some data processing, such as converting upper-case letters to lower case so that the model treats them as the same character.

In [2]:
def preprocess_surname(text):
    """Return ``text`` converted to lower case.

    Only the letter case is changed; punctuation and any other characters
    are passed through untouched.
    """
    return text.lower()


In [3]:
# Load the raw dataset. Later cells pass column 0 through preprocess_surname
# and read the class labels from the 'nationality' column.
data = pd.read_csv('surnames.csv')

In [4]:
# Normalise the whole first column in one pass: Series.map applies
# preprocess_surname to every value, which avoids a Python-level loop that
# writes one cell at a time through .iloc.
data.iloc[:, 0] = data.iloc[:, 0].map(preprocess_surname)

We are also going to transform the nationalities into one-hot encoded vectors to make training the model easier.

In [5]:
def encoding_labels(data, all_nations=None):
    """Replace every value of the 'nationality' column with a one-hot vector.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'nationality' column. The column is overwritten
        in place.
    all_nations : iterable, optional
        Complete label vocabulary. The labels are de-duplicated and sorted,
        and the position of a label in that sorted list is the index that is
        set to 1 in its vector. When omitted, the vocabulary is taken from
        the labels present in ``data``. Pass the same vocabulary for every
        frame that must share one encoding; otherwise a frame that lacks a
        label produces shorter, differently ordered vectors.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with each 'nationality' value replaced by a
        1-D float ``numpy`` array of length ``len(all_nations)``.

    Raises
    ------
    ValueError
        If ``data`` contains a label that is not part of ``all_nations``.
    """
    labels = data['nationality']
    if all_nations is None:
        all_nations = labels
    all_nations = sorted(set(all_nations))

    # Dictionary lookup instead of list.index(), which rescans the vocabulary
    # for every row.
    position_of = {nation: position for position, nation in enumerate(all_nations)}

    # An object array filled element by element keeps each vector as a single
    # cell value instead of letting numpy stack them into a 2-D array.
    encoded = np.empty(len(data), dtype=object)
    for row, nation in enumerate(labels):
        if nation not in position_of:
            # Raised before the column is touched, so ``data`` is never left
            # half encoded.
            raise ValueError("label %r is not in all_nations" % (nation,))
        encoded_sample = np.zeros(len(all_nations))
        encoded_sample[position_of[nation]] = 1
        encoded[row] = encoded_sample

    # One column assignment instead of one .iloc write per row.
    data['nationality'] = encoded
    return data

### Split Data

In [6]:
def number_samples_category(data):
    """Count how many rows of ``data`` carry each value of its second column.

    The second column is read by position (index 1). The returned dict maps
    each distinct value to its number of occurrences, with the keys inserted
    in ascending sorted order.
    """
    tally = {}
    for position in range(len(data)):
        category = data.iloc[position, 1]
        tally[category] = tally.get(category, 0) + 1

    # Rebuild the mapping so that iteration follows the sorted key order.
    return {category: tally[category] for category in sorted(tally)}

In [7]:
def split_data(data, label, train_percentage, test_percentage, validation_percentage,
               random_state=None):
    """Split ``data`` into training, test and validation sets, category by category.

    Every category of the ``label`` column is divided on its own, so each
    split receives roughly the same share of every category.

    Parameters
    ----------
    data : pandas.DataFrame
        Pre-processed dataset. It is not modified.
    label : str
        Name of the column holding the category of each row.
    train_percentage : float
        Fraction (0-1) of each category placed in the training set, rounded
        down to a whole number of rows.
    test_percentage : float
        Fraction (0-1) of each category placed in the test set, rounded down
        to a whole number of rows.
    validation_percentage : float
        Kept for interface compatibility; it is not used in the computation.
        The validation set receives every row of a category that is left
        after the training and test rows are taken, so rounding never drops
        a row.
    random_state : optional
        Forwarded to ``DataFrame.sample`` when each split is shuffled. Leave
        as ``None`` for a different order on every call, or pass a seed for a
        reproducible order.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(training_set, test_set, validation_set)`` - note that the test set
        comes before the validation set. Rows keep their original index
        labels and each split is shuffled.

    Raises
    ------
    ValueError
        If a percentage is negative or the training and test percentages
        together exceed 1.
    """
    # Small tolerance so that floating-point sums such as 0.7 + 0.3 pass.
    if (train_percentage < 0 or test_percentage < 0
            or train_percentage + test_percentage > 1 + 1e-9):
        raise ValueError(
            "train_percentage and test_percentage must be non-negative and sum to at most 1"
        )

    training_parts, test_parts, validation_parts = [], [], []

    # groupby hands over the rows of one category at a time (categories in
    # sorted order), so the counts and the slices always refer to the same
    # column and no running offset into a sorted copy has to be maintained.
    for _, category_rows in data.groupby(label, sort=True):
        total_samples = len(category_rows)
        num_training_samples = int(train_percentage * total_samples)
        num_test_samples = int(test_percentage * total_samples)
        test_end = num_training_samples + num_test_samples

        training_parts.append(category_rows.iloc[:num_training_samples])
        test_parts.append(category_rows.iloc[num_training_samples:test_end])
        validation_parts.append(category_rows.iloc[test_end:])

    def _assemble(parts):
        # Join the per-category pieces once (DataFrame.append no longer exists
        # in pandas 2.x) and shuffle so rows are not ordered by category.
        if not parts:
            return data.iloc[0:0].copy()
        return pd.concat(parts).sample(frac=1, random_state=random_state)

    training_set = _assemble(training_parts)
    test_set = _assemble(test_parts)
    validation_set = _assemble(validation_parts)

    return training_set, test_set, validation_set

###### First, we are going to explore the data and see how balanced the dataset is

In [8]:
# Display the number of samples per nationality to judge how balanced the dataset is.
number_samples_category(data)

{'Arabic': 1603,
 'Chinese': 220,
 'Czech': 414,
 'Dutch': 236,
 'English': 2972,
 'French': 229,
 'German': 576,
 'Greek': 156,
 'Irish': 183,
 'Italian': 600,
 'Japanese': 775,
 'Korean': 77,
 'Polish': 120,
 'Portuguese': 55,
 'Russian': 2373,
 'Scottish': 75,
 'Spanish': 258,
 'Vietnamese': 58}

###### The dataset is clearly unbalanced, so we are going to split it proportionally into training, test and validation datasets

In [9]:
# Name of the column that holds the labels of the dataset.
label = 'nationality'
training_set, test_set, validation_set = split_data(data, label, 0.7, 0.15, 0.15)

# Every row must end up in exactly one of the three splits.
assert len(training_set) + len(test_set) + len(validation_set) == len(data)

# Each call to encoding_labels below derives the label vocabulary from the
# frame it receives, so the one-hot positions only agree across the three
# files when every split contains every nationality. Fail loudly here rather
# than silently write inconsistent encodings.
all_labels = set(data[label])
for split_name, split_frame in (
    ('training', training_set),
    ('test', test_set),
    ('validation', validation_set),
):
    missing_labels = all_labels - set(split_frame[label])
    assert not missing_labels, (
        "%s split is missing labels: %s" % (split_name, sorted(missing_labels))
    )

training_set = encoding_labels(training_set)
test_set = encoding_labels(test_set)
validation_set = encoding_labels(validation_set)

In [10]:
# Write each split to its own CSV file, leaving the DataFrame index out.
for split_frame, csv_path in (
    (training_set, 'train.csv'),
    (test_set, 'test.csv'),
    (validation_set, 'validation.csv'),
):
    split_frame.to_csv(csv_path, index=False)