## Create Customer Data (Sneaker Faqtory)

In [1]:
from faker import Faker
import numpy as np
import pandas as pd
import random as r
from random_address import real_random_address
import random_address

In [2]:
# Target number of fake customers for each country
NUM_EN = 1500
NUM_ES = 1000
NUM_NL = 2000
NUM_PL = 250
NUM_IT = 750
NUM_SE = 500

# Per-country counts collected in a fixed order (EN, ES, NL, PL, IT, SE);
# the grand total is derived from this list.
num_customers = [NUM_EN, NUM_ES, NUM_NL, NUM_PL, NUM_IT, NUM_SE]
tot_num_customers = sum(num_customers)

print("Total number of customers: ", tot_num_customers)

Total number of customers:  6000


In [3]:
# Locale codes handed to Faker(), one entry per country
country_list = [
    'en_GB',  # United Kingdom
    'es_ES',  # Spain
    'nl_NL',  # Netherlands
    'pl_PL',  # Poland
    'it_IT',  # Italy
    'sv_SE',  # Sweden
]

In [4]:
# function to split original address in address + zipcode + city per country

def create_geolocation(address_original, country):
    """Split a newline-separated address string into its three parts.

    Parameters
    ----------
    address_original : str
        Address text with its parts separated by newlines, laid out the way
        the branch for ``country`` expects.
    country : str
        Locale code; handled values are 'nl_NL', 'en_GB', 'es_ES', 'pl_PL',
        'it_IT' and 'sv_SE'.

    Returns
    -------
    tuple
        ``(address, zipcode, city)``. Any other locale code yields
        ``(None, None, None)`` without looking at ``address_original``.

    Raises
    ------
    ValueError, IndexError
        If the text does not have the number of parts the locale branch
        expects.
    """
    supported = ('nl_NL', 'en_GB', 'es_ES', 'pl_PL', 'it_IT', 'sv_SE')
    if country not in supported:
        return None, None, None

    lines = address_original.split('\n')

    # Netherlands: street / postcode / city, one per line
    if country == 'nl_NL':
        address, zipcode, city = lines
        return address, zipcode, city

    # United Kingdom: street / city / postcode; a four-line address has the
    # street spread over the first two lines, so glue those together first
    if country == 'en_GB':
        if len(lines) == 4:
            lines[0:2] = [lines[0] + ' ' + lines[1]]
        return lines[0], lines[2], lines[1]

    # Spain: the second line holds two parts separated by ', '.
    # NOTE(review): the return order implies the layout "<city>, <postcode>";
    # confirm against the Faker es_ES address format.
    if country == 'es_ES':
        parts = lines[1].split(', ')
        return lines[0], parts[1], parts[0]

    # Italy: everything after ', ' on the second line is "<postcode> <rest>".
    # NOTE(review): the text before ', ' is discarded and <rest> is returned
    # as the city; confirm this is the intended field.
    if country == 'it_IT':
        tokens = lines[1].split(', ')[1].split()
        return lines[0], tokens[0], ' '.join(tokens[1:])

    # Poland and Sweden: second line is "<postcode> <city>". Split only on the
    # first space so that a multi-word city name is kept whole.
    parts = lines[1].split(' ', 1)
    return lines[0], parts[0], parts[1]

create_geolocation('Ängsgränd 682\n17543 Uddevalla', 'sv_SE')

('Ängsgränd 682', '17543', 'Uddevalla')

In [5]:
# function to create fake phone number per country
def fake_number(country):
    """Build a random phone-number string for a locale code.

    For 'nl_NL' the result is ``'+316 '`` followed by 8 random digits. For
    'en_GB', 'es_ES', 'pl_PL', 'it_IT' and 'sv_SE' it is ``'+'``, the
    country's dial code, one random digit, a space and 8 random digits
    (for example ``'+442 99128334'``).

    Parameters
    ----------
    country : str
        Locale code of the customer.

    Returns
    -------
    str or None
        The generated number, or None when the locale code is not one of the
        supported ones (instead of a number that lacks a dial code).
    """
    def _digits(count):
        # Draws `count` digits from the stdlib `random` module, in order.
        return ''.join(str(r.randint(0, 9)) for _ in range(count))

    if country == 'nl_NL':
        return '+316 ' + _digits(8)

    dial_codes = {
        'en_GB': '44',
        'es_ES': '34',
        'pl_PL': '48',
        'it_IT': '39',
        'sv_SE': '46',
    }
    dial_code = dial_codes.get(country)
    if dial_code is None:
        return None

    # Operands are evaluated left to right: the single digit is drawn before
    # the trailing block of eight.
    return '+' + dial_code + _digits(1) + ' ' + _digits(8)

fake_number('en_GB')

'+442 99128334'

In [6]:
# function to create fake email address

def listToString(s):
    """Concatenate the string items of ``s`` into one string, with no separator.

    Parameters
    ----------
    s : iterable of str
        Pieces to glue together, in order.

    Returns
    -------
    str
        The concatenation; an empty iterable gives ``""``.
    """
    return "".join(s)

def fake_email(first_name, last_name):
    """Build a ``first.last@provider`` e-mail address from a person's name.

    Both names are lower-cased and stripped of all whitespace, so a
    multi-word first or last name cannot put a space into the address. The
    provider suffix is drawn with NumPy's global random state: gmail.com and
    hotmail.com with probability 0.4 each, xs4all.com with probability 0.2.

    Parameters
    ----------
    first_name : str
    last_name : str

    Returns
    -------
    str
        The e-mail address. Non-ASCII letters in the names are kept as-is.
    """
    ext_mail = np.random.choice(["@gmail.com", "@hotmail.com", "@xs4all.com"], p=[0.4, 0.4, 0.2])
    # str.split() with no argument splits on any whitespace run; joining the
    # pieces back removes every space from the name.
    local_part = ".".join("".join(name.split()).lower() for name in (first_name, last_name))
    return str(local_part + ext_mail)

fake_email('Kevin', 'van de Velde')

'kevin.vandevelde@gmail.com'

In [7]:
# function to convert country code to country name

def fake_country(country):
    """Translate a locale code into its English country name.

    Returns None when the code is not one of the six known locales.
    """
    names_by_locale = (
        ('nl_NL', 'Netherlands'),
        ('en_GB', 'United Kingdom'),
        ('es_ES', 'Spain'),
        ('pl_PL', 'Poland'),
        ('sv_SE', 'Sweden'),
        ('it_IT', 'Italy'),
    )
    # First (and only possible) match wins; fall back to None.
    return next((name for code, name in names_by_locale if country == code), None)


In [8]:
# Experiment with the `random_address` package, which is meant to return a
# dictionary with valid random address information.
# The result of this first call is not stored or displayed.
real_random_address()
# Only this last expression is displayed; in the recorded run the lookup for
# this postal code came back as an empty dict.
random_address.real_random_address_by_postal_code('L3D 1YH')

{}

In [9]:
# function to generate fake customers
def faker_categorical(num=1, seed=None, country=None):
    """Generate ``num`` fake customer records for one locale.

    Parameters
    ----------
    num : int
        Number of customers to create.
    seed : int or None
        Seed applied to NumPy's global random state, the stdlib ``random``
        module and the Faker instance. None leaves each source unseeded in the
        deterministic sense (each picks its own fresh seed).
    country : str
        Faker locale code, e.g. 'nl_NL'.

    Returns
    -------
    list of list
        One row per customer:
        ``[first_name, last_name, gender, email, date_of_birth, country_name,
        city, address, zipcode, phone_number]``.
    """
    fake = Faker([country])

    # Seed every random source a record draws from. fake_number() takes its
    # digits from the stdlib `random` module, so that one must be seeded as
    # well or phone numbers differ between runs with the same seed.
    np.random.seed(seed)
    r.seed(seed)
    fake.seed_instance(seed)

    # Same for every row of this locale, so look it up once.
    country_name = fake_country(country)

    output = []
    for _ in range(num):
        gender = np.random.choice(["M", "F"], p=[0.5, 0.5])

        # name and e-mail address
        first_name = fake.first_name_male() if gender == "M" else fake.first_name_female()
        last_name = fake.last_name()
        email = fake_email(first_name, last_name)

        # address, split into its parts
        address, zipcode, city = create_geolocation(fake.address(), country)

        phone_number = fake_number(country)

        output.append([
            first_name,
            last_name,
            gender,
            email,
            fake.date_of_birth(),
            country_name,
            city,
            address,
            zipcode,
            phone_number,
        ])
    return output

In [10]:
# Build one DataFrame with the fake customers of all countries, in random row order
customers = []

# The i-th entry of num_customers is the customer count for the i-th locale.
for i, country in enumerate(country_list):
    customers.extend(faker_categorical(num_customers[i], None, country))

column_names = [
    'First Name', 'Last Name',
    'Gender', 'Email Address',
    'Date Of Birth', 'Country',
    'City', 'Address', 'ZIP Code', 'Phone Number',
]

df = (
    pd.DataFrame.from_records(customers, columns=column_names)
    .sample(frac=1)          # shuffle all rows so the countries are mixed
    .reset_index(drop=True)  # renumber the shuffled rows from 0
)
df

Unnamed: 0,First Name,Last Name,Gender,Email Address,Date Of Birth,Country,City,Address,ZIP Code,Phone Number
0,Declan,Cox,M,declan.cox@gmail.com,1922-07-27,United Kingdom,South Janetton,81 Turner common,ML96 1AF,+441 89523537
1,Inger,Mårtensson,F,inger.mårtensson@xs4all.com,1989-12-30,Sweden,Malmö,Åkergatan 51,91350,+463 30287683
2,Nicole,Szczechowicz,F,nicole.szczechowicz@xs4all.com,2007-07-27,Poland,Września,plac Kolorowa 84/15,78-865,+481 99015423
3,Jaylinn,Schagen,F,jaylinn.schagen@xs4all.com,1929-12-15,Netherlands,Ter Idzard,Jakehof 7,3089OM,+316 11598809
4,Pasqual,Vianello,M,pasqual.vianello@hotmail.com,2006-11-07,Italy,Benevento (MO),Stretto Faugno 11 Piano 1,90045,+394 05348594
...,...,...,...,...,...,...,...,...,...,...
5995,Rolando,Mercati,M,rolando.mercati@hotmail.com,1943-08-16,Italy,Lucca (PD),Stretto Roccabonella 2 Appartamento 51,79062,+396 78013947
5996,Jill,Bartels,F,jill.bartels@xs4all.com,1991-11-14,Netherlands,Rottum,Aylinlaan 59,8571 WE,+316 22304227
5997,Arkadiusz,Smolec,M,arkadiusz.smolec@hotmail.com,1949-08-14,Poland,Nysa,plac Szafirowa 72/90,49-589,+489 99378131
5998,Dylan,Clarke,M,dylan.clarke@xs4all.com,1909-09-30,United Kingdom,Hodgsonland,Flat 28q Mathew street,SK36 3ZX,+446 12438241


In [14]:
# Write the customer DataFrame to customers.csv in the working directory.
# index=True also writes the row index as the first column; the 'utf-8-sig'
# codec puts a UTF-8 byte-order mark at the start of the file.
df.to_csv('customers.csv', encoding='utf-8-sig', index=True)

In [12]:
# Check whether the parts of a fake address depend on each other.
# Finding: unfortunately it looks like they are generated independently.
# The locale is read from country_list explicitly (its last entry) rather than
# from a `country` loop variable left behind by an earlier cell, so this cell
# does not depend on leftover loop state.
fake = Faker([country_list[-1]])
a = fake.address()
a

'Skogsstigen 829\n78015 Göteborg'