# Simulating Foot-Traffic Data With Faker

Faker is a Python package that allows you to generate fake data such as names, addresses, and phone numbers. It can be useful for generating test data for applications, populating databases with fake information, or anonymizing sensitive data. The package uses various localized data sources, such as lists of names and addresses specific to different countries, to generate the fake data. It also allows you to customize the generated data to a certain extent, for example, specifying the format of a phone number or the gender of a name.

## Importing the Faker package

In [None]:
from faker import Faker

fake = Faker()

Once the `fake` object has been initialized, we can use it to generate data values from dozens of categories, called "providers". You can find the full list of providers in the Faker [documentation](https://faker.readthedocs.io/en/master/providers.html).

In [None]:
# generate a fake full name
print(fake.name())

# generate a female first name and a male first name
print([fake.first_name_female(), fake.first_name_male()])

# generate a random date
print(fake.date())

# generate a birthdate for someone between 13 and 100 years old
print(fake.date_of_birth(minimum_age=13, maximum_age=100))

# generate a fake address
print(fake.address())

# generate fake user profile data
print(fake.profile())

## Using Faker to create the `stores` table

The `stores` table will be used to track the different store locations throughout the country, including their coordinates, city, and state. We want this data to be as realistic as possible so that we can map the stores later in the workshop, so we will be using the `Nominatim` geocoder from the `geopy` library to reverse-geocode randomly generated coordinates into real street, city, and state names.

Create the `generate_store()` function

In [None]:
fake.local_latlng()

In [None]:
from geopy import Nominatim

locator = Nominatim(user_agent='myGeocoder')

def generate_store():
    """Generate one fake store record placed at a real US coordinate.

    A US coordinate is drawn with Faker and reverse-geocoded through the
    module-level Nominatim ``locator``. The resolved road, locality and
    state are combined with fake id, opening-date and street-number values.

    Returns:
        dict: keys ``store_id``, ``opened_date``, ``latitude``,
        ``longitude``, ``store_address``, ``city`` and ``state``.

    Raises:
        KeyError: if the reverse-geocoded address has no ``"state"`` entry.
    """
    coords = fake.local_latlng(country_code="US")
    location = locator.reverse(coords[:2]).raw
    address = location["address"]

    # Nominatim files the locality under a different key depending on the
    # kind of place, so try them from most to least specific. Per Faker's
    # documentation the third element of local_latlng() is the place name,
    # which serves as a last resort instead of raising KeyError.
    city_town = next(
        (address[key] for key in ("city", "town", "county") if key in address),
        coords[2],
    )

    # Not every reverse-geocoded point sits on a named road; invent a street
    # name in that case so the record can still be built.
    road = address.get("road") or fake.street_name()

    store = {
        "store_id": fake.pyint(),
        "opened_date": str(fake.date_this_century()),
        "latitude": coords[0],
        "longitude": coords[1],
        "store_address": " ".join([str(fake.pyint()), road]),
        "city": city_town,
        "state": address["state"]
    }

    return store

generate_store()

Create `generate_stores()` function

In [None]:
import pandas as pd

def generate_stores(num_stores):
    """Return a DataFrame holding `num_stores` records from generate_store()."""
    # one generate_store() call per row
    return pd.DataFrame([generate_store() for _ in range(num_stores)])

generate_stores(5)

Create table of 50 stores

In [None]:
stores = generate_stores(50) #if you get a KeyError, run this cell again

Map the newly generated store locations

In [None]:
import folium

# start the map zoomed out over roughly the center of the contiguous US
m = folium.Map(location=[39.8283, -98.5795], zoom_start=4)

# drop one marker per store, labelled with its city and state
for _, store_row in stores.iterrows():
    marker = folium.Marker(
        location=[store_row.latitude, store_row.longitude],
        radius=5,
        tooltip=f"{store_row.city}, {store_row.state}",
    )
    marker.add_to(m)

m

### Exercise: 

#### Part 1
Create the function `generate_customer()` that will generate a dictionary of customer data with the following attributes:

```
customer_id
customer_name
customer_birthday
customer_email
is_member
card_on_file
```

#### Part 2
Generate a CSV file containing 1500 customer records. Name the file `customers.csv`.


------
You can look through the Faker [documentation](https://faker.readthedocs.io/en/master/providers.html) to help you.


In [None]:
def generate_customer():
    """Build a dictionary describing one fake customer.

    Returns:
        dict: keys ``customer_id`` (first segment of a UUID4),
        ``customer_name``, ``customer_birthday`` (age 13 to 110),
        ``customer_email``, ``is_member`` and ``card_on_file``
        (a credit card provider name).
    """
    # Values are drawn in the same order as the keys appear in the result.
    short_id = fake.uuid4().split("-")[0]
    full_name = fake.name()
    birthday = fake.date_of_birth(minimum_age=13, maximum_age=110)
    email = fake.email()
    member_flag = fake.boolean()
    card_provider = fake.credit_card_provider()

    return {
        "customer_id": short_id,
        "customer_name": full_name,
        "customer_birthday": birthday,
        "customer_email": email,
        "is_member": member_flag,
        "card_on_file": card_provider,
    }

generate_customer()

In [None]:
def generate_customers(num_customers):
    """Return a DataFrame holding `num_customers` records from generate_customer()."""
    records = []
    for _ in range(num_customers):
        records.append(generate_customer())

    return pd.DataFrame(records)

generate_customers(5)

In [None]:
customers = generate_customers(500)
customers.describe()

Create the `generate_visit()` function

In [None]:
import random

def generate_visit(store_df, customer_df, visit_date="01-01-2022"):
    """Build a dictionary describing one fake store visit.

    Args:
        store_df: DataFrame with a ``store_id`` column; one row is sampled.
        customer_df: DataFrame with a ``customer_id`` column; one row is sampled.
        visit_date: value stored unchanged under ``visit_date``.

    Returns:
        dict: keys ``visit_id``, ``visit_date``, ``store_id``,
        ``customer_id``, ``order_total`` and ``payment_method``.
    """
    short_id = str(fake.uuid4().split("-")[0])
    sampled_store = store_df.sample()["store_id"].values[0]
    sampled_customer = customer_df.sample()["customer_id"].values[0]

    # a uniform fraction scaled by a randomly chosen ceiling, rounded to cents
    fraction = random.random()
    ceiling = random.choice([10, 100, 500, 1000])
    total = round(fraction * ceiling, 2)

    method = random.choice(["cash", "credit"])

    return {
        "visit_id": short_id,
        "visit_date": visit_date,
        "store_id": sampled_store,
        "customer_id": sampled_customer,
        "order_total": total,
        "payment_method": method,
    }

generate_visit(stores, customers)

Create `generate_visits()` function

In [None]:
def generate_visits(num_visits, store_df, customer_df, visit_date="01-01-2022"):
    """Return a DataFrame of `num_visits` fake visits that share one date.

    Args:
        num_visits: number of rows to generate.
        store_df: DataFrame with a ``store_id`` column to sample from.
        customer_df: DataFrame with a ``customer_id`` column to sample from.
        visit_date: value stored unchanged in every row's ``visit_date``.

    Returns:
        pandas.DataFrame: one row per visit with columns ``visit_id``,
        ``visit_date``, ``store_id``, ``customer_id``, ``order_total``
        and ``payment_method``.
    """

    def _one_visit():
        # closes over store_df, customer_df and visit_date from the outer call
        short_id = str(fake.uuid4().split("-")[0])
        sampled_store = store_df.sample().store_id.values[0]
        sampled_customer = customer_df.sample().customer_id.values[0]
        total = round(random.random() * random.choice([10, 100, 500, 1000]), 2)
        method = random.choice(["cash", "credit"])
        return {
            "visit_id": short_id,
            "visit_date": visit_date,
            "store_id": sampled_store,
            "customer_id": sampled_customer,
            "order_total": total,
            "payment_method": method,
        }

    rows = [_one_visit() for _ in range(num_visits)]

    return pd.DataFrame(rows)

generate_visits(5, stores, customers, visit_date="01-01-2022")

## Use functions to create seed_data()

In [None]:
from pathlib import Path

def seed_data(start_date, end_date, directory, num_stores, num_customers):
    """Generate stores, customers and daily visits and write them out as CSV.

    Creates `directory` if needed, then writes ``stores.csv``,
    ``customers.csv`` and ``visits.csv`` into it. Every day from
    `start_date` to `end_date` gets a random number of visits
    (between 1 and 9999).

    Args:
        start_date: first day of visits, in any form pd.date_range accepts.
        end_date: last day of visits, in any form pd.date_range accepts.
        directory: output folder for the three CSV files.
        num_stores: number of store records to generate.
        num_customers: number of customer records to generate.
    """
    out_dir = Path(directory)
    out_dir.mkdir(parents=True, exist_ok=True)

    store_df = generate_stores(num_stores)
    store_df.to_csv(f"{directory}/stores.csv", index=False)

    customer_df = generate_customers(num_customers)
    customer_df.to_csv(f"{directory}/customers.csv", index=False)

    # one DataFrame of visits per calendar day, stitched together at the end
    daily_frames = [
        generate_visits(random.randrange(1, 10000), store_df, customer_df, visit_date=day)
        for day in pd.date_range(start_date, end_date)
    ]

    all_visits = pd.concat(daily_frames)
    all_visits.to_csv(f"{directory}/visits.csv", index=False)

seed_data("01-01-2022", "06-01-2022", "data/db", num_stores=50, num_customers=1500)
    
    

# BREAK - BACK TO WORKSHOP GUIDE

## Set up database for generated data

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html

In [None]:
# only run this cell if you no longer have the stores, customers, and visits dataframes in your environment
import pandas as pd

customers = pd.read_csv("data/db/customers.csv")
stores = pd.read_csv("data/db/stores.csv")
visits = pd.read_csv("data/db/visits.csv")

In [None]:
from sqlalchemy import create_engine
engine = create_engine('sqlite:///data.db', echo=False)

# Write each dataframe to its own table. if_exists="replace" makes this setup
# cell safe to re-run: the to_sql default ("fail") raises ValueError as soon
# as the tables exist from an earlier run.
customers.to_sql("customers", con=engine, if_exists="replace", index=False)
stores.to_sql("stores", con=engine, if_exists="replace", index=False)
visits.to_sql("visits", con=engine, if_exists="replace", index=False)

# release the pooled connections now that the tables are written
engine.dispose()

In [None]:
from sqlalchemy import text

# read every row of the customers table back to confirm the write worked
with engine.connect() as conn:
    res = conn.execute(text("SELECT * FROM customers")).fetchall()

# display the fetched rows as a dataframe
pd.DataFrame(res)

# Back to Workshop Guide - Create Live App

## Send new data to database

In [None]:
import time
from sqlalchemy import create_engine
import pandas as pd

def generate_data(db_engine, start_date, end_date, time_delay=2):
    """Append one batch of fake visits per day to the ``visits`` table.

    Reads the ``customers`` and ``stores`` tables once, then for every day
    from `start_date` to `end_date` generates a random number of visits
    (between 1 and 9999), appends them to ``visits``, prints a progress
    line and sleeps.

    Args:
        db_engine: SQLAlchemy engine for the target database.
        start_date: first day to generate, in any form pd.date_range accepts.
        end_date: last day to generate, in any form pd.date_range accepts.
        time_delay: seconds to sleep after each day's insert.
    """
    # The connection is only needed for these two reads, so release it here
    # rather than keeping it checked out through every sleep in the loop.
    with db_engine.connect() as conn:
        customers = pd.read_sql("customers", conn)
        stores = pd.read_sql("stores", conn)

    for day in pd.date_range(start_date, end_date):
        visits = generate_visits(random.randrange(1, 10000), stores, customers, visit_date=day)
        visits.to_sql("visits", con=db_engine, if_exists='append', index=False)
        print(f"inserted {len(visits)} records from {str(day)}")
        print("---")
        time.sleep(time_delay)
        

engine = create_engine("sqlite:///data.db", echo=False)
# The seed data already includes visits for 2022-06-01 (the seed_data end
# date is inclusive), so start the live feed on the following day to avoid
# generating that day twice.
generate_data(engine, "2022-06-02", "2022-12-31")

In [None]:
from sqlalchemy import text
# if `engine` is no longer defined, recreate it first:
# engine = create_engine("sqlite:///data.db", echo=False)
with engine.connect() as conn:
    test = conn.execute(text("SELECT * FROM visits")).fetchall()

# number of rows currently in the visits table
len(test)