# Foot Traffic Data Analysis

In [31]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px

ModuleNotFoundError: No module named 'matplotlib'

In [3]:
stores = pd.read_csv("data/db/stores.csv", dtype={"store_id":"string"})
customers = pd.read_csv("data/db/customers.csv", dtype={"customer_id":"string"})

visits = pd.read_csv("data/db/visits.csv", dtype={"customer_id":"string", "store_id":"string"})\
    .merge(stores, on="store_id", how="left")\
    .merge(customers, on="customer_id", how="left")

visits["visit_date"] = pd.to_datetime(visits["visit_date"])

display(stores.head())
display(customers.head())
display(visits.head())

Unnamed: 0,store_id,opened_date,latitude,longitude,store_address,city,state
0,985,2004-07-30,45.53929,-122.38731,4651 Southeast 3rd Street,Troutdale,Oregon
1,3534,2022-06-14,40.72371,-73.95097,151 Nassau Avenue,City of New York,New York
2,1907,2015-06-22,26.68451,-80.66756,8718 South Main Street,Belle Glade,Florida
3,5420,2006-10-27,47.38093,-122.23484,6823 2nd Avenue South,Kent,Washington
4,5393,2020-09-13,25.67927,-80.31727,7735 South Dixie Highway,Kendall,Florida


Unnamed: 0,customer_id,customer_name,customer_birthday,customer_email,is_member,card_on_file
0,a6990b5f,Jessica Cooper,1927-04-26,hammondamanda@yahoo.com,True,VISA 13 digit
1,03cbd012,Madison Lawrence,1937-12-24,mwalls@yahoo.com,False,VISA 13 digit
2,fb918ae3,Peter Duncan,1934-12-30,nicholas21@gmail.com,True,JCB 15 digit
3,57c4f41f,Matthew Williams,1982-08-10,sonia20@martinez-wilcox.net,True,Mastercard
4,3be53fdb,Luke Gilbert,1999-07-13,victoria53@yahoo.com,False,Mastercard


Unnamed: 0,visit_id,visit_date,store_id,customer_id,order_total,payment_method,opened_date,latitude,longitude,store_address,city,state,customer_name,customer_birthday,customer_email,is_member,card_on_file
0,a96c5a04,2020-01-01,955,3b373019,871.52,cash,2014-02-04,40.68066,-73.47429,5164 Hicksville Road,Massapequa,New York,Mr. Kevin Pittman Jr.,1952-01-17,robertwang@santana-rice.info,True,Maestro
1,39e9a393,2020-01-01,8531,69367926,41.63,cash,2007-01-12,40.82232,-74.15987,823 Hillside Avenue,Nutley,New Jersey,Dr. David Parks MD,1991-05-12,gperry@garcia.info,True,Discover
2,fcf841c6,2020-01-01,4047,8fcbb073,118.07,credit,2018-12-04,42.35843,-71.05977,1139 Tremont Street,Boston,Massachusetts,Joseph Maddox,1981-12-16,david78@george.com,True,American Express
3,4330bb3d,2020-01-01,7790,7b902f29,80.75,credit,2021-08-06,45.0408,-93.263,4256 University Avenue,Columbia Heights,Minnesota,Brian Fields,1935-04-29,ofisher@hotmail.com,False,VISA 16 digit
4,0ec1288e,2020-01-01,7803,0d1942a9,9.68,cash,2000-12-04,39.71734,-74.96933,8217 Williamstown Road,Winslow Township,New Jersey,Amanda Cox,1932-11-27,tiffany83@glover-cooley.com,False,Discover


## Calculating a time series of total visits by day

In [26]:
visits_by_day = visits.groupby("visit_date").size()\
    .reset_index().rename(columns={0:"visit_count"})

visits_by_day

Unnamed: 0,visit_date,visit_count
0,2020-01-01,4930
1,2020-01-02,995
2,2020-01-03,9182
3,2020-01-04,217
4,2020-01-05,4115
...,...,...
148,2020-05-28,2669
149,2020-05-29,9146
150,2020-05-30,7125
151,2020-05-31,6898


Resampling for 2-week averages

In [29]:
visits_by_day = visits.groupby("visit_date").size().resample("2W").mean()\
    .reset_index().rename(columns={0:"visit_count"})
visits_by_day

Unnamed: 0,visit_date,visit_count
0,2020-01-05,3887.8
1,2020-01-19,3772.142857
2,2020-02-02,3883.428571
3,2020-02-16,5305.214286
4,2020-03-01,5827.428571
5,2020-03-15,5900.214286
6,2020-03-29,5001.142857
7,2020-04-12,4896.714286
8,2020-04-26,4385.714286
9,2020-05-10,5696.642857


In [30]:
visits_by_day.plot()

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.

In [9]:
members = visits[visits["is_member"] == True]
mtiers = members.groupby("customer_id").sum()["order_total"].reset_index()
mtiers["membership_tier"] = pd.qcut(mtiers["order_total"], [0,0.5, 0.85, 0.95], labels=["bronze", "silver", "gold"])
df = visits.merge(mtiers[["customer_id", "membership_tier"]], on="customer_id", how="left")
df["membership_tier"] = df["membership_tier"].astype("string").fillna("Nonmember").astype("category")
df.groupby(["visit_date", "membership_tier"]).mean()["order_total"].unstack()#.plot()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



membership_tier,Nonmember,bronze,gold,silver
visit_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-01,205.837250,186.856891,212.122134,192.164642
2020-01-02,203.580593,168.280996,229.307736,211.683769
2020-01-03,202.552633,201.204713,210.049017,206.355542
2020-01-04,213.919524,184.765455,241.224444,183.872917
2020-01-05,202.297409,207.387244,228.677261,200.698487
...,...,...,...,...
2020-05-28,199.844967,204.253265,216.415906,210.714187
2020-05-29,198.971923,199.484884,192.078484,218.386806
2020-05-30,204.005329,185.697598,208.455100,204.610649
2020-05-31,199.708923,183.114376,190.207061,207.455340


In [20]:
x = df["membership_tier"].value_counts().reset_index().rename(columns={"index":"membership_tier", "membership_tier":"count"})
px.pie(x, names="membership_tier", values="count", hole=0.4)

In [7]:
x = df["payment_method"].value_counts().to_frame().reset_index()
x.columns = ["payment_method", "count"]
x

px.pie(x, names='payment_method', values='count', hole=0.4)