# Data Visualization and Exploration - Python

- Jackson Cown

In [1]:
import numpy as np
import pandas as pd
import datetime as dt

### Loading the Datasets

In [2]:
# Load file paths
items_path = "../data/items.csv"
category_hierarchy_path = "../data/category_hierarchy.csv"
orders_path = "../data/orders.csv"
# Load datasets into pandas
items = pd.read_csv(items_path, sep="|")
category_hierarchy = pd.read_csv(category_hierarchy_path, sep="|")
orders = pd.read_csv(orders_path, sep="|")

In [3]:
# Check Items
items.head()

Unnamed: 0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories
0,22665,861,4,0,490,2,66,"[2890, 855, 3908, 3909]"
1,28640,1366,10,1,537,0,101,
2,13526,1090,10,0,511,0,0,"[3270, 163, 284, 1694, 12, 3837, 2422, 3595, 3..."
3,21399,1090,10,1,511,0,0,[3270]
4,8504,768,4,1,484,0,66,[2470]


In [4]:
# Check Category Hierarchy
category_hierarchy.head()

Unnamed: 0,category,parent_category
0,0,75
1,1,1499
2,2,1082
3,3,3498
4,4,1623


In [5]:
# Check Orders
orders.head()

Unnamed: 0,date,userID,itemID,order
0,2020-06-01,38769,3477,1
1,2020-06-01,42535,30474,1
2,2020-06-01,42535,15833,1
3,2020-06-01,42535,20131,1
4,2020-06-01,42535,4325,1


### Munging the Dataset

In [6]:
# Generate Dates
start = dt.datetime.strptime("2020-06-01", "%Y-%m-%d")
date_generated = pd.date_range(start, periods=245)
#print(list(date_generated.strftime("%Y-%m-%d")))
#print(date_generated)

In [7]:
# Init dict for storing date order pairs
date_order_dict = {}
for i, date in enumerate(date_generated):
    # Format date for comparisn
    date = date.strftime("%Y-%m-%d")
    # Compile and sum list of orders per day; update dict
    ordered = np.sum([item for item in orders.loc[orders["date"] == date].order])
    date_order_dict[i] = [date, ordered]
    print(f'Date: {date}; \nOrder Count: {ordered}\n')

Date: 2020-06-01; 
Order Count: 7148

Date: 2020-06-02; 
Order Count: 6039

Date: 2020-06-03; 
Order Count: 6781

Date: 2020-06-04; 
Order Count: 5904

Date: 2020-06-05; 
Order Count: 5118

Date: 2020-06-06; 
Order Count: 9105

Date: 2020-06-07; 
Order Count: 8017

Date: 2020-06-08; 
Order Count: 3482

Date: 2020-06-09; 
Order Count: 1516

Date: 2020-06-10; 
Order Count: 2248

Date: 2020-06-11; 
Order Count: 3238

Date: 2020-06-12; 
Order Count: 4706

Date: 2020-06-13; 
Order Count: 8719

Date: 2020-06-14; 
Order Count: 8189

Date: 2020-06-15; 
Order Count: 5820

Date: 2020-06-16; 
Order Count: 6849

Date: 2020-06-17; 
Order Count: 6440

Date: 2020-06-18; 
Order Count: 5991

Date: 2020-06-19; 
Order Count: 4641

Date: 2020-06-20; 
Order Count: 2170

Date: 2020-06-21; 
Order Count: 7508

Date: 2020-06-22; 
Order Count: 7133

Date: 2020-06-23; 
Order Count: 5158

Date: 2020-06-24; 
Order Count: 3510

Date: 2020-06-25; 
Order Count: 5235

Date: 2020-06-26; 
Order Count: 4776

Date: 2020-0

Date: 2021-01-04; 
Order Count: 8608

Date: 2021-01-05; 
Order Count: 7916

Date: 2021-01-06; 
Order Count: 6911

Date: 2021-01-07; 
Order Count: 6120

Date: 2021-01-08; 
Order Count: 5032

Date: 2021-01-09; 
Order Count: 7914

Date: 2021-01-10; 
Order Count: 8859

Date: 2021-01-11; 
Order Count: 7791

Date: 2021-01-12; 
Order Count: 7997

Date: 2021-01-13; 
Order Count: 7231

Date: 2021-01-14; 
Order Count: 6022

Date: 2021-01-15; 
Order Count: 5687

Date: 2021-01-16; 
Order Count: 8427

Date: 2021-01-17; 
Order Count: 8340

Date: 2021-01-18; 
Order Count: 9534

Date: 2021-01-19; 
Order Count: 8401

Date: 2021-01-20; 
Order Count: 7513

Date: 2021-01-21; 
Order Count: 6215

Date: 2021-01-22; 
Order Count: 6334

Date: 2021-01-23; 
Order Count: 8610

Date: 2021-01-24; 
Order Count: 8564

Date: 2021-01-25; 
Order Count: 8195

Date: 2021-01-26; 
Order Count: 8213

Date: 2021-01-27; 
Order Count: 7029

Date: 2021-01-28; 
Order Count: 5887

Date: 2021-01-29; 
Order Count: 6682

Date: 2021-0

In [8]:
# Convert dict to pandas DataFrame
date_order_df = pd.DataFrame.from_dict(date_order_dict, orient='index', columns=["date", "num_orders"])
date_order_df.head(10)

Unnamed: 0,date,num_orders
0,2020-06-01,7148
1,2020-06-02,6039
2,2020-06-03,6781
3,2020-06-04,5904
4,2020-06-05,5118
5,2020-06-06,9105
6,2020-06-07,8017
7,2020-06-08,3482
8,2020-06-09,1516
9,2020-06-10,2248


In [9]:
# Export to CSV
date_orders_path = '../data/date_orders.csv'
date_order_df.to_csv(date_orders_path)

In [10]:
# Generating DataFrame for Month-Order Pairs
date_order = pd.read_csv(date_orders_path)
date_order.head()

Unnamed: 0.1,Unnamed: 0,date,num_orders
0,0,2020-06-01,7148
1,1,2020-06-02,6039
2,2,2020-06-03,6781
3,3,2020-06-04,5904
4,4,2020-06-05,5118


In [67]:
# Init dict for storing month-order pairs
month_order_dict = {}
months = ["06", "07", "08", "09", "10", "11", "12", "01"]
month_names = ["June", "July", "August", "September", "October", "November", "December", "January"]
month_and_names = {months[i]:month_names[i] for i in range(len(months))}
for month in months:
    for i, item in enumerate(date_order.date):
        if (item[5:7] == month):
            month_order_dict[int(month)] = month_order_dict.get(int(month), 0) + date_order.num_orders[i]
            print(f'Month: {month}; Date: {item}; Count: {date_order.num_orders[i]}')
    print(f'{month_and_names[month]} Total Orders: {month_order_dict[int(month)]}\n')

Month: 06; Date: 2020-06-01; Count: 7148
Month: 06; Date: 2020-06-02; Count: 6039
Month: 06; Date: 2020-06-03; Count: 6781
Month: 06; Date: 2020-06-04; Count: 5904
Month: 06; Date: 2020-06-05; Count: 5118
Month: 06; Date: 2020-06-06; Count: 9105
Month: 06; Date: 2020-06-07; Count: 8017
Month: 06; Date: 2020-06-08; Count: 3482
Month: 06; Date: 2020-06-09; Count: 1516
Month: 06; Date: 2020-06-10; Count: 2248
Month: 06; Date: 2020-06-11; Count: 3238
Month: 06; Date: 2020-06-12; Count: 4706
Month: 06; Date: 2020-06-13; Count: 8719
Month: 06; Date: 2020-06-14; Count: 8189
Month: 06; Date: 2020-06-15; Count: 5820
Month: 06; Date: 2020-06-16; Count: 6849
Month: 06; Date: 2020-06-17; Count: 6440
Month: 06; Date: 2020-06-18; Count: 5991
Month: 06; Date: 2020-06-19; Count: 4641
Month: 06; Date: 2020-06-20; Count: 2170
Month: 06; Date: 2020-06-21; Count: 7508
Month: 06; Date: 2020-06-22; Count: 7133
Month: 06; Date: 2020-06-23; Count: 5158
Month: 06; Date: 2020-06-24; Count: 3510
Month: 06; Date:

In [71]:
# Convert Dict to DataFrame
month_order_df = pd.DataFrame.from_dict(month_order_dict, orient='index', columns=["num_orders"])
month_order_df.insert(0, "month", month_names)
month_order_df.head(10)

Unnamed: 0,month,num_orders
6,June,178050
7,July,171345
8,August,186046
9,September,168536
10,October,177943
11,November,185271
12,December,196761
1,January,235942


In [73]:
# Export month-order pairs to csv
month_order_path = "../data/month_orders.csv"
month_order_df.to_csv(month_order_path)