In [1]:
import pandas as pd


# Load the 5 semicolon-separated datasets, in a fixed order:
#   aisles, departments, instacart_orders, order_products, products
dfs = [
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        './data/aisles.csv',
        './data/departments.csv',
        './data/instacart_orders.csv',
        './data/order_products.csv',
        './data/products.csv',
    )
]

# Short aliases: the prefix letter is the first letter of the source file name
# ('a' = aisles, 'd' = departments, 'i' = instacart_orders,
#  'o' = order_products, 'p' = products).
a_df, d_df, i_df, o_df, p_df = dfs


In [2]:
# Look at some metadata
def print_lots_of_info(df, n_rows=5):
    """Print a quick-look summary of a DataFrame.

    Prints, in order: ``df.info()`` with non-null counts, the shape,
    ``df.describe()``, the first ``n_rows`` rows, a random sample of at most
    ``n_rows`` rows, and the per-column report from ``do_each_col_too``.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarize.
    n_rows : int, default 5
        How many rows to show for the head and for the random sample.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    df.info(show_counts=True)
    print(df.shape)
    print()
    print(df.describe())
    print()
    print(df.head(n_rows))
    print()
    # DataFrame.sample raises ValueError when asked for more rows than the
    # frame holds (sampling without replacement), so clamp the sample size.
    print(df.sample(min(n_rows, len(df))))
    print()
    do_each_col_too(df)
    print()
    print()

def do_each_col_too(df):
    """Print a uniqueness / duplication report for every column of ``df``.

    For each column this prints the number of unique values and, when any
    value occurs more than once, the count and the list of the distinct
    values that are repeated. A blank line follows each column's report.
    """
    for col_name in df.columns:
        column = df[col_name]
        print(col_name, 'count of unique values =', column.nunique())

        # keep=False flags every occurrence of a repeated value, so .unique()
        # yields each repeated value exactly once.
        repeated = column[column.duplicated(keep=False)].unique()
        if len(repeated) > 0:
            print(f"Column '{col_name}' has {len(repeated)} duplicated values: {repeated}")
        print()



# Print the full summary for each loaded frame, in the order they appear in `dfs`.
# Note: the loop variable `df` stays bound to the last frame after the loop ends.
for df in dfs:
    print_lots_of_info(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   aisle_id  134 non-null    int64 
 1   aisle     134 non-null    object
dtypes: int64(1), object(1)
memory usage: 2.2+ KB
(134, 2)

         aisle_id
count  134.000000
mean    67.500000
std     38.826537
min      1.000000
25%     34.250000
50%     67.500000
75%    100.750000
max    134.000000

   aisle_id                       aisle
0         1       prepared soups salads
1         2           specialty cheeses
2         3         energy granola bars
3         4               instant foods
4         5  marinades meat preparation

     aisle_id                aisle
37         38         frozen meals
26         27        beers coolers
2           3  energy granola bars
36         37        ice cream ice
126       127    body lotions soap

aisle_id count of unique values = 134

aisle count of unique values =

In [3]:
# Drop rows of the orders frame that are exact copies of an earlier row,
# keeping the first occurrence. The `dfs` list built earlier still holds the
# pre-deduplication frame until `dfs` is rebuilt.
i_df = i_df.drop_duplicates(keep='first')

In [4]:
# Column renames: one {old column name: new column name} mapping per frame.
# Mappings are positional, in the order a_df, d_df, i_df, o_df, p_df.
col_renames = [
    # a_df
    dict(aisle_id='id', aisle='name'),
    # d_df
    dict(department_id='id', department='name'),
    # i_df
    dict(
        order_id='id',
        order_number='user_order_number',
        order_dow='day_of_week_numeral',
    ),
    # o_df
    dict(add_to_cart_order='cart_item_id'),
    # p_df
    dict(product_id='id', product_name='name'),
]

In [5]:

# Apply the positional rename mappings: col_renames[k] goes with the k-th
# frame in the order a_df, d_df, i_df, o_df, p_df.
a_df, d_df, i_df, o_df, p_df = (
    frame.rename(columns=mapping)
    for frame, mapping in zip((a_df, d_df, i_df, o_df, p_df), col_renames)
)

# Rebuild the list so it points at the renamed frames
# ('a' = aisles, 'd' = departments, 'i' = instacart_orders,
#  'o' = order_products, 'p' = products).
dfs = [a_df, d_df, i_df, o_df, p_df]

# Print the summaries again to confirm the new column names.
for df in dfs:
    print_lots_of_info(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      134 non-null    int64 
 1   name    134 non-null    object
dtypes: int64(1), object(1)
memory usage: 2.2+ KB
(134, 2)

               id
count  134.000000
mean    67.500000
std     38.826537
min      1.000000
25%     34.250000
50%     67.500000
75%    100.750000
max    134.000000

   id                        name
0   1       prepared soups salads
1   2           specialty cheeses
2   3         energy granola bars
3   4               instant foods
4   5  marinades meat preparation

      id                           name
29    30                   latino foods
11    12                    fresh pasta
106  107                 chips pretzels
19    20                   oral hygiene
114  115  water seltzer sparkling water

id count of unique values = 134

name count of unique values = 134



<class 'panda

## Find and remove duplicate values (and describe why you make your choices)

### `orders` data frame

In [7]:
# Check for duplicated orders


In [8]:
# Check for all orders placed Wednesday at 2:00 AM


In [9]:
# Remove duplicate orders


In [10]:
# Double check for duplicate rows


In [11]:
# Double check for duplicate order IDs only


### `products` data frame

In [12]:
# Check for fully duplicate rows


In [13]:
# Check for just duplicate product IDs


In [14]:
# Check for just duplicate product names (convert names to lowercase to compare better)


In [15]:
# Check for duplicate product names that aren't missing


### `departments` data frame

### `aisles` data frame

### `order_products` data frame

In [16]:
# Check for fully duplicate rows


In [17]:
# Double check for any other tricky duplicates


## Find and remove missing values


### `products` data frame

In [18]:
# Are all of the missing product names associated with aisle ID 100?


In [19]:
# Are all of the missing product names associated with department ID 21?


In [20]:
# What is this aisle and department?


In [21]:
# Fill missing product names with 'Unknown'


### `orders` data frame

In [22]:
# Are there any missing values where it's not a customer's first order?


### `order_products` data frame

In [23]:
# What are the min and max values in this column?


In [24]:
# Save all order IDs with at least one missing value in 'add_to_cart_order'


In [25]:
# Do all orders with missing values have more than 64 products?


In [26]:
# Replace missing values with 999 and convert column to integer type


# [A] Easy (must complete all to pass)

### [A1] Verify that the `'order_hour_of_day'` and `'order_dow'` values in the `orders` table are sensible (i.e. `'order_hour_of_day'` ranges from 0 to 23 and `'order_dow'` ranges from 0 to 6)

### [A2] What time of day do people shop for groceries?

### [A3] What day of the week do people shop for groceries?

### [A4] How long do people wait until placing another order?

# [B] Medium (must complete all to pass)

### [B1] Is there a difference in `'order_hour_of_day'` distributions on Wednesdays and Saturdays? Plot the histograms for both days and describe the differences that you see.

### [B2] What's the distribution for the number of orders per customer?

### [B3] What are the top 20 most popular products (display their id and name)?

# [C] Hard (must complete at least two to pass)

### [C1] How many items do people typically buy in one order? What does the distribution look like?

### [C2] What are the top 20 items that are reordered most frequently (display their names and product IDs)?

### [C3] For each product, what proportion of its orders are reorders?

### [C4] For each customer, what proportion of their products ordered are reorders?

### [C5] What are the top 20 items that people put in their carts first? 