# EXERCISE WALKTHROUGH

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
data_path = r'/home/nevesfernandes/20250701 Instacart Basket Analysis/2 Data/'
df_prods = pd.read_csv(os.path.join(data_path, '1 Original Data', 'products.csv'), index_col = False)
df_orders = pd.read_csv(os.path.join(data_path, '2 Prepared Data', 'orders_wrangled.csv'), index_col = False)

### Test dataframe to deal with mixed datatypes

In [3]:
df_test = pd.DataFrame()
df_test['mix'] = ['a', 'b', 1, True]
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [4]:
#check for mixed datatypes

for col in df_test.columns.tolist():
  weird = (df_test[[col]].map(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_test[weird]) > 0:
    print (col)

mix


In [5]:
df_test['mix'] = df_test['mix'].astype('str')

In [6]:
#checking again for mixed datatypes

for col in df_test.columns.tolist():
  weird = (df_test[[col]].map(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_test[weird]) > 0:
    print (col)

No return, it means that the column mix was fixed in terms of datatype

In [7]:
#on a review process of the notebooks, I know by now that I won't need aisle_id for anything in my analysis.
#therefore, I'm dropping it, to ensure i'm saving as much memory as possible in the future

df_prods.drop('aisle_id',axis=1,inplace=True)

In [8]:
df_prods.isnull().sum()

product_id        0
product_name     16
department_id     0
prices            0
dtype: int64

In [9]:
df_nan = df_prods[df_prods['product_name'].isnull() == True]
df_nan

Unnamed: 0,product_id,product_name,department_id,prices
33,34,,14,12.2
68,69,,7,11.8
115,116,,3,10.8
261,262,,13,12.1
525,525,,11,1.2
1511,1511,,16,14.3
1780,1780,,11,12.3
2240,2240,,1,14.2
2586,2586,,13,12.4
3159,3159,,11,13.1


In [10]:
df_prods_clean = df_prods[df_prods['product_name'].isnull() == False]
df_prods_clean.shape

(49677, 4)

In [11]:
df_dups = df_prods_clean[df_prods_clean.duplicated()]
df_dups

Unnamed: 0,product_id,product_name,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,11,4.8
18459,18458,Ranger IPA,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,14,6.8
35495,35491,Adore Forever Body Wash,11,9.9


In [12]:
df_prods_clean.shape

(49677, 4)

In [13]:
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()
df_prods_clean_no_dups.shape

(49672, 4)

In [14]:
df_prods_clean_no_dups['product_id'].value_counts(dropna=False)

product_id
26520    2
6800     2
1        1
33136    1
33127    1
        ..
16576    1
16577    1
16578    1
16579    1
49688    1
Name: count, Length: 49670, dtype: int64

In [15]:
index_dup_keys = df_prods_clean_no_dups[df_prods_clean_no_dups['product_id'].isin([26520,6800])].index
df_prods_clean_no_dups.drop(index_dup_keys , inplace=True)
df_prods_clean_no_dups.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prods_clean_no_dups.drop(index_dup_keys , inplace=True)


(49668, 4)

In [16]:
df_sort_by_price = df_prods_clean_no_dups.sort_values('prices')

In [17]:
df_sort_by_price.tail()

Unnamed: 0,product_id,product_name,department_id,prices
25580,25579,Naturally Smoked Trout Fillet,12,25.0
19392,19391,Turkey Breast Tenderloins,12,25.0
21468,21467,Wild Caught Raw Shrimp,12,25.0
21554,21553,Lowfat 2% Milkfat Cottage Cheese,16,14900.0
33666,33664,2 % Reduced Fat Milk,16,99999.0


We can see we have two products with completely illogical prices. After checking market prices, my educated guess is that these two products have their prices multiplied by 10000, and the real prices are respectively 1.49 and 9.99 (this is hard decision, as the product name doesn't uncover the quantity. For example: is the milk 1 Gallon? Is it one quarter?).

Another option would be to simply remove these 2 products.

In [18]:
df_prods_clean_no_dups.loc[df_prods_clean_no_dups['product_id'] == 21553, 'prices'] = 1.49
df_prods_clean_no_dups.loc[df_prods_clean_no_dups['product_id'] == 33664, 'prices'] = 9.99

In [19]:
df_sort_by_price = df_prods_clean_no_dups.sort_values('prices')
df_sort_by_price.tail()

Unnamed: 0,product_id,product_name,department_id,prices
25580,25579,Naturally Smoked Trout Fillet,12,25.0
19392,19391,Turkey Breast Tenderloins,12,25.0
9020,9020,Boneless Skinless Chicken Thighs,12,25.0
40490,40486,Chicken Tenders,12,25.0
21468,21467,Wild Caught Raw Shrimp,12,25.0


In [20]:
df_prods_clean_no_dups.to_csv(os.path.join(data_path,'2 Prepared Data', 'products_checked.csv'))

# TASK

### Step 2 - Run describe function over orders dataframe

In [21]:
df_orders.describe()

Unnamed: 0,order_id,user_id,customer_sequential_order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


* **order_id** assumes values up to 3.421.083, which is according to what we saw as number of orders. Make sense.
* **user_id** has no negatives, has values from 1 up to 206.209, which most likely is the number of existing customers (with orders recorded, of course)
* **customer_sequential_order_number** assumes values from 1 up to 100. It means that the customer that placed more orders, placed 100 orders. It's plausible. To be noted that the average is only 11 (and the Q25 and Q75 are 5 and 23), which means this distribution is skewed, with many more customers with limited amount of orders, and few with many many orders. But in terms of consistency seems ok.
* **orders_day_of_week** Values are ok, from 0 (Saturday) to 6 (Friday). Mean is exactly 3! with 3 millions records, might make sense...
* **order_hour_of_day** minimum (0) and maximum (23) values make sense. Mean and quartiles show that half of the orders are made between 10 am and 4 pm (interval of 6 hours), while the other 18 hours of the day receive the remaining 50%. Everything looks fine in terms of consistency.
* **days_since_prior_order** values vary from 0 (new order from a customer on the same day) up to 30 (took a month to make a new order). Nothing to suspect here as well

### Step 3 - verification of mixed datatypes in orders dataframe

In [None]:
#NOTE: following a warning while running the kernel, I've changed the function applymap to map, because the first is deprecated
#Added a control flag to output something in case no columns are found

col_with_mixed_types = False
for col in df_orders.columns.tolist():
  weird = (df_orders[[col]].map(type) != df_orders[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_orders[weird]) > 0:
    print (col)
    col_with_mixed_types = True
if (not col_with_mixed_types):
  print('No columns with mixed datatypes found')


### Step 5 - verification if there are missing values in orders dataframe

In [None]:
df_orders.isnull().sum()

It makes sense the number of Nulls found in *days_since_prior_order*. This number matched exactly the number of customers, and this is because on their very first order it can't be assigned any value (they haven't shopped with InstaCart yet).

To change it a numeric value would be a mistake, and would jeopardize any numerical analysis I'll want to make with this field. when I want to perform statistics over this field, I should simply keep this in mind and filter out these values, for example making a subset where `customer_sequential_order_number != 1`.

### Step 6 - addressing missing values

As explained above, I won't do any transformation. When it comes to operate over this column, I'll filter out the `Nan`.

### Step 7 - verification of existence of duplicates in orders dataframe

In [None]:
df_dup_orders = df_orders[df_orders.duplicated()]
df_dup_orders

No duplicated orders found!

### Step 8 - addressing duplicate values

N/A

### Step 9 - exporting cleaned datasets

**NOTE**: Products dataframe was exported by the end of the Exercise Walkthrough

In [None]:
df_orders.to_csv(os.path.join(data_path,'2 Prepared Data', 'orders_checked.csv'), index = False)