# <center> Data Wrangling & Consistency Checks </center>

## 01. Import libraries & Original Data

In [1]:
import pandas as pd
import numpy as np
import os

# Project root folder; every data path below is built from it with os.path.join.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
path = r'C:\Users\thang\Desktop\Data Analytics\Data Analytics Immersion\Python\Instacart Basket Analysis'

In [2]:
# Load the four smaller source tables from the 'Original' data folder.
original_dir = os.path.join(path, '2. Data', 'Original')
customers = pd.read_csv(os.path.join(original_dir, 'customers.csv'), index_col=False)
departments = pd.read_csv(os.path.join(original_dir, 'departments.csv'), index_col=False)
orders = pd.read_csv(os.path.join(original_dir, 'orders.csv'), index_col=False)
products = pd.read_csv(os.path.join(original_dir, 'products.csv'), index_col=False)

In [3]:
# order_products_prior is by far the largest table (about 32.4 million rows in
# the recorded run), so it is read in a cell of its own.
ord_prods_prior = pd.read_csv(os.path.join(path, '2. Data', 'Original', 'order_products_prior.csv'), index_col = False)

## 02. Data Wrangling 

### customers

In [19]:
customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [20]:
# shape check
customers.shape

(206209, 10)

In [4]:
# renaming columns
# Bring the customer headers to lower snake_case; 'Surnam' is the (misspelled)
# header as it appears in the source file.
customer_column_names = {
    'user_id': 'customer_id',
    'First Name': 'first_name',
    'Surnam': 'last_name',
    'Gender': 'gender',
    'STATE': 'state',
    'Age': 'age',
}
customers.rename(columns=customer_column_names, inplace=True)

In [5]:
# dropping columns
# Remove the 'date_joined' column from customers in place.
customers.drop('date_joined', axis=1, inplace=True)

In [23]:
# final check
customers.head()

Unnamed: 0,customer_id,first_name,last_name,gender,state,age,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1,married,40374


### departments

In [24]:
departments.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [6]:
# transposing
# The file stores departments as one wide row; flip it so that each
# department becomes its own row.
departments = departments.transpose()

In [26]:
departments.head()

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce


In [7]:
# fix header
# After the transpose, the first row holds the text 'department', which is the
# real column label: promote it to the header and keep only the data rows.
# The former bare `departments.reset_index()` call returned a new frame that
# was never assigned, so it had no effect and has been removed.
# NOTE(review): the department ids therefore stay in the index (they come from
# the CSV header, so presumably as strings) — confirm before joining on them.
new_header = departments.iloc[0]
departments = departments[1:]
departments.columns = new_header

In [28]:
# final check
departments.head()

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol


### orders

In [29]:
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [30]:
# shape check
orders.shape

(3421083, 7)

In [8]:
# renaming columns
# Use the same 'customer_id' key name as the customers table and spell out
# the abbreviated day-of-week column.
order_column_names = {
    'user_id': 'customer_id',
    'order_dow': 'order_day_of_week',
}
orders.rename(columns=order_column_names, inplace=True)

In [9]:
# dropping columns
# Remove the 'eval_set' column from orders in place.
orders.drop('eval_set', axis=1, inplace=True)

In [33]:
# final check
orders.head()

Unnamed: 0,order_id,customer_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


### products

In [34]:
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [35]:
# shape check
products.shape

(49693, 5)

### order_products_prior

In [36]:
ord_prods_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [37]:
ord_prods_prior.shape

(32434489, 4)

## 03. Data Consistency 

### customers

In [10]:
# missing values
customers.isnull().sum()

customer_id         0
first_name      11259
last_name           0
gender              0
state               0
age                 0
n_dependants        0
fam_status          0
income              0
dtype: int64

In [11]:
# duplicates
customers[customers.duplicated()]

Unnamed: 0,customer_id,first_name,last_name,gender,state,age,n_dependants,fam_status,income


In [12]:
# mixed-type
# Print every column whose values map to more than one Python type.
# Series.map replaces DataFrame.applymap (deprecated since pandas 2.1), and
# counting distinct types avoids the .iloc[0] lookup that fails on an empty frame.
for col in customers.columns:
    if customers[col].map(type).nunique() > 1:
        print(col)

first_name


In [13]:
# fix the mixed-type
# first_name contains missing values (NaN is a float), which shows up as a
# str/float mix. A plain astype('str') would turn those into the literal text
# 'nan' and hide them from isnull(), so only non-missing values are cast.
first_names = customers['first_name']
customers['first_name'] = first_names.where(first_names.isnull(), first_names.astype('str'))

In [53]:
# check shape
customers.shape

(206209, 9)

### departments

In [14]:
# missing values 
departments.isnull().sum()

department_id
department    0
dtype: int64

In [15]:
# duplicates
departments[departments.duplicated()]

department_id,department


In [16]:
# mixed-type
# Print every column whose values map to more than one Python type.
# Series.map replaces DataFrame.applymap (deprecated since pandas 2.1), and
# counting distinct types avoids the .iloc[0] lookup that fails on an empty frame.
for col in departments.columns:
    if departments[col].map(type).nunique() > 1:
        print(col)

### orders

In [17]:
# missing values
# NOTE(review): in the recorded run only days_since_prior_order has gaps, and
# their count (206209) equals the number of customer rows — presumably each
# customer's first order, which has no prior order; confirm before imputing.
orders.isnull().sum()

order_id                       0
customer_id                    0
order_number                   0
order_day_of_week              0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [18]:
# duplicates
orders[orders.duplicated()]

Unnamed: 0,order_id,customer_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order


In [19]:
# mixed-type
# Print every column whose values map to more than one Python type.
# Series.map replaces DataFrame.applymap (deprecated since pandas 2.1), and
# counting distinct types avoids the .iloc[0] lookup that fails on an empty frame.
for col in orders.columns:
    if orders[col].map(type).nunique() > 1:
        print(col)

In [55]:
# check shape
orders.shape

(3421083, 6)

### products

In [20]:
# missing values
products.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [21]:
# fix the missing values
# Discard, in place, every row that has a missing value in any column.
products.dropna(axis=0, how='any', inplace=True)

In [49]:
# duplicates
products[products.duplicated()]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [22]:
# drop the duplicates
# Keep the first occurrence of each fully identical row and discard the rest.
products = products.drop_duplicates(keep='first')

In [23]:
# check the result
products[products.duplicated()]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices


In [24]:
# mixed-type
# Print every column whose values map to more than one Python type.
# Series.map replaces DataFrame.applymap (deprecated since pandas 2.1), and
# counting distinct types avoids the .iloc[0] lookup that fails on an empty frame.
for col in products.columns:
    if products[col].map(type).nunique() > 1:
        print(col)

In [34]:
# check shape
products.shape

(49672, 5)

### order_products_prior

In [25]:
# missing values
ord_prods_prior.isnull().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

In [26]:
# duplicates 
ord_prods_prior[ord_prods_prior.duplicated()]

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered


In [27]:
# mixed-type
# Print every column whose values map to more than one Python type.
# Series.map replaces DataFrame.applymap (deprecated since pandas 2.1), and
# counting distinct types avoids the .iloc[0] lookup that fails on an empty frame.
for col in ord_prods_prior.columns:
    if ord_prods_prior[col].map(type).nunique() > 1:
        print(col)

In [64]:
# check shape
ord_prods_prior.shape

(32434489, 4)

## 04. Downcasting

### customers

In [67]:
# check types
customers.dtypes

customer_id      int64
first_name      object
last_name       object
gender          object
state           object
age              int64
n_dependants     int64
fam_status      object
income           int64
dtype: object

In [68]:
# check min & max for numerical values 
customers.describe()

Unnamed: 0,customer_id,age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [28]:
# change data types
# Target dtype per column: smaller integers for the numeric columns and
# categoricals for the label columns. Columns not listed keep their dtype.
customer_dtypes = {
    'customer_id': 'int32',
    'gender': 'category',
    'state': 'category',
    'age': 'int8',
    'n_dependants': 'int8',
    'fam_status': 'category',
    'income': 'int32',
}
for column_name, target_dtype in customer_dtypes.items():
    customers[column_name] = customers[column_name].astype(target_dtype)

### departments

In [70]:
# check types
departments.dtypes

department_id
department    object
dtype: object

In [29]:
# change data types
# Store the department labels as a pandas categorical instead of plain objects.
departments['department'] = departments['department'].astype('category')

### orders

In [72]:
# check types
orders.dtypes

order_id                    int64
customer_id                 int64
order_number                int64
order_day_of_week           int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [73]:
# check min & max for numerical values 
orders.describe()

Unnamed: 0,order_id,customer_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [30]:
# change data types
# Shrink the integer columns; days_since_prior_order is not listed and so
# keeps its float64 dtype.
order_dtypes = {
    'order_id': 'int32',
    'customer_id': 'int32',
    'order_number': 'int8',
    'order_day_of_week': 'int8',
    'order_hour_of_day': 'int8',
}
for column_name, target_dtype in order_dtypes.items():
    orders[column_name] = orders[column_name].astype(target_dtype)

### products

In [76]:
# check types
products.dtypes

product_id         int64
product_name      object
aisle_id           int64
department_id      int64
prices           float64
dtype: object

In [78]:
# check min & max for numerical values
# NOTE(review): the recorded output shows prices max = 99999.0 against a 75th
# percentile of 11.1 — looks like a placeholder/outlier; confirm and handle it.
products.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49677.0,49677.0,49677.0,49677.0
mean,24850.194235,67.76311,11.728687,9.993164
std,14340.588602,38.316396,5.850651,453.592708
min,1.0,1.0,1.0,1.0
25%,12433.0,35.0,7.0,4.1
50%,24851.0,69.0,13.0,7.1
75%,37267.0,100.0,17.0,11.1
max,49688.0,134.0,21.0,99999.0


In [31]:
# change data types
products['product_id'] = products['product_id'].astype('int32')
products['product_name'] = products['product_name'].astype('str')
products['aisle_id'] = products['aisle_id'].astype('int16')
products['department_id'] = products['department_id'].astype('int8')
# prices are fractional (e.g. 5.8, 9.3); the former int32 cast truncated every
# price to a whole number, so downcast to a smaller float type instead.
products['prices'] = products['prices'].astype('float32')

### order_products_prior

In [81]:
# check types
ord_prods_prior.dtypes

order_id             int64
product_id           int64
add_to_cart_order    int64
reordered            int64
dtype: object

In [82]:
# check min & max for numerical values 
ord_prods_prior.describe()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
count,32434490.0,32434490.0,32434490.0,32434490.0
mean,1710749.0,25576.34,8.351076,0.5896975
std,987300.7,14096.69,7.126671,0.4918886
min,2.0,1.0,1.0,0.0
25%,855943.0,13530.0,3.0,0.0
50%,1711048.0,25256.0,6.0,1.0
75%,2565514.0,37935.0,11.0,1.0
max,3421083.0,49688.0,145.0,1.0


In [32]:
# change types
# Shrink every integer column of the largest table to a narrower integer type.
ord_prods_prior_dtypes = {
    'order_id': 'int32',
    'product_id': 'int32',
    'add_to_cart_order': 'int16',
    'reordered': 'int8',
}
for column_name, target_dtype in ord_prods_prior_dtypes.items():
    ord_prods_prior[column_name] = ord_prods_prior[column_name].astype(target_dtype)

## 05. Export

In [33]:
# Write each processed table as a pickle into the 'Modified' data folder.
modified_dir = os.path.join(path, '2. Data', 'Modified')
exports = {
    'customers_cleaned.pkl': customers,
    'departments_wrangled.pkl': departments,
    'orders_cleaned.pkl': orders,
    'products_cleaned.pkl': products,
    'order_products_prior_cleaned.pkl': ord_prods_prior,
}
for file_name, frame in exports.items():
    frame.to_pickle(os.path.join(modified_dir, file_name))