## Attention

An order might have multiple items.

Each item might be fulfiled by a distinct seller

In [1]:
#Import required libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
# import streamlit as st
# import plotly

In [2]:
# The datasets were combined into one Excel file with multiple sheets
#Load workbook
xl = pd.ExcelFile('olist_store_dataset.xlsx', engine='openpyxl')

In [3]:
# list of sheets containing the datasets
xl.sheet_names

['customers_data',
 'geolocation_data',
 'order_items_data',
 'order_payments_data',
 'order_reviews_data',
 'orders_data',
 'products_data',
 'sellers_data',
 'product_categories_data']

### Load the tables from Excel worksheet to a pandas dataframe

In [4]:
# Customers data sheet
customers_df = pd.read_excel(xl, sheet_name='customers_data')
customers_df.head(2)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP


In [5]:
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB


In [6]:
# change column datatypes
convert_dict = {
    'customer_zip_code_prefix': str,
    'customer_city': 'category',
    'customer_state': 'category',
}
customers_df = customers_df.astype(convert_dict)

# Let's standardize customer_zip_code_prefix digits to 5 for the column
customers_df.customer_zip_code_prefix = customers_df.customer_zip_code_prefix.str.zfill(5)

customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   customer_id               99441 non-null  object  
 1   customer_unique_id        99441 non-null  object  
 2   customer_zip_code_prefix  99441 non-null  object  
 3   customer_city             99441 non-null  category
 4   customer_state            99441 non-null  category
dtypes: category(2), object(3)
memory usage: 2.7+ MB


In [7]:
customers_df.customer_zip_code_prefix.sample(5)

93962    95555
72677    12043
10819    08141
96348    20230
92991    88090
Name: customer_zip_code_prefix, dtype: object

In [8]:
# geolocation data sheet
geolocation_df = pd.read_excel(xl, sheet_name='geolocation_data')
geolocation_df

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.644820,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP
...,...,...,...,...,...
1000158,99950,-28.068639,-52.010705,tapejara,RS
1000159,99900,-27.877125,-52.224882,getulio vargas,RS
1000160,99950,-28.071855,-52.014716,tapejara,RS
1000161,99980,-28.388932,-51.846871,david canabarro,RS


In [9]:
geolocation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   geolocation_zip_code_prefix  1000163 non-null  int64  
 1   geolocation_lat              1000163 non-null  float64
 2   geolocation_lng              1000163 non-null  float64
 3   geolocation_city             1000163 non-null  object 
 4   geolocation_state            1000163 non-null  object 
dtypes: float64(2), int64(1), object(2)
memory usage: 38.2+ MB


- change datatype for some columns
- rename geolocation_zip_code_prefix to seller_zip_code_prefix to merge with sellers_df

In [76]:
# Change data types of listed columns
convert_dict = {
    'geolocation_zip_code_prefix' : str,
    'geolocation_city': 'category',
    'geolocation_state': 'category',
}
geolocation_df = geolocation_df.astype(convert_dict)


# Let's standardize geolocation_zip_code_prefix digits to 5 for the column
geolocation_df.geolocation_zip_code_prefix = geolocation_df.geolocation_zip_code_prefix.str.zfill(5)

# Drop all duplicates in the geolocation_zip_code_prefix column and return the dataframe
geolocation_df = geolocation_df.drop_duplicates(subset=['geolocation_zip_code_prefix'])

In [77]:
geolocation_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19015 entries, 0 to 999846
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   geolocation_zip_code_prefix  19015 non-null  object  
 1   geolocation_lat              19015 non-null  float64 
 2   geolocation_lng              19015 non-null  float64 
 3   geolocation_city             19015 non-null  category
 4   geolocation_state            19015 non-null  category
dtypes: category(2), float64(2), object(1)
memory usage: 971.8+ KB


In [78]:
geolocation_df.geolocation_zip_code_prefix.sample(10)

817370    77308
753129    64207
44726     03183
71010     04018
709968    50870
281685    12830
300395    13336
808437    75905
56138     03442
899618    87955
Name: geolocation_zip_code_prefix, dtype: object

In [38]:
# order_items data sheet
order_items_df = pd.read_excel(xl, sheet_name='order_items_data')
order_items_df.head(2)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93


In [14]:
order_items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   order_id             112650 non-null  object        
 1   order_item_id        112650 non-null  int64         
 2   product_id           112650 non-null  object        
 3   seller_id            112650 non-null  object        
 4   shipping_limit_date  112650 non-null  datetime64[ns]
 5   price                112650 non-null  float64       
 6   freight_value        112650 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 6.0+ MB


- Change datatypes for some columns
- Drop the shipping_limit_date column

In [15]:
convert_dict = {
    'order_item_id': str,
    'product_id': str,
    'seller_id': str,    
}

order_items_df = order_items_df.astype(convert_dict)
order_items_df = order_items_df.drop(columns=['shipping_limit_date'])

In [16]:
order_items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   order_id       112650 non-null  object 
 1   order_item_id  112650 non-null  object 
 2   product_id     112650 non-null  object 
 3   seller_id      112650 non-null  object 
 4   price          112650 non-null  float64
 5   freight_value  112650 non-null  float64
dtypes: float64(2), object(4)
memory usage: 5.2+ MB


In [141]:
# order_payments data sheet
order_payments_df = pd.read_excel(xl, sheet_name='order_payments_data')
order_payments_df.head(2)

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39


In [142]:
order_payments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              103886 non-null  object 
 1   payment_sequential    103886 non-null  int64  
 2   payment_type          103886 non-null  object 
 3   payment_installments  103886 non-null  int64  
 4   payment_value         103886 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB


In [143]:
order_payments_df.duplicated().any()

False

In [144]:
# order_payments_df contains duplicate rows
order_payments_df[order_payments_df.duplicated()]

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value


In [145]:
# Change datatype
convert_dict = {
    'order_id': str,
    'payment_type': 'category', 
}
order_payments_df = order_payments_df.astype(convert_dict)

# drop irrelevant columns
order_payments_df = order_payments_df.drop(columns=['payment_sequential', 'payment_installments'])
order_payments_df.info()

# drop duplicates
order_payments_df = order_payments_df.drop_duplicates()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   order_id       103886 non-null  object  
 1   payment_type   103886 non-null  category
 2   payment_value  103886 non-null  float64 
dtypes: category(1), float64(1), object(1)
memory usage: 1.7+ MB


In [146]:
order_payments_df.duplicated().any()

False

In [147]:
# order_reviews data sheet
order_reviews_df = pd.read_excel(xl, sheet_name='order_reviews_data')
order_reviews_df.head(2)

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10,2018-03-11 03:05:13


In [148]:
order_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   review_id                99224 non-null  object        
 1   order_id                 99224 non-null  object        
 2   review_score             99224 non-null  int64         
 3   review_comment_title     11567 non-null  object        
 4   review_comment_message   40974 non-null  object        
 5   review_creation_date     99224 non-null  datetime64[ns]
 6   review_answer_timestamp  99224 non-null  datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(4)
memory usage: 5.3+ MB


In [149]:
# change dtype
convert_dict = {
    'review_score': 'category',
}
order_reviews_df = order_reviews_df.astype(convert_dict)

# drop irrelevant columns
order_reviews_df = order_reviews_df.drop(columns=['review_creation_date','review_comment_message',
                                                  'review_comment_title','review_answer_timestamp',
                                                 ])
order_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   review_id     99224 non-null  object  
 1   order_id      99224 non-null  object  
 2   review_score  99224 non-null  category
dtypes: category(1), object(2)
memory usage: 1.6+ MB


In [150]:
# orders data sheet
orders_df = pd.read_excel(xl, sheet_name='orders_data')
orders_df.head(2)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13


In [151]:
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  object        
 1   customer_id                    99441 non-null  object        
 2   order_status                   99441 non-null  object        
 3   order_purchase_timestamp       99441 non-null  datetime64[ns]
 4   order_approved_at              99281 non-null  datetime64[ns]
 5   order_delivered_carrier_date   97658 non-null  datetime64[ns]
 6   order_delivered_customer_date  96476 non-null  datetime64[ns]
 7   order_estimated_delivery_date  99441 non-null  datetime64[ns]
dtypes: datetime64[ns](5), object(3)
memory usage: 6.1+ MB


In [152]:
# change dtype
orders_df.order_status = orders_df.order_status.astype('category')

# drop irrelevant columns
orders_df = orders_df.drop(columns = ['order_purchase_timestamp','order_delivered_carrier_date', 
                                      'order_estimated_delivery_date'
                                     ])
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  object        
 1   customer_id                    99441 non-null  object        
 2   order_status                   99441 non-null  category      
 3   order_approved_at              99281 non-null  datetime64[ns]
 4   order_delivered_customer_date  96476 non-null  datetime64[ns]
dtypes: category(1), datetime64[ns](2), object(2)
memory usage: 3.1+ MB


In [153]:
# products data sheet
products_df = pd.read_excel(xl, sheet_name='products_data')
products_df.head(2)

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0


In [154]:
products_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id                  32951 non-null  object 
 1   product_category_name       32341 non-null  object 
 2   product_name_lenght         32341 non-null  float64
 3   product_description_lenght  32341 non-null  float64
 4   product_photos_qty          32341 non-null  float64
 5   product_weight_g            32949 non-null  float64
 6   product_length_cm           32949 non-null  float64
 7   product_height_cm           32949 non-null  float64
 8   product_width_cm            32949 non-null  float64
dtypes: float64(7), object(2)
memory usage: 2.3+ MB


In [155]:
# drop irrelevant columns
products_df = products_df.drop(columns=['product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm'])
products_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   product_id             32951 non-null  object
 1   product_category_name  32341 non-null  object
dtypes: object(2)
memory usage: 515.0+ KB


In [156]:
# sellers data sheet
sellers_df = pd.read_excel(xl, sheet_name='sellers_data')
sellers_df.sample(10)

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
2623,1746c1ae87ac63d530c7c943d0ac42e2,6501,santana de parnaiba,SP
1610,20cb7c2fde3e5bf10f0bbe7394e1c6a9,86385,andira-pr,PR
2693,57e632711dec9ec14ca7546769483e7e,88372,navegantes,SC
3079,4e2627090e6e5b9fabba883a37897683,31565,belo horizonte,MG
1631,34f563c82a85b99ae9e6d60db5fc2e28,93700,campo bom,RS
659,6426d21aca402a131fc0a5d0960a3c90,14091,ribeirao preto,SP
165,d6cd01c59123df02fc226eadbadb5f89,1207,sao paulo,SP
992,c2bda99904207edd1834f03e0022b007,8280,sao paulo,SP
3070,db6a4d4b5f1f5f98820ce6ce2619e2de,2968,sao paulo,SP
1650,458d868c9642f55f18f04e951a3fdde6,90670,porto alegre,RS


In [157]:
sellers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seller_id               3095 non-null   object
 1   seller_zip_code_prefix  3095 non-null   int64 
 2   seller_city             3095 non-null   object
 3   seller_state            3095 non-null   object
dtypes: int64(1), object(3)
memory usage: 96.8+ KB


In [158]:
# change data types of columns
convert_dict = {
    'seller_zip_code_prefix': str,
    'seller_id': str,
    'seller_city': 'category',
    'seller_state': 'category',
}
sellers_df = sellers_df.astype(convert_dict)

# Let's standardize seller_zip_code_prefix digits to 5 for the column
sellers_df.seller_zip_code_prefix = sellers_df.seller_zip_code_prefix.str.zfill(5)

sellers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   seller_id               3095 non-null   object  
 1   seller_zip_code_prefix  3095 non-null   object  
 2   seller_city             3095 non-null   category
 3   seller_state            3095 non-null   category
dtypes: category(2), object(2)
memory usage: 79.2+ KB


In [159]:
# product_categories data sheet
product_categories_df = pd.read_excel(xl, sheet_name='product_categories_data')
product_categories_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71 entries, 0 to 70
Data columns (total 2 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   product_category_name          71 non-null     object
 1   product_category_name_english  71 non-null     object
dtypes: object(2)
memory usage: 1.2+ KB


## Using the schema below, let's merge the tables.

>1.	An order might have multiple items.
2.	Each item might be fulfilled by a distinct seller.
3.	All text identifying stores and partners were replaced by the names of Game of Thrones great houses.


<img src='schema.png' alt='Table schema' width='750px'>

In [160]:
#  orders_df + order_reviews = orders
orders = pd.merge(orders_df, order_reviews_df, on='order_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99992 entries, 0 to 99991
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99992 non-null  object        
 1   customer_id                    99992 non-null  object        
 2   order_status                   99992 non-null  category      
 3   order_approved_at              99831 non-null  datetime64[ns]
 4   order_delivered_customer_date  97005 non-null  datetime64[ns]
 5   review_id                      99224 non-null  object        
 6   review_score                   99224 non-null  category      
dtypes: category(2), datetime64[ns](2), object(3)
memory usage: 4.8+ MB


In [161]:
orders.duplicated().any()

False

In [162]:
# orders + order_payments_df = orders
orders = pd.merge(orders, order_payments_df, on='order_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103857 entries, 0 to 103856
Data columns (total 9 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       103857 non-null  object        
 1   customer_id                    103857 non-null  object        
 2   order_status                   103857 non-null  category      
 3   order_approved_at              103691 non-null  datetime64[ns]
 4   order_delivered_customer_date  100736 non-null  datetime64[ns]
 5   review_id                      103060 non-null  object        
 6   review_score                   103060 non-null  category      
 7   payment_type                   103856 non-null  category      
 8   payment_value                  103856 non-null  float64       
dtypes: category(3), datetime64[ns](2), float64(1), object(3)
memory usage: 5.8+ MB


In [163]:
order_payments_df.query("order_id == '8ca5bdac5ebe8f2d6fc9171d5ebc906a'")

Unnamed: 0,order_id,payment_type,payment_value
752,8ca5bdac5ebe8f2d6fc9171d5ebc906a,voucher,15.0
37465,8ca5bdac5ebe8f2d6fc9171d5ebc906a,credit_card,59.08
83047,8ca5bdac5ebe8f2d6fc9171d5ebc906a,voucher,25.0


In [164]:
# orders + order_items_df = orders
orders = pd.merge(orders, order_items_df, on='order_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118432 entries, 0 to 118431
Data columns (total 15 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       118432 non-null  object        
 1   customer_id                    118432 non-null  object        
 2   order_status                   118432 non-null  category      
 3   order_approved_at              118265 non-null  datetime64[ns]
 4   order_delivered_customer_date  115044 non-null  datetime64[ns]
 5   review_id                      117438 non-null  object        
 6   review_score                   117438 non-null  category      
 7   payment_type                   118429 non-null  category      
 8   payment_value                  118429 non-null  float64       
 9   order_item_id                  117618 non-null  float64       
 10  product_id                     117618 non-null  object        
 11  

In [165]:
# orders + products = orders
orders = pd.merge(orders, products_df, on='product_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118432 entries, 0 to 118431
Data columns (total 16 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       118432 non-null  object        
 1   customer_id                    118432 non-null  object        
 2   order_status                   118432 non-null  category      
 3   order_approved_at              118265 non-null  datetime64[ns]
 4   order_delivered_customer_date  115044 non-null  datetime64[ns]
 5   review_id                      117438 non-null  object        
 6   review_score                   117438 non-null  category      
 7   payment_type                   118429 non-null  category      
 8   payment_value                  118429 non-null  float64       
 9   order_item_id                  117618 non-null  float64       
 10  product_id                     117618 non-null  object        
 11  

In [166]:
# orders + sellers_df = orders
orders = pd.merge(orders, sellers_df, on='seller_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118432 entries, 0 to 118431
Data columns (total 19 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       118432 non-null  object        
 1   customer_id                    118432 non-null  object        
 2   order_status                   118432 non-null  category      
 3   order_approved_at              118265 non-null  datetime64[ns]
 4   order_delivered_customer_date  115044 non-null  datetime64[ns]
 5   review_id                      117438 non-null  object        
 6   review_score                   117438 non-null  category      
 7   payment_type                   118429 non-null  category      
 8   payment_value                  118429 non-null  float64       
 9   order_item_id                  117618 non-null  float64       
 10  product_id                     117618 non-null  object        
 11  

In [167]:
# orders + geolocations_df = orders
orders = pd.merge(orders, geolocation_df, left_on = 'seller_zip_code_prefix', right_on = 'geolocation_zip_code_prefix', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118432 entries, 0 to 118431
Data columns (total 24 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       118432 non-null  object        
 1   customer_id                    118432 non-null  object        
 2   order_status                   118432 non-null  category      
 3   order_approved_at              118265 non-null  datetime64[ns]
 4   order_delivered_customer_date  115044 non-null  datetime64[ns]
 5   review_id                      117438 non-null  object        
 6   review_score                   117438 non-null  category      
 7   payment_type                   118429 non-null  category      
 8   payment_value                  118429 non-null  float64       
 9   order_item_id                  117618 non-null  float64       
 10  product_id                     117618 non-null  object        
 11  

In [171]:
# orders + customers_df = orders
orders = pd.merge(orders, customers_df, on='customer_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118432 entries, 0 to 118431
Data columns (total 28 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       118432 non-null  object        
 1   customer_id                    118432 non-null  object        
 2   order_status                   118432 non-null  category      
 3   order_approved_at              118265 non-null  datetime64[ns]
 4   order_delivered_customer_date  115044 non-null  datetime64[ns]
 5   review_id                      117438 non-null  object        
 6   review_score                   117438 non-null  category      
 7   payment_type                   118429 non-null  category      
 8   payment_value                  118429 non-null  float64       
 9   order_item_id                  117618 non-null  float64       
 10  product_id                     117618 non-null  object        
 11  

### Finally, let's rename our combined table to sales_data

In [172]:
# order
sales_data = orders.copy()
sales_data

Unnamed: 0,order_id,customer_id,order_status,order_approved_at,order_delivered_customer_date,review_id,review_score,payment_type,payment_value,order_item_id,...,seller_state,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 11:07:15,2017-10-10 21:25:13,a54f0611adc9ed256b57ede6b6eb5114,4,credit_card,18.12,1.0,...,SP,09350,-23.680114,-46.452454,maua,SP,7c396fd4830fd04220f754e42b4e5bff,03149,sao paulo,SP
1,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 11:07:15,2017-10-10 21:25:13,a54f0611adc9ed256b57ede6b6eb5114,4,voucher,2.00,1.0,...,SP,09350,-23.680114,-46.452454,maua,SP,7c396fd4830fd04220f754e42b4e5bff,03149,sao paulo,SP
2,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 11:07:15,2017-10-10 21:25:13,a54f0611adc9ed256b57ede6b6eb5114,4,voucher,18.59,1.0,...,SP,09350,-23.680114,-46.452454,maua,SP,7c396fd4830fd04220f754e42b4e5bff,03149,sao paulo,SP
3,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-26 03:24:27,2018-08-07 15:27:45,8d5266042046a06655c8db133d120ba5,4,boleto,141.46,1.0,...,SP,31570,-19.810119,-43.984727,belo horizonte,MG,af07308b275d755c9edb36a90c618231,47813,barreiras,BA
4,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:55:23,2018-08-17 18:06:29,e73b67b67587f7644d5bd1a52deb1b01,5,credit_card,179.12,1.0,...,SP,14840,-21.362358,-48.232976,guariba,SP,3a653a41f6f9fc3d2a113cf8398680e8,75265,vianopolis,GO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118427,63943bddc261676b46f01ca7ac2f7bd8,1fca14ff2861355f6e5f14306ff977a7,delivered,2018-02-06 13:10:37,2018-02-28 17:37:56,29bb71b2760d0f876dfa178a76bc4734,4,credit_card,195.00,1.0,...,SP,17602,-21.935321,-50.497562,tupa,SP,da62f9e57a76d978d02ab5362c509660,11722,praia grande,SP
118428,83c1379a015df1e13d02aae0204711ab,1aa71eb042121263aafbe80c1b562c9c,delivered,2017-08-27 15:04:16,2017-09-21 11:24:17,371579771219f6db2d830d50805977bb,5,credit_card,271.01,1.0,...,SP,08290,-23.551013,-46.448489,sao paulo,SP,737520a9aad80b3fbbdad19b66b37b30,45920,nova vicosa,BA
118429,11c177c8e97725db2631073c19f07b62,b331b74b18dc79bcdf6532d51e1637c1,delivered,2018-01-08 21:36:21,2018-01-25 23:32:54,8ab6855b9fe9b812cd03a480a25058a1,2,credit_card,441.16,1.0,...,MG,37175,-20.944706,-45.827098,ilicinea,MG,5097a5312c8b157bb7be58ae360ef43c,28685,japuiba,RJ
118430,11c177c8e97725db2631073c19f07b62,b331b74b18dc79bcdf6532d51e1637c1,delivered,2018-01-08 21:36:21,2018-01-25 23:32:54,8ab6855b9fe9b812cd03a480a25058a1,2,credit_card,441.16,2.0,...,MG,37175,-20.944706,-45.827098,ilicinea,MG,5097a5312c8b157bb7be58ae360ef43c,28685,japuiba,RJ


## Clean the combined table data

In [177]:
# Export to csv for visual cleaning
to_csv = sales_data.to_csv('dirty_sales_data.csv', index=False)

### Data Quality Issues
- Product category column not in English
- Replace '_' with ' ' in category names
- City names are in lower case
- wrong datatype: review_score, payment_sequential, payment_installments, payment_type, customer_unique_id, geolocation_zip_code_prefix, geolocation_state, geolocation_city,
- Inconsistent customer zipcode prefix format. 4 digits sometimes, 5 digits other times
- non-ascii characters: geolocation city
- Inconsistent spelling formats. e.g 'sÃ£o paulo', 'sao paulo'; getÃºlio vargas, getulio vargas; etc.
- Certain columns are of type float instead of int
- wrong datatype: order_item_id
- Wrong datatype: payment_type,  payment_sequential, payment_installments
- wrong datatype: customer_city, customer_state, customer_zip_code_prefix
- Inconsistent customer zipcode prefix format. 4 digits sometimes, 5 digits other times
- City name in lower case