## Attention

An order might have multiple items.

Each item might be fulfiled by a distinct seller

In [1]:
#Import required libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
# import streamlit as st
# import plotly

In [2]:
# The datasets were combined into one Excel file with multiple sheets
#Load workbook
xl = pd.ExcelFile('olist_store_dataset.xlsx', engine='openpyxl')

In [3]:
# list of sheets containing the datasets
xl.sheet_names

['customers_data',
 'geolocation_data',
 'order_items_data',
 'order_payments_data',
 'order_reviews_data',
 'orders_data',
 'products_data',
 'sellers_data',
 'product_categories_data']

### Load the tables from Excel worksheet to a pandas dataframe

In [4]:
# Customers data sheet
customers_df = pd.read_excel(xl, sheet_name='customers_data')
customers_df.head(2)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP


In [5]:
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB


In [6]:
# change column datatypes
convert_dict = {
    'customer_zip_code_prefix': str,
    'customer_city': 'category',
    'customer_state': 'category',
}

customers_df = customers_df.astype(convert_dict)
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   customer_id               99441 non-null  object  
 1   customer_unique_id        99441 non-null  object  
 2   customer_zip_code_prefix  99441 non-null  object  
 3   customer_city             99441 non-null  category
 4   customer_state            99441 non-null  category
dtypes: category(2), object(3)
memory usage: 2.7+ MB


In [7]:
# geolocation data sheet
geolocation_df = pd.read_excel(xl, sheet_name='geolocation_data')
geolocation_df.head(2)

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP


In [8]:
geolocation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   geolocation_zip_code_prefix  1000163 non-null  int64  
 1   geolocation_lat              1000163 non-null  float64
 2   geolocation_lng              1000163 non-null  float64
 3   geolocation_city             1000163 non-null  object 
 4   geolocation_state            1000163 non-null  object 
dtypes: float64(2), int64(1), object(2)
memory usage: 38.2+ MB


- change datatype for some columns
- rename geolocation_zip_code_prefix to seller_zip_code_prefix to merge with sellers_df

In [9]:
convert_dict = {
    'geolocation_zip_code_prefix' : str,
    'geolocation_city': 'category',
    'geolocation_state': 'category',
}
geolocation_df = geolocation_df.astype(convert_dict)

# rename geolocation_zip_code_prefix to seller_zip_code_prefix to merge with sellers_df
# Due to cache or other errors related to the large file, I had to do this in two steps as shown below

geolocation_df['seller_zip_code_prefix'] = geolocation_df.geolocation_zip_code_prefix
geolocation_df = geolocation_df.drop(columns=['geolocation_zip_code_prefix'])

In [10]:
geolocation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                  Non-Null Count    Dtype   
---  ------                  --------------    -----   
 0   geolocation_lat         1000163 non-null  float64 
 1   geolocation_lng         1000163 non-null  float64 
 2   geolocation_city        1000163 non-null  category
 3   geolocation_state       1000163 non-null  category
 4   seller_zip_code_prefix  1000163 non-null  object  
dtypes: category(2), float64(2), object(1)
memory usage: 26.1+ MB


In [11]:
# order_items data sheet
order_items_df = pd.read_excel(xl, sheet_name='order_items_data')
order_items_df.head(2)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93


In [12]:
order_items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   order_id             112650 non-null  object        
 1   order_item_id        112650 non-null  int64         
 2   product_id           112650 non-null  object        
 3   seller_id            112650 non-null  object        
 4   shipping_limit_date  112650 non-null  datetime64[ns]
 5   price                112650 non-null  float64       
 6   freight_value        112650 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 6.0+ MB


- Change datatypes for some columns
- Drop the shipping_limit_date column

In [13]:
convert_dict = {
    'order_item_id': str,
    'product_id': str,
    'seller_id': str,    
}

order_items_df = order_items_df.astype(convert_dict)
order_items_df = order_items_df.drop(columns=['shipping_limit_date'])

In [14]:
order_items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   order_id       112650 non-null  object 
 1   order_item_id  112650 non-null  object 
 2   product_id     112650 non-null  object 
 3   seller_id      112650 non-null  object 
 4   price          112650 non-null  float64
 5   freight_value  112650 non-null  float64
dtypes: float64(2), object(4)
memory usage: 5.2+ MB


In [15]:
# order_payments data sheet
order_payments_df = pd.read_excel(xl, sheet_name='order_payments_data')
order_payments_df.head(2)

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39


In [16]:
order_payments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              103886 non-null  object 
 1   payment_sequential    103886 non-null  int64  
 2   payment_type          103886 non-null  object 
 3   payment_installments  103886 non-null  int64  
 4   payment_value         103886 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB


In [17]:
# Change datatype
convert_dict = {
    'order_id': str,
    'payment_type': 'category', 
}
order_payments_df = order_payments_df.astype(convert_dict)

# drop irrelevant columns
order_payments_df = order_payments_df.drop(columns=['payment_sequential', 'payment_installments'])
order_payments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   order_id       103886 non-null  object  
 1   payment_type   103886 non-null  category
 2   payment_value  103886 non-null  float64 
dtypes: category(1), float64(1), object(1)
memory usage: 1.7+ MB


In [18]:
# order_reviews data sheet
order_reviews_df = pd.read_excel(xl, sheet_name='order_reviews_data')
order_reviews_df.head(2)

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10,2018-03-11 03:05:13


In [19]:
order_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   review_id                99224 non-null  object        
 1   order_id                 99224 non-null  object        
 2   review_score             99224 non-null  int64         
 3   review_comment_title     11567 non-null  object        
 4   review_comment_message   40974 non-null  object        
 5   review_creation_date     99224 non-null  datetime64[ns]
 6   review_answer_timestamp  99224 non-null  datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(4)
memory usage: 5.3+ MB


In [20]:
# change dtype
convert_dict = {
    'review_score': 'category',
}
order_reviews_df = order_reviews_df.astype(convert_dict)

# drop irrelevant columns
order_reviews_df = order_reviews_df.drop(columns=['review_creation_date','review_comment_message',
                                                  'review_comment_title','review_answer_timestamp',
                                                 ])
order_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   review_id     99224 non-null  object  
 1   order_id      99224 non-null  object  
 2   review_score  99224 non-null  category
dtypes: category(1), object(2)
memory usage: 1.6+ MB


In [21]:
# orders data sheet
orders_df = pd.read_excel(xl, sheet_name='orders_data')
orders_df.head(2)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13


In [22]:
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  object        
 1   customer_id                    99441 non-null  object        
 2   order_status                   99441 non-null  object        
 3   order_purchase_timestamp       99441 non-null  datetime64[ns]
 4   order_approved_at              99281 non-null  datetime64[ns]
 5   order_delivered_carrier_date   97658 non-null  datetime64[ns]
 6   order_delivered_customer_date  96476 non-null  datetime64[ns]
 7   order_estimated_delivery_date  99441 non-null  datetime64[ns]
dtypes: datetime64[ns](5), object(3)
memory usage: 6.1+ MB


In [23]:
# change dtype
orders_df.order_status = orders_df.order_status.astype('category')

# drop irrelevant columns
orders_df = orders_df.drop(columns = ['order_purchase_timestamp','order_delivered_carrier_date', 
                                      'order_estimated_delivery_date'
                                     ])
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  object        
 1   customer_id                    99441 non-null  object        
 2   order_status                   99441 non-null  category      
 3   order_approved_at              99281 non-null  datetime64[ns]
 4   order_delivered_customer_date  96476 non-null  datetime64[ns]
dtypes: category(1), datetime64[ns](2), object(2)
memory usage: 3.1+ MB


In [24]:
# products data sheet
products_df = pd.read_excel(xl, sheet_name='products_data')
products_df.head(2)

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0


In [25]:
products_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id                  32951 non-null  object 
 1   product_category_name       32341 non-null  object 
 2   product_name_lenght         32341 non-null  float64
 3   product_description_lenght  32341 non-null  float64
 4   product_photos_qty          32341 non-null  float64
 5   product_weight_g            32949 non-null  float64
 6   product_length_cm           32949 non-null  float64
 7   product_height_cm           32949 non-null  float64
 8   product_width_cm            32949 non-null  float64
dtypes: float64(7), object(2)
memory usage: 2.3+ MB


In [26]:
# drop irrelevant columns
products_df = products_df.drop(columns=['product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm'])
products_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   product_id             32951 non-null  object
 1   product_category_name  32341 non-null  object
dtypes: object(2)
memory usage: 515.0+ KB


In [27]:
# sellers data sheet
sellers_df = pd.read_excel(xl, sheet_name='sellers_data')
sellers_df.head(2)

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP


In [28]:
sellers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seller_id               3095 non-null   object
 1   seller_zip_code_prefix  3095 non-null   int64 
 2   seller_city             3095 non-null   object
 3   seller_state            3095 non-null   object
dtypes: int64(1), object(3)
memory usage: 96.8+ KB


In [29]:
# change datatypes of columns
convert_dict = {
    'seller_zip_code_prefix': str,
    'seller_id': str,
    'seller_city': 'category',
    'seller_state': 'category',
}
sellers_df = sellers_df.astype(convert_dict)
sellers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   seller_id               3095 non-null   object  
 1   seller_zip_code_prefix  3095 non-null   object  
 2   seller_city             3095 non-null   category
 3   seller_state            3095 non-null   category
dtypes: category(2), object(2)
memory usage: 79.2+ KB


In [30]:
# product_categories data sheet
product_categories_df = pd.read_excel(xl, sheet_name='product_categories_data')
product_categories_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71 entries, 0 to 70
Data columns (total 2 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   product_category_name          71 non-null     object
 1   product_category_name_english  71 non-null     object
dtypes: object(2)
memory usage: 1.2+ KB


## Using the schema below, let's merge the tables.

>1.	An order might have multiple items.
2.	Each item might be fulfilled by a distinct seller.
3.	All text identifying stores and partners were replaced by the names of Game of Thrones great houses.


<img src='schema.png' alt='Table schema' width='750px'>

In [31]:
#  orders_df + order_reviews_df = orders
orders = pd.merge(orders_df, order_reviews_df, on='order_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99992 entries, 0 to 99991
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99992 non-null  object        
 1   customer_id                    99992 non-null  object        
 2   order_status                   99992 non-null  category      
 3   order_approved_at              99831 non-null  datetime64[ns]
 4   order_delivered_customer_date  97005 non-null  datetime64[ns]
 5   review_id                      99224 non-null  object        
 6   review_score                   99224 non-null  category      
dtypes: category(2), datetime64[ns](2), object(3)
memory usage: 4.8+ MB


In [32]:
# orders + order_payments_df = orders
orders = pd.merge(orders, order_payments_df, on='order_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 104478 entries, 0 to 104477
Data columns (total 9 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       104478 non-null  object        
 1   customer_id                    104478 non-null  object        
 2   order_status                   104478 non-null  category      
 3   order_approved_at              104302 non-null  datetime64[ns]
 4   order_delivered_customer_date  101324 non-null  datetime64[ns]
 5   review_id                      103678 non-null  object        
 6   review_score                   103678 non-null  category      
 7   payment_type                   104477 non-null  category      
 8   payment_value                  104477 non-null  float64       
dtypes: category(3), datetime64[ns](2), float64(1), object(3)
memory usage: 5.9+ MB


In [33]:
# orders + customers_df = orders
orders = pd.merge(orders, customers_df, on='customer_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 104478 entries, 0 to 104477
Data columns (total 13 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       104478 non-null  object        
 1   customer_id                    104478 non-null  object        
 2   order_status                   104478 non-null  category      
 3   order_approved_at              104302 non-null  datetime64[ns]
 4   order_delivered_customer_date  101324 non-null  datetime64[ns]
 5   review_id                      103678 non-null  object        
 6   review_score                   103678 non-null  category      
 7   payment_type                   104477 non-null  category      
 8   payment_value                  104477 non-null  float64       
 9   customer_unique_id             104478 non-null  object        
 10  customer_zip_code_prefix       104478 non-null  object        
 11  

In [34]:
# orders + order_items_df = orders
orders = pd.merge(orders, order_items_df, on='order_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119143 entries, 0 to 119142
Data columns (total 18 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       119143 non-null  object        
 1   customer_id                    119143 non-null  object        
 2   order_status                   119143 non-null  category      
 3   order_approved_at              118966 non-null  datetime64[ns]
 4   order_delivered_customer_date  115722 non-null  datetime64[ns]
 5   review_id                      118146 non-null  object        
 6   review_score                   118146 non-null  category      
 7   payment_type                   119140 non-null  category      
 8   payment_value                  119140 non-null  float64       
 9   customer_unique_id             119143 non-null  object        
 10  customer_zip_code_prefix       119143 non-null  object        
 11  

In [35]:
# orders + products_df = orders
orders = pd.merge(orders, products_df, on='product_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119143 entries, 0 to 119142
Data columns (total 19 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       119143 non-null  object        
 1   customer_id                    119143 non-null  object        
 2   order_status                   119143 non-null  category      
 3   order_approved_at              118966 non-null  datetime64[ns]
 4   order_delivered_customer_date  115722 non-null  datetime64[ns]
 5   review_id                      118146 non-null  object        
 6   review_score                   118146 non-null  category      
 7   payment_type                   119140 non-null  category      
 8   payment_value                  119140 non-null  float64       
 9   customer_unique_id             119143 non-null  object        
 10  customer_zip_code_prefix       119143 non-null  object        
 11  

In [36]:
# orders + sellers_df = orders
orders = pd.merge(orders, sellers_df, on='seller_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119143 entries, 0 to 119142
Data columns (total 22 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       119143 non-null  object        
 1   customer_id                    119143 non-null  object        
 2   order_status                   119143 non-null  category      
 3   order_approved_at              118966 non-null  datetime64[ns]
 4   order_delivered_customer_date  115722 non-null  datetime64[ns]
 5   review_id                      118146 non-null  object        
 6   review_score                   118146 non-null  category      
 7   payment_type                   119140 non-null  category      
 8   payment_value                  119140 non-null  float64       
 9   customer_unique_id             119143 non-null  object        
 10  customer_zip_code_prefix       119143 non-null  object        
 11  

# Copy orders df b4 merging with geo

In [37]:
orders_copy = orders.copy()
geolocation_df_copy = geolocation_df.copy()

In [38]:
# orders_copy + geolocation_df = orders_copy
geolocation_df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                  Non-Null Count    Dtype   
---  ------                  --------------    -----   
 0   geolocation_lat         1000163 non-null  float64 
 1   geolocation_lng         1000163 non-null  float64 
 2   geolocation_city        1000163 non-null  category
 3   geolocation_state       1000163 non-null  category
 4   seller_zip_code_prefix  1000163 non-null  object  
dtypes: category(2), float64(2), object(1)
memory usage: 26.1+ MB


In [39]:
geolocation_df_copy.shape

(1000163, 5)

In [40]:
geolocation_df_copy.seller_zip_code_prefix.sample(15)

753463    64220
363084    16400
430420    21545
144133     5887
68568      3923
800800    74805
699455    47700
380550    18056
581331    34006
704634    49046
617795    36555
338803    14711
221151     9195
919001    88810
810470    76230
Name: seller_zip_code_prefix, dtype: object

In [41]:
# Sometimes, the zipcode lenght is 4 digits, other times, 5 digits
# Let's fix this before merging with the zipcode in the orders table
geolocation_df_copy.seller_zip_code_prefix = geolocation_df_copy.seller_zip_code_prefix.str.zfill(5)

In [42]:
geolocation_df_copy.seller_zip_code_prefix.sample(15)

394188    18605
858495    82940
962628    94920
229830    09420
417590    20756
623471    37062
94350     04517
950859    91770
559548    31080
180442    07025
77094     04194
548091    30380
447056    22441
971929    95702
334879    14415
Name: seller_zip_code_prefix, dtype: object

In [43]:
orders_copy.seller_zip_code_prefix.sample(25)

114925    18048
117770    14090
41622     99500
118221    12327
102690     4248
63239      3204
78750     31160
4855       8250
99769     31030
101110    15025
29143     13165
92314     80310
89522      3702
40448      2215
7127       6144
65874      2804
69294     80330
66070      3035
69795      3976
110912     4438
113026    13186
8965      14940
76429      5588
43124      9780
36897     80215
Name: seller_zip_code_prefix, dtype: object

In [44]:
# Also, apply zfill to the seller_zpi_code_prefix of the orders table to rhyme with that of the geolocation table
orders_copy.seller_zip_code_prefix = orders_copy.seller_zip_code_prefix.str.zfill(5)
orders_copy.seller_zip_code_prefix.sample(25)

14801     80610
29016     87230
104845    14840
115827    88352
10445     03471
38724     01212
104038    04403
66090     89023
103078    14085
18286     09190
13331     02150
35277     13419
54329     03804
54825     11900
93063     11701
29747     09571
83173     02407
21236     03561
58637     03573
87409     15025
34409     05849
117173    09270
69139     13481
7509      14940
75702     03504
Name: seller_zip_code_prefix, dtype: object

In [45]:
# Demo dfs
orders_zipcode = pd.DataFrame(data=orders_copy.seller_zip_code_prefix,)
geo_zipcode = pd.DataFrame(data=geolocation_df_copy.seller_zip_code_prefix,)

In [46]:
orders_zipcode

Unnamed: 0,seller_zip_code_prefix
0,09350
1,09350
2,09350
3,31570
4,14840
...,...
119138,17602
119139,08290
119140,37175
119141,37175


In [47]:
geo_zipcode

Unnamed: 0,seller_zip_code_prefix
0,01037
1,01046
2,01046
3,01041
4,01035
...,...
1000158,99950
1000159,99900
1000160,99950
1000161,99980


In [52]:
geo_zipcode.duplicated().sum()

981148

In [50]:
fk_merge = pd.merge(orders_zipcode, geo_zipcode, how='outer')
fk_merge

Unnamed: 0,seller_zip_code_prefix
0,09350
1,09350
2,09350
3,09350
4,09350
...,...
17825683,99920
17825684,99920
17825685,99920
17825686,99952


### Data Quality Issues
- wrong datatype: review_score, payment_sequential, payment_installments, payment_type, customer_unique_id, geolocation_zip_code_prefix, geolocation_state, geolocation_city,
- Inconsistent customer zipcode prefix format. 4 digits sometimes, 5 digits other times
- non-ascii characters: geolocation city
- Inconsistent spelling formats. e.g 'sÃ£o paulo', 'sao paulo'; getÃºlio vargas, getulio vargas; etc.
- Certain columns are of type float instead of int
- wrong datatype: order_item_id
- Wrong datatype: payment_type,  payment_sequential, payment_installments
- wrong datatype: customer_city, customer_state, customer_zip_code_prefix
- Inconsistent customer zipcode prefix format. 4 digits sometimes, 5 digits other times
- City name in lower case
- 

## Now, let's clean up the combined dataset

In [82]:
orders.shape

(17094341, 43)

In [24]:
customers_df.columns

Index(['customer_id', 'customer_unique_id', 'customer_zip_code_prefix',
       'customer_city', 'customer_state'],
      dtype='object')

In [25]:
customers_df.customer_zip_code_prefix.nunique()

14994

In [26]:
geolocation_df.geolocation_zip_code_prefix.nunique()

19015

In [27]:
seller_clean.zip_code_prefix.astype('str')

NameError: name 'seller_clean' is not defined

In [None]:
# def change_col_type(df, col, dtype):
#     df[col] = df[col].astype(str(dtype))

def normalize_char_lenght(df, col, to_lenght):
    '''Method to set the objects in a series to a fixed number of characters'''
    
    #get the original datatype of the column
    dtype = df[col].dtype
    
    #convert to string to apply zfill function
    df[col] = df[col].astype('str')
    
    error_log = []
    for i in df[col]:
        if len(i) == to_lenght:
            pass
        elif len(i) > to_lenght:
            error_log.append('{} is more than desired number of characters. modify manually'.format(i))
        else:
            i = str(i).zfill(to_lenght)
            
        #convert series back to original datatype
        df[col] = df[col].astype(dtype)
        return df[col]
        

In [None]:
normalize_char_lenght(seller_clean, 'zip_code_prefix', 5)

In [None]:
seller_clean.zip_code_prefix

In [None]:
xyz = '354'
# print(xyz.zfill(5))

if len(xyz) == 5:
        pass
elif len(xyz) > 5:
        print('xyz is more than desired digits')
else:
    xyz = xyz.zfill(5)
    print(xyz)

In [None]:
# Merge geolocation and seller dataframes
# First, rename the zipcode prefix in the two tables to match
seller_clean = sellers_df.copy()
geolocation_clean = geolocation_df.copy()
seller_clean.rename(columns={'seller_zip_code_prefix': 'zip_code_prefix'}, inplace=True)
geolocation_clean.rename(columns={'geolocation_zip_code_prefix': 'zip_code_prefix'}, inplace=True)

# Ensure that all zip_code prefixes are of equal character lenght:


merged = pd.merge(seller_clean, geolocation_clean, on='zip_code_prefix', how='right')
merged

In [None]:
sellers_df.shape

In [53]:
sellers_df_copy = sellers_df.copy()

In [59]:
# Merge with geo data first
with_geo = pd.merge(sellers_df_copy, geolocation_df_copy, how='left')

In [60]:
with_geo

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP,-22.898536,-47.063125,campinas,SP
1,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP,-22.895499,-47.061944,campinas,SP
2,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP,-22.891740,-47.060820,campinas,SP
3,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP,-22.895762,-47.066144,campinas,SP
4,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP,-22.896154,-47.062431,campinas,SP
...,...,...,...,...,...,...,...,...
339972,9e25199f6ef7e7c347120ff175652c3b,12051,taubate,SP,-23.011130,-45.592347,taubate,SP
339973,9e25199f6ef7e7c347120ff175652c3b,12051,taubate,SP,-23.013452,-45.584299,taubatÃ©,SP
339974,9e25199f6ef7e7c347120ff175652c3b,12051,taubate,SP,-23.009155,-45.592019,taubate,SP
339975,9e25199f6ef7e7c347120ff175652c3b,12051,taubate,SP,-23.009019,-45.584925,taubate,SP


In [62]:
customers_copy = customers_df.copy()
customers_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   customer_id               99441 non-null  object  
 1   customer_unique_id        99441 non-null  object  
 2   customer_zip_code_prefix  99441 non-null  object  
 3   customer_city             99441 non-null  category
 4   customer_state            99441 non-null  category
dtypes: category(2), object(3)
memory usage: 2.7+ MB


In [70]:
customers_copy.customer_zip_code_prefix = customers_copy.customer_zip_code_prefix.str.zfill(5)
customers_copy.customer_zip_code_prefix.sample(15)

45098    17290
24998    86870
44125    86350
51123    29560
47106    28621
75530    59090
81179    36720
11218    27150
97414    20511
94633    13468
45590    35702
25275    24320
2079     13840
95249    05448
33135    17800
Name: customer_zip_code_prefix, dtype: object

In [72]:
with_customer = pd.merge(customers_copy, geolocation_df_copy, left_on='customer_zip_code_prefix', 
                         right_on='seller_zip_code_prefix', how='left') #.drop(columns=['customer_zip_code_prefix'])
with_customer

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state,seller_zip_code_prefix
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,-20.509897,-47.397866,franca,SP,14409
1,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,-20.497396,-47.399241,franca,SP,14409
2,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,-20.510459,-47.399553,franca,SP,14409
3,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,-20.480940,-47.394161,franca,SP,14409
4,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,-20.515413,-47.398194,franca,SP,14409
...,...,...,...,...,...,...,...,...,...,...
15083728,274fa6071e5e17fe303b9748641082c8,84732c5050c01db9b23e19ba39899398,06703,cotia,SP,-23.599369,-46.905603,cotia,SP,06703
15083729,274fa6071e5e17fe303b9748641082c8,84732c5050c01db9b23e19ba39899398,06703,cotia,SP,-23.593577,-46.910112,cotia,SP,06703
15083730,274fa6071e5e17fe303b9748641082c8,84732c5050c01db9b23e19ba39899398,06703,cotia,SP,-23.584425,-46.892014,cotia,SP,06703
15083731,274fa6071e5e17fe303b9748641082c8,84732c5050c01db9b23e19ba39899398,06703,cotia,SP,-23.595022,-46.918546,cotia,SP,06703


In [66]:
with_customer.duplicated().sum()

3791133