## Attention

An order might have multiple items.

Each item might be fulfiled by a distinct seller

In [1]:
#Import required libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
# import streamlit as st
# import plotly

In [2]:
# The datasets were combined into one Excel file with multiple sheets
#Load workbook
xl = pd.ExcelFile('olist_store_dataset.xlsx', engine='openpyxl')

In [3]:
# list of sheets containing the datasets
xl.sheet_names

['customers_data',
 'geolocation_data',
 'order_items_data',
 'order_payments_data',
 'order_reviews_data',
 'orders_data',
 'products_data',
 'sellers_data',
 'product_categories_data']

### Load the tables from Excel worksheet to a pandas dataframe

In [4]:
# Customers data sheet
customers_df = pd.read_excel(xl, sheet_name='customers_data')
customers_df.head(2)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP


In [5]:
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB


In [6]:
# change column datatypes
convert_dict = {
    'customer_zip_code_prefix': str,
    'customer_city': 'category',
    'customer_state': 'category',
}
customers_df = customers_df.astype(convert_dict)

# Let's standardize customer_zip_code_prefix digits to 5 for the column
customers_df.customer_zip_code_prefix = customers_df.customer_zip_code_prefix.str.zfill(5)

customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   customer_id               99441 non-null  object  
 1   customer_unique_id        99441 non-null  object  
 2   customer_zip_code_prefix  99441 non-null  object  
 3   customer_city             99441 non-null  category
 4   customer_state            99441 non-null  category
dtypes: category(2), object(3)
memory usage: 2.7+ MB


In [7]:
customers_df.customer_zip_code_prefix.sample(5)

93962    95555
72677    12043
10819    08141
96348    20230
92991    88090
Name: customer_zip_code_prefix, dtype: object

In [8]:
# geolocation data sheet
geolocation_df = pd.read_excel(xl, sheet_name='geolocation_data')
geolocation_df

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.644820,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP
...,...,...,...,...,...
1000158,99950,-28.068639,-52.010705,tapejara,RS
1000159,99900,-27.877125,-52.224882,getulio vargas,RS
1000160,99950,-28.071855,-52.014716,tapejara,RS
1000161,99980,-28.388932,-51.846871,david canabarro,RS


In [9]:
geolocation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   geolocation_zip_code_prefix  1000163 non-null  int64  
 1   geolocation_lat              1000163 non-null  float64
 2   geolocation_lng              1000163 non-null  float64
 3   geolocation_city             1000163 non-null  object 
 4   geolocation_state            1000163 non-null  object 
dtypes: float64(2), int64(1), object(2)
memory usage: 38.2+ MB


- change datatype for some columns
- rename geolocation_zip_code_prefix to seller_zip_code_prefix to merge with sellers_df

In [76]:
# Change data types of listed columns
convert_dict = {
    'geolocation_zip_code_prefix' : str,
    'geolocation_city': 'category',
    'geolocation_state': 'category',
}
geolocation_df = geolocation_df.astype(convert_dict)


# Let's standardize geolocation_zip_code_prefix digits to 5 for the column
geolocation_df.geolocation_zip_code_prefix = geolocation_df.geolocation_zip_code_prefix.str.zfill(5)

# Drop all duplicates in the geolocation_zip_code_prefix column and return the dataframe
geolocation_df = geolocation_df.drop_duplicates(subset=['geolocation_zip_code_prefix'])

In [77]:
geolocation_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19015 entries, 0 to 999846
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   geolocation_zip_code_prefix  19015 non-null  object  
 1   geolocation_lat              19015 non-null  float64 
 2   geolocation_lng              19015 non-null  float64 
 3   geolocation_city             19015 non-null  category
 4   geolocation_state            19015 non-null  category
dtypes: category(2), float64(2), object(1)
memory usage: 971.8+ KB


In [78]:
geolocation_df.geolocation_zip_code_prefix.sample(10)

817370    77308
753129    64207
44726     03183
71010     04018
709968    50870
281685    12830
300395    13336
808437    75905
56138     03442
899618    87955
Name: geolocation_zip_code_prefix, dtype: object

In [38]:
# order_items data sheet
order_items_df = pd.read_excel(xl, sheet_name='order_items_data')
order_items_df.head(2)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93


In [14]:
order_items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   order_id             112650 non-null  object        
 1   order_item_id        112650 non-null  int64         
 2   product_id           112650 non-null  object        
 3   seller_id            112650 non-null  object        
 4   shipping_limit_date  112650 non-null  datetime64[ns]
 5   price                112650 non-null  float64       
 6   freight_value        112650 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 6.0+ MB


- Change datatypes for some columns
- Drop the shipping_limit_date column

In [15]:
convert_dict = {
    'order_item_id': str,
    'product_id': str,
    'seller_id': str,    
}

order_items_df = order_items_df.astype(convert_dict)
order_items_df = order_items_df.drop(columns=['shipping_limit_date'])

In [16]:
order_items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   order_id       112650 non-null  object 
 1   order_item_id  112650 non-null  object 
 2   product_id     112650 non-null  object 
 3   seller_id      112650 non-null  object 
 4   price          112650 non-null  float64
 5   freight_value  112650 non-null  float64
dtypes: float64(2), object(4)
memory usage: 5.2+ MB


In [17]:
# order_payments data sheet
order_payments_df = pd.read_excel(xl, sheet_name='order_payments_data')
order_payments_df.head(2)

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39


In [18]:
order_payments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              103886 non-null  object 
 1   payment_sequential    103886 non-null  int64  
 2   payment_type          103886 non-null  object 
 3   payment_installments  103886 non-null  int64  
 4   payment_value         103886 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB


In [19]:
# Change datatype
convert_dict = {
    'order_id': str,
    'payment_type': 'category', 
}
order_payments_df = order_payments_df.astype(convert_dict)

# drop irrelevant columns
order_payments_df = order_payments_df.drop(columns=['payment_sequential', 'payment_installments'])
order_payments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   order_id       103886 non-null  object  
 1   payment_type   103886 non-null  category
 2   payment_value  103886 non-null  float64 
dtypes: category(1), float64(1), object(1)
memory usage: 1.7+ MB


In [20]:
# order_reviews data sheet
order_reviews_df = pd.read_excel(xl, sheet_name='order_reviews_data')
order_reviews_df.head(2)

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10,2018-03-11 03:05:13


In [21]:
order_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   review_id                99224 non-null  object        
 1   order_id                 99224 non-null  object        
 2   review_score             99224 non-null  int64         
 3   review_comment_title     11567 non-null  object        
 4   review_comment_message   40974 non-null  object        
 5   review_creation_date     99224 non-null  datetime64[ns]
 6   review_answer_timestamp  99224 non-null  datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(4)
memory usage: 5.3+ MB


In [22]:
# change dtype
convert_dict = {
    'review_score': 'category',
}
order_reviews_df = order_reviews_df.astype(convert_dict)

# drop irrelevant columns
order_reviews_df = order_reviews_df.drop(columns=['review_creation_date','review_comment_message',
                                                  'review_comment_title','review_answer_timestamp',
                                                 ])
order_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   review_id     99224 non-null  object  
 1   order_id      99224 non-null  object  
 2   review_score  99224 non-null  category
dtypes: category(1), object(2)
memory usage: 1.6+ MB


In [23]:
# orders data sheet
orders_df = pd.read_excel(xl, sheet_name='orders_data')
orders_df.head(2)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13


In [24]:
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  object        
 1   customer_id                    99441 non-null  object        
 2   order_status                   99441 non-null  object        
 3   order_purchase_timestamp       99441 non-null  datetime64[ns]
 4   order_approved_at              99281 non-null  datetime64[ns]
 5   order_delivered_carrier_date   97658 non-null  datetime64[ns]
 6   order_delivered_customer_date  96476 non-null  datetime64[ns]
 7   order_estimated_delivery_date  99441 non-null  datetime64[ns]
dtypes: datetime64[ns](5), object(3)
memory usage: 6.1+ MB


In [25]:
# change dtype
orders_df.order_status = orders_df.order_status.astype('category')

# drop irrelevant columns
orders_df = orders_df.drop(columns = ['order_purchase_timestamp','order_delivered_carrier_date', 
                                      'order_estimated_delivery_date'
                                     ])
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  object        
 1   customer_id                    99441 non-null  object        
 2   order_status                   99441 non-null  category      
 3   order_approved_at              99281 non-null  datetime64[ns]
 4   order_delivered_customer_date  96476 non-null  datetime64[ns]
dtypes: category(1), datetime64[ns](2), object(2)
memory usage: 3.1+ MB


In [26]:
# products data sheet
products_df = pd.read_excel(xl, sheet_name='products_data')
products_df.head(2)

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0


In [27]:
products_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id                  32951 non-null  object 
 1   product_category_name       32341 non-null  object 
 2   product_name_lenght         32341 non-null  float64
 3   product_description_lenght  32341 non-null  float64
 4   product_photos_qty          32341 non-null  float64
 5   product_weight_g            32949 non-null  float64
 6   product_length_cm           32949 non-null  float64
 7   product_height_cm           32949 non-null  float64
 8   product_width_cm            32949 non-null  float64
dtypes: float64(7), object(2)
memory usage: 2.3+ MB


In [28]:
# drop irrelevant columns
products_df = products_df.drop(columns=['product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm'])
products_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   product_id             32951 non-null  object
 1   product_category_name  32341 non-null  object
dtypes: object(2)
memory usage: 515.0+ KB


In [29]:
# sellers data sheet
sellers_df = pd.read_excel(xl, sheet_name='sellers_data')
sellers_df.sample(10)

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
2582,900ba814c251a692506d7834c1218441,13328,salto,SP
198,1284de4ae8aa26997e748c851557cf0e,85301,laranjeiras do sul,SP
1849,557f22c76691849db52d2abccf0015d0,75650,morrinhos,GO
442,610f72e407cdd7caaa2f8167b0163fd8,1201,sao paulo,SP
2140,42fb44130ac3134cde8e35cf8ec7df25,12995,pinhalzinho,SP
2867,651fe66032c9520ee5587f694908040d,94810,alvorada,RS
2221,8fb791022c1fc8909664f48ab7dc636d,3976,sao paulo,SP
1411,b19f3ca2ea475913750f25a5c37c8d8f,35501,divinopolis,MG
854,515d781150feed28a6ac091bb0e3cb8c,79090,campo grande,MS
2581,e116c7455dd26a4d8c3e92532583905f,1140,sao paulo,SP


In [40]:
sellers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   seller_id               3095 non-null   object  
 1   seller_zip_code_prefix  3095 non-null   object  
 2   seller_city             3095 non-null   category
 3   seller_state            3095 non-null   category
dtypes: category(2), object(2)
memory usage: 79.2+ KB


In [31]:
# change data types of columns
convert_dict = {
    'seller_zip_code_prefix': str,
    'seller_id': str,
    'seller_city': 'category',
    'seller_state': 'category',
}
sellers_df = sellers_df.astype(convert_dict)

# Let's standardize seller_zip_code_prefix digits to 5 for the column
sellers_df.seller_zip_code_prefix = sellers_df.seller_zip_code_prefix.str.zfill(5)

sellers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   seller_id               3095 non-null   object  
 1   seller_zip_code_prefix  3095 non-null   object  
 2   seller_city             3095 non-null   category
 3   seller_state            3095 non-null   category
dtypes: category(2), object(2)
memory usage: 79.2+ KB


In [39]:
# product_categories data sheet
product_categories_df = pd.read_excel(xl, sheet_name='product_categories_data')
product_categories_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71 entries, 0 to 70
Data columns (total 2 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   product_category_name          71 non-null     object
 1   product_category_name_english  71 non-null     object
dtypes: object(2)
memory usage: 1.2+ KB


## Using the schema below, let's merge the tables.

>1.	An order might have multiple items.
2.	Each item might be fulfilled by a distinct seller.
3.	All text identifying stores and partners were replaced by the names of Game of Thrones great houses.


<img src='schema.png' alt='Table schema' width='750px'>

In [121]:
#  orders_df + order_reviews = orders
orders = pd.merge(orders_df, order_reviews_df, on='order_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99992 entries, 0 to 99991
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99992 non-null  object        
 1   customer_id                    99992 non-null  object        
 2   order_status                   99992 non-null  category      
 3   order_approved_at              99831 non-null  datetime64[ns]
 4   order_delivered_customer_date  97005 non-null  datetime64[ns]
 5   review_id                      99224 non-null  object        
 6   review_score                   99224 non-null  category      
dtypes: category(2), datetime64[ns](2), object(3)
memory usage: 4.8+ MB


In [122]:
orders.duplicated().any()

False

In [123]:
# orders + order_payments_df = orders
orders = pd.merge(orders, order_payments_df, on='order_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 104478 entries, 0 to 104477
Data columns (total 9 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       104478 non-null  object        
 1   customer_id                    104478 non-null  object        
 2   order_status                   104478 non-null  category      
 3   order_approved_at              104302 non-null  datetime64[ns]
 4   order_delivered_customer_date  101324 non-null  datetime64[ns]
 5   review_id                      103678 non-null  object        
 6   review_score                   103678 non-null  category      
 7   payment_type                   104477 non-null  category      
 8   payment_value                  104477 non-null  float64       
dtypes: category(3), datetime64[ns](2), float64(1), object(3)
memory usage: 5.9+ MB


In [130]:
order_payments_df.query("order_id == '8ca5bdac5ebe8f2d6fc9171d5ebc906a'")

Unnamed: 0,order_id,payment_type,payment_value
752,8ca5bdac5ebe8f2d6fc9171d5ebc906a,voucher,15.0
3675,8ca5bdac5ebe8f2d6fc9171d5ebc906a,voucher,15.0
8191,8ca5bdac5ebe8f2d6fc9171d5ebc906a,voucher,15.0
15778,8ca5bdac5ebe8f2d6fc9171d5ebc906a,voucher,15.0
37465,8ca5bdac5ebe8f2d6fc9171d5ebc906a,credit_card,59.08
49967,8ca5bdac5ebe8f2d6fc9171d5ebc906a,voucher,15.0
69644,8ca5bdac5ebe8f2d6fc9171d5ebc906a,voucher,15.0
78419,8ca5bdac5ebe8f2d6fc9171d5ebc906a,voucher,15.0
83047,8ca5bdac5ebe8f2d6fc9171d5ebc906a,voucher,25.0


In [106]:
# orders + order_items_df = orders
orders = pd.merge(orders, order_items_df, on='order_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119143 entries, 0 to 119142
Data columns (total 15 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       119143 non-null  object        
 1   customer_id                    119143 non-null  object        
 2   order_status                   119143 non-null  category      
 3   order_approved_at              118966 non-null  datetime64[ns]
 4   order_delivered_customer_date  115722 non-null  datetime64[ns]
 5   review_id                      118146 non-null  object        
 6   review_score                   118146 non-null  category      
 7   payment_type                   119140 non-null  category      
 8   payment_value                  119140 non-null  float64       
 9   order_item_id                  118310 non-null  float64       
 10  product_id                     118310 non-null  object        
 11  

In [107]:
# orders + products = orders
orders = pd.merge(orders, products_df, on='product_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119143 entries, 0 to 119142
Data columns (total 16 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       119143 non-null  object        
 1   customer_id                    119143 non-null  object        
 2   order_status                   119143 non-null  category      
 3   order_approved_at              118966 non-null  datetime64[ns]
 4   order_delivered_customer_date  115722 non-null  datetime64[ns]
 5   review_id                      118146 non-null  object        
 6   review_score                   118146 non-null  category      
 7   payment_type                   119140 non-null  category      
 8   payment_value                  119140 non-null  float64       
 9   order_item_id                  118310 non-null  float64       
 10  product_id                     118310 non-null  object        
 11  

In [108]:
# orders + sellers_df = orders
orders = pd.merge(orders, sellers_df, on='seller_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119143 entries, 0 to 119142
Data columns (total 19 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       119143 non-null  object        
 1   customer_id                    119143 non-null  object        
 2   order_status                   119143 non-null  category      
 3   order_approved_at              118966 non-null  datetime64[ns]
 4   order_delivered_customer_date  115722 non-null  datetime64[ns]
 5   review_id                      118146 non-null  object        
 6   review_score                   118146 non-null  category      
 7   payment_type                   119140 non-null  category      
 8   payment_value                  119140 non-null  float64       
 9   order_item_id                  118310 non-null  float64       
 10  product_id                     118310 non-null  object        
 11  

In [110]:
# orders + geolocations_df = orders
orders = pd.merge(orders, geolocation_df, left_on = 'seller_zip_code_prefix', right_on = 'geolocation_zip_code_prefix', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119143 entries, 0 to 119142
Data columns (total 24 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       119143 non-null  object        
 1   customer_id                    119143 non-null  object        
 2   order_status                   119143 non-null  category      
 3   order_approved_at              118966 non-null  datetime64[ns]
 4   order_delivered_customer_date  115722 non-null  datetime64[ns]
 5   review_id                      118146 non-null  object        
 6   review_score                   118146 non-null  category      
 7   payment_type                   119140 non-null  category      
 8   payment_value                  119140 non-null  float64       
 9   order_item_id                  118310 non-null  float64       
 10  product_id                     118310 non-null  object        
 11  

In [114]:
# # orders + customers_df = orders
orders_copy = pd.merge(orders, customers_df, on='customer_id', how='left')
orders_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119143 entries, 0 to 119142
Data columns (total 28 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       119143 non-null  object        
 1   customer_id                    119143 non-null  object        
 2   order_status                   119143 non-null  category      
 3   order_approved_at              118966 non-null  datetime64[ns]
 4   order_delivered_customer_date  115722 non-null  datetime64[ns]
 5   review_id                      118146 non-null  object        
 6   review_score                   118146 non-null  category      
 7   payment_type                   119140 non-null  category      
 8   payment_value                  119140 non-null  float64       
 9   order_item_id                  118310 non-null  float64       
 10  product_id                     118310 non-null  object        
 11  

In [120]:
orders_copy[orders_copy.duplicated()]

Unnamed: 0,order_id,customer_id,order_status,order_approved_at,order_delivered_customer_date,review_id,review_score,payment_type,payment_value,order_item_id,...,seller_state,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
399,d7f4e2f755cf2f40aa7c39804cfe01c4,18ad705482141f60ad05945506ef2b9e,delivered,2017-11-26 13:14:18,2017-12-08 17:02:43,0c2afd1405e5f0df16aeebe8a029b89d,5,voucher,30.22,1.0,...,SP,17209,-22.275687,-48.542600,jau,SP,37d2d1ac62901f9a36cff32ca86c9337,38400,uberlandia,MG
1147,92a9ef7f0a0d392c9ad0d5f44e86b4ba,4d8c75952e09254a602a3a059d0ac367,delivered,2018-03-25 16:55:50,2018-03-29 16:19:02,2b3e1b6cc7d379fd0a2296180a0fae73,3,voucher,25.00,1.0,...,SP,03504,-23.538779,-46.536486,sao paulo,SP,c375736396906dff8414217e02c45661,13050,campinas,SP
1148,92a9ef7f0a0d392c9ad0d5f44e86b4ba,4d8c75952e09254a602a3a059d0ac367,delivered,2018-03-25 16:55:50,2018-03-29 16:19:02,2b3e1b6cc7d379fd0a2296180a0fae73,3,voucher,25.00,2.0,...,SP,03504,-23.538779,-46.536486,sao paulo,SP,c375736396906dff8414217e02c45661,13050,campinas,SP
1149,92a9ef7f0a0d392c9ad0d5f44e86b4ba,4d8c75952e09254a602a3a059d0ac367,delivered,2018-03-25 16:55:50,2018-03-29 16:19:02,2b3e1b6cc7d379fd0a2296180a0fae73,3,voucher,25.00,3.0,...,SP,03504,-23.538779,-46.536486,sao paulo,SP,c375736396906dff8414217e02c45661,13050,campinas,SP
1173,06875ab72c5b9bb2eb303a70031bfeb0,686ca7499141a82f95123c370af061b0,delivered,2018-01-11 20:15:47,2018-01-15 21:06:54,737372e855cbef4d342692e9a38f9848,4,voucher,22.68,1.0,...,SP,02134,-23.499318,-46.578124,sao paulo,SP,b14edf7220c5c77f27f11567178c24fb,12228,sao jose dos campos,SP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117906,de06e436fbb3dd6e10bafb5f24ed5530,11ff39f6282e2a89f5c079a47af66743,delivered,2017-09-07 02:00:15,2017-09-12 21:44:35,1d41c685c7ba8c6a348bdafdf4cc76ac,5,voucher,9.17,1.0,...,SP,12215,-23.180938,-45.869363,sao jose dos campos,SP,a3631b903f909b6749a452358116cf86,23058,rio de janeiro,RJ
118084,5b03dc75d71e4a3940c88c4c9cdf6c2c,ab0275d64ab2e7e854269eed99e9412a,delivered,2018-04-25 10:55:15,2018-05-18 18:56:37,f2793561b5c8e902bc22b2b2b34a74ef,4,voucher,20.00,1.0,...,SC,88115,-27.543787,-48.625680,sao jose,SC,caa8b91f5582eca3d032203ad5a11c32,22733,rio de janeiro,RJ
118415,aa61ca4def1d3385bafe461f6ef46faa,bf6b253b57084074028db05fcb09acba,delivered,2017-02-24 20:41:57,2017-03-08 11:38:02,d3e139349ebcc1d7a7eb4b23d00a7285,3,credit_card,53.96,1.0,...,SP,05849,-23.651115,-46.755211,sao paulo,SP,6b46cfed037b73c631cf418c96169857,13036,campinas,SP
118416,aa61ca4def1d3385bafe461f6ef46faa,bf6b253b57084074028db05fcb09acba,delivered,2017-02-24 20:41:57,2017-03-08 11:38:02,d3e139349ebcc1d7a7eb4b23d00a7285,3,credit_card,53.96,2.0,...,SP,05849,-23.651115,-46.755211,sao paulo,SP,6b46cfed037b73c631cf418c96169857,13036,campinas,SP


In [103]:
order_reviews_df.duplicated().any()

False

In [103]:
order_reviews_df.duplicated().any()

False

In [103]:
order_reviews_df.duplicated().any()

False

In [112]:
fk_1 = pd.merge(orders, customers_df, on='customer_id', how='left')
fk_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119143 entries, 0 to 119142
Data columns (total 28 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       119143 non-null  object        
 1   customer_id                    119143 non-null  object        
 2   order_status                   119143 non-null  category      
 3   order_approved_at              118966 non-null  datetime64[ns]
 4   order_delivered_customer_date  115722 non-null  datetime64[ns]
 5   review_id                      118146 non-null  object        
 6   review_score                   118146 non-null  category      
 7   payment_type                   119140 non-null  category      
 8   payment_value                  119140 non-null  float64       
 9   order_item_id                  118310 non-null  float64       
 10  product_id                     118310 non-null  object        
 11  

### Next, merge products and order_items tables

In [97]:
# order_items_df + products_df = order_items

order_items = pd.merge(order_items_df, products_df,  on='product_id', how='left')
order_items.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112650 entries, 0 to 112649
Data columns (total 8 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   order_id               112650 non-null  object        
 1   order_item_id          112650 non-null  int64         
 2   product_id             112650 non-null  object        
 3   seller_id              112650 non-null  object        
 4   shipping_limit_date    112650 non-null  datetime64[ns]
 5   price                  112650 non-null  float64       
 6   freight_value          112650 non-null  float64       
 7   product_category_name  111047 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 7.7+ MB


### Next, merge geolocations table with sellers table

In [64]:
geolocation_df.duplicated(subset=['geolocation_lng', 'geolocation_lat', 'geolocation_city', 'geolocation_state']) #.sum()

1546

In [65]:
geolocation_df.geolocation_zip_code_prefix.duplicated().sum()

719317

In [67]:
geolocation_df.geolocation_zip_code_prefix.value_counts()

38400    779
35500    751
11680    727
11740    678
36400    627
        ... 
90038      1
90002      1
29174      1
72230      1
45824      1
Name: geolocation_zip_code_prefix, Length: 19015, dtype: int64

In [55]:
seller_copy.columns

Index(['seller_id', 'seller_zip_code_prefix', 'seller_city', 'seller_state'], dtype='object')

In [57]:
seller_copy.seller_city.nunique()

611

In [68]:
geo_copy = geolocation_df.copy()
geo_copy.drop(columns=['geolocation_city', 'geolocation_state'], inplace=True)

In [70]:
geo_copy = geo_copy.drop_duplicates(subset=['geolocation_zip_code_prefix'])
geo_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19015 entries, 0 to 999846
Data columns (total 3 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   geolocation_zip_code_prefix  19015 non-null  object 
 1   geolocation_lat              19015 non-null  float64
 2   geolocation_lng              19015 non-null  float64
dtypes: float64(2), object(1)
memory usage: 594.2+ KB


In [71]:
seller_copy = sellers_df.copy()

try_merge = pd.merge(seller_copy, geo_copy, left_on='seller_zip_code_prefix', right_on='geolocation_zip_code_prefix', how='left')
try_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3095 entries, 0 to 3094
Data columns (total 7 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   seller_id                    3095 non-null   object  
 1   seller_zip_code_prefix       3095 non-null   object  
 2   seller_city                  3095 non-null   category
 3   seller_state                 3095 non-null   category
 4   geolocation_zip_code_prefix  3088 non-null   object  
 5   geolocation_lat              3088 non-null   float64 
 6   geolocation_lng              3088 non-null   float64 
dtypes: category(2), float64(2), object(3)
memory usage: 175.8+ KB


In [72]:
try_merge

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP,13023,-22.898536,-47.063125
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP,13844,-22.382941,-46.946641
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ,20031,-22.910641,-43.176510
3,c0f3eea2e14555b6faeea3dd58c1b1c3,04195,sao paulo,SP,04195,-23.657250,-46.610759
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP,12914,-22.971648,-46.533618
...,...,...,...,...,...,...,...
3090,98dddbc4601dd4443ca174359b237166,87111,sarandi,PR,87111,-23.456431,-51.866369
3091,f8201cab383e484733266d1906e2fdfa,88137,palhoca,SC,88137,-27.623801,-48.674286
3092,74871d19219c7d518d0090283e03c137,04650,sao paulo,SP,04650,-23.659845,-46.677882
3093,e603cf3fec55f8697c9059638d6c8eb5,96080,pelotas,RS,96080,-31.744231,-52.328761


In [84]:
# sellers_df + geolocations_df = sellers
sellers = pd.merge(sellers_df, geolocation_df, right_on='geolocation_zip_code_prefix', left_on ='seller_zip_code_prefix', how='left')
sellers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3095 entries, 0 to 3094
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   seller_id                    3095 non-null   object  
 1   seller_zip_code_prefix       3095 non-null   object  
 2   seller_city                  3095 non-null   category
 3   seller_state                 3095 non-null   category
 4   geolocation_zip_code_prefix  3088 non-null   object  
 5   geolocation_lat              3088 non-null   float64 
 6   geolocation_lng              3088 non-null   float64 
 7   geolocation_city             3088 non-null   category
 8   geolocation_state            3088 non-null   category
dtypes: category(4), float64(2), object(3)
memory usage: 506.8+ KB


### Next, we merge the three aggregated tables to form our `sales` table

In [85]:
# Let's standardize zipcode digits to 5 for all columns:
sellers.columns

Index(['seller_id', 'seller_zip_code_prefix', 'seller_city', 'seller_state',
       'geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng',
       'geolocation_city', 'geolocation_state'],
      dtype='object')

In [50]:
order_items.columns

Index(['order_id', 'order_item_id', 'product_id', 'seller_id', 'price',
       'freight_value', 'product_category_name'],
      dtype='object')

In [51]:
orders.columns 

Index(['order_id', 'customer_id', 'order_status', 'order_approved_at',
       'order_delivered_customer_date', 'customer_unique_id',
       'customer_zip_code_prefix', 'customer_city', 'customer_state',
       'review_id', 'review_score', 'payment_type', 'payment_value'],
      dtype='object')

In [94]:
for i in order_items.columns:
    if i in orders.columns:
        print(i)
    else:
        pass

order_id
order_item_id
product_id
seller_id
shipping_limit_date
price
freight_value


In [95]:
orders.freight_value

0          8.72
1          8.72
2          8.72
3         22.76
4         19.22
          ...  
119138    20.10
119139    65.02
119140    40.59
119141    40.59
119142    18.36
Name: freight_value, Length: 119143, dtype: float64

In [96]:
# Here, order_items is the fact table, sellers is the dimensional table
sales_data = pd.merge(orders, order_items, on='order_id', how='left')
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165649 entries, 0 to 165648
Data columns (total 26 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       165649 non-null  object        
 1   customer_id                    165649 non-null  object        
 2   order_status                   165649 non-null  category      
 3   order_approved_at              165470 non-null  datetime64[ns]
 4   order_delivered_customer_date  161330 non-null  datetime64[ns]
 5   customer_unique_id             165649 non-null  object        
 6   customer_zip_code_prefix       165649 non-null  object        
 7   customer_city                  165649 non-null  category      
 8   customer_state                 165649 non-null  category      
 9   review_id                      163704 non-null  object        
 10  review_score                   163704 non-null  category      
 11  

In [89]:
pd.Series(list(orders.columns) + list(order_items.columns))

0                          order_id
1                       customer_id
2                      order_status
3                 order_approved_at
4     order_delivered_customer_date
5                customer_unique_id
6          customer_zip_code_prefix
7                     customer_city
8                    customer_state
9                         review_id
10                     review_score
11                     payment_type
12                    payment_value
13                    order_item_id
14                       product_id
15                        seller_id
16              shipping_limit_date
17                            price
18                    freight_value
19                         order_id
20                    order_item_id
21                       product_id
22                        seller_id
23              shipping_limit_date
24                            price
25                    freight_value
26            product_category_name
dtype: object

# Copy orders df b4 merging with geo

In [37]:
orders_copy = orders.copy()
geolocation_df_copy = geolocation_df.copy()

In [38]:
# orders_copy + geolocation_df = orders_copy
geolocation_df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                  Non-Null Count    Dtype   
---  ------                  --------------    -----   
 0   geolocation_lat         1000163 non-null  float64 
 1   geolocation_lng         1000163 non-null  float64 
 2   geolocation_city        1000163 non-null  category
 3   geolocation_state       1000163 non-null  category
 4   seller_zip_code_prefix  1000163 non-null  object  
dtypes: category(2), float64(2), object(1)
memory usage: 26.1+ MB


In [39]:
# Drop irrelevant columns
geolocation_df_copy = geolocation_df_copy.drop(columns=['geolocation_city', 'geolocation_state'])

In [40]:
# Drop duplicates
geolocation_df_copy = geolocation_df_copy.drop_duplicates()

In [41]:
geolocation_df_copy.shape

(720154, 3)

In [42]:
geolocation_df_copy.seller_zip_code_prefix.sample(15)

928985    89239
482126    25065
947056    91170
855558    82220
416323    20730
537481    29400
780350    70233
389316    18400
524082    28943
29942      2405
671763    40342
949328    91520
253233    11420
506211    27320
655831    38800
Name: seller_zip_code_prefix, dtype: object

In [43]:
# Sometimes, the zipcode lenght is 4 digits, other times, 5 digits
# Let's fix this before merging with the zipcode in the orders table
geolocation_df_copy.seller_zip_code_prefix = geolocation_df_copy.seller_zip_code_prefix.str.zfill(5)

In [44]:
geolocation_df_copy.seller_zip_code_prefix.sample(15)

400056    19470
674914    41230
350929    15220
63380     03693
913145    88331
95211     04531
156115    06320
931411    89580
110274    04853
233897    09691
410381    20521
104359    04743
338619    14680
376980    17930
606280    35930
Name: seller_zip_code_prefix, dtype: object

In [45]:
orders_copy.seller_zip_code_prefix.sample(25)

79703     11010
23251     14940
29771      9911
51729     14850
70206     14940
101107    14407
50728     13324
84136      3554
4256      37590
11831     11701
92506     13405
35322     15014
26366     37175
116399    72015
21160     38442
30900      7094
67401      8250
96098     80230
44835     14085
84654      2443
92078     14940
80469     14850
13371     14940
113117     9020
70159     51250
Name: seller_zip_code_prefix, dtype: object

In [46]:
# Also, apply zfill to the seller_zpi_code_prefix of the orders table to rhyme with that of the geolocation table
orders_copy.seller_zip_code_prefix = orders_copy.seller_zip_code_prefix.str.zfill(5)
orders_copy.seller_zip_code_prefix.sample(25)

96977     87050
22425     06060
26006     31570
15686     03504
18654     81650
44739     02258
38853     09015
764       01547
74480     83020
6477      14940
83540     37708
64425     38440
83688     13456
93552     12328
91108     02310
86186     12327
6898      05849
82625     37443
43027     01222
53903     15085
111335    06060
75521     03426
56480     14940
23222     03426
108380    14940
Name: seller_zip_code_prefix, dtype: object

In [47]:
# Demo dfs
try_merge = pd.merge(orders_copy, geolocation_df_copy, on='seller_zip_code_prefix', how='left')

In [48]:
try_merge.shape

(12610995, 24)

In [52]:
sellers_copy = sellers_df.copy()

In [53]:
sellers_copy.seller_zip_code_prefix.sample(15)

1751    80030
782     35500
1547    14140
1755    39801
1374    91350
36       7241
906     14020
291     88056
622     29142
2207     3426
1088    11701
1848    87504
1217    13275
662     26600
333     97502
Name: seller_zip_code_prefix, dtype: object

In [56]:
sellers_copy.seller_zip_code_prefix = sellers_copy.seller_zip_code_prefix.str.zfill(5)
sellers_copy.seller_zip_code_prefix.sample(15)

408     85807
951     13082
2231    88813
2687    35160
1049    88316
2935    06766
2891    03908
1785    04773
363     14406
439     03029
1356    15603
2538    29315
1722    04445
2892    70856
133     23550
Name: seller_zip_code_prefix, dtype: object

In [57]:
# Tried merging geolocations table with other tables it shares common keys with 
# to obtain table with smallest rows

try_merge_2 = pd.merge(sellers_copy, geolocation_df_copy, on='seller_zip_code_prefix', how='left')
try_merge_2.shape

(303892, 6)

In [58]:
sellers_copy.duplicated().any()

False

In [59]:
customers_copy = customers_df.copy()
customers_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   customer_id               99441 non-null  object  
 1   customer_unique_id        99441 non-null  object  
 2   customer_zip_code_prefix  99441 non-null  object  
 3   customer_city             99441 non-null  category
 4   customer_state            99441 non-null  category
dtypes: category(2), object(3)
memory usage: 2.7+ MB


In [60]:
customers_copy.customer_zip_code_prefix.sample(15)

75973    12244
13418     4551
62506    74884
63193    39390
59224     6855
92528    33400
63386     3550
86155    65907
75900    76913
75376    88101
58720     3630
31927    73006
92786     4633
17039     5786
18990    95800
Name: customer_zip_code_prefix, dtype: object

In [62]:
customers_copy.customer_zip_code_prefix = customers_copy.customer_zip_code_prefix.str.zfill(5)
customers_copy.customer_zip_code_prefix.sample(15)

44159    79200
17013    11035
28714    26900
10792    91920
80565    06900
21955    29118
14386    13050
63196    73753
67839    13820
59623    02228
81726    97065
26036    24230
79937    13157
36549    32605
28105    05415
Name: customer_zip_code_prefix, dtype: object

In [63]:
try_merge_3 = pd.merge(customers_copy, geolocation_df_copy, right_on='seller_zip_code_prefix', 
                       left_on='customer_zip_code_prefix', how='left')
try_merge_3.shape

(10071642, 8)

In [89]:
geolocation_df_copy[geolocation_df_copy.duplicated(keep=False)]

Unnamed: 0,geolocation_lat,geolocation_lng,seller_zip_code_prefix
0,-23.545621,-46.639292,01037
1,-23.546081,-46.644820,01046
2,-23.546129,-46.642951,01046
6,-23.546273,-46.641225,01047
7,-23.546923,-46.634264,01013
...,...,...,...
1000153,-28.343273,-51.873734,99970
1000154,-28.070493,-52.011342,99950
1000159,-27.877125,-52.224882,99900
1000160,-28.071855,-52.014716,99950


In [50]:
fk_merge = pd.merge(orders_zipcode, geo_zipcode, how='outer')
fk_merge

Unnamed: 0,seller_zip_code_prefix
0,09350
1,09350
2,09350
3,09350
4,09350
...,...
17825683,99920
17825684,99920
17825685,99920
17825686,99952


### Data Quality Issues
- wrong datatype: review_score, payment_sequential, payment_installments, payment_type, customer_unique_id, geolocation_zip_code_prefix, geolocation_state, geolocation_city,
- Inconsistent customer zipcode prefix format. 4 digits sometimes, 5 digits other times
- non-ascii characters: geolocation city
- Inconsistent spelling formats. e.g 'sÃ£o paulo', 'sao paulo'; getÃºlio vargas, getulio vargas; etc.
- Certain columns are of type float instead of int
- wrong datatype: order_item_id
- Wrong datatype: payment_type,  payment_sequential, payment_installments
- wrong datatype: customer_city, customer_state, customer_zip_code_prefix
- Inconsistent customer zipcode prefix format. 4 digits sometimes, 5 digits other times
- City name in lower case
- 

## Now, let's clean up the combined dataset

In [82]:
orders.shape

(17094341, 43)

In [24]:
customers_df.columns

Index(['customer_id', 'customer_unique_id', 'customer_zip_code_prefix',
       'customer_city', 'customer_state'],
      dtype='object')

In [25]:
customers_df.customer_zip_code_prefix.nunique()

14994

In [26]:
geolocation_df.geolocation_zip_code_prefix.nunique()

19015

In [27]:
seller_clean.zip_code_prefix.astype('str')

NameError: name 'seller_clean' is not defined

In [None]:
# def change_col_type(df, col, dtype):
#     df[col] = df[col].astype(str(dtype))

def normalize_char_lenght(df, col, to_lenght):
    '''Method to set the objects in a series to a fixed number of characters'''
    
    #get the original datatype of the column
    dtype = df[col].dtype
    
    #convert to string to apply zfill function
    df[col] = df[col].astype('str')
    
    error_log = []
    for i in df[col]:
        if len(i) == to_lenght:
            pass
        elif len(i) > to_lenght:
            error_log.append('{} is more than desired number of characters. modify manually'.format(i))
        else:
            i = str(i).zfill(to_lenght)
            
        #convert series back to original datatype
        df[col] = df[col].astype(dtype)
        return df[col]
        

In [None]:
normalize_char_lenght(seller_clean, 'zip_code_prefix', 5)

In [None]:
seller_clean.zip_code_prefix

In [None]:
xyz = '354'
# print(xyz.zfill(5))

if len(xyz) == 5:
        pass
elif len(xyz) > 5:
        print('xyz is more than desired digits')
else:
    xyz = xyz.zfill(5)
    print(xyz)

In [None]:
# Merge geolocation and seller dataframes
# First, rename the zipcode prefix in the two tables to match
seller_clean = sellers_df.copy()
geolocation_clean = geolocation_df.copy()
seller_clean.rename(columns={'seller_zip_code_prefix': 'zip_code_prefix'}, inplace=True)
geolocation_clean.rename(columns={'geolocation_zip_code_prefix': 'zip_code_prefix'}, inplace=True)

# Ensure that all zip_code prefixes are of equal character lenght:


merged = pd.merge(seller_clean, geolocation_clean, on='zip_code_prefix', how='right')
merged

In [None]:
sellers_df.shape

In [53]:
sellers_df_copy = sellers_df.copy()

In [59]:
# Merge with geo data first
with_geo = pd.merge(sellers_df_copy, geolocation_df_copy, how='left')

In [60]:
with_geo

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP,-22.898536,-47.063125,campinas,SP
1,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP,-22.895499,-47.061944,campinas,SP
2,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP,-22.891740,-47.060820,campinas,SP
3,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP,-22.895762,-47.066144,campinas,SP
4,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP,-22.896154,-47.062431,campinas,SP
...,...,...,...,...,...,...,...,...
339972,9e25199f6ef7e7c347120ff175652c3b,12051,taubate,SP,-23.011130,-45.592347,taubate,SP
339973,9e25199f6ef7e7c347120ff175652c3b,12051,taubate,SP,-23.013452,-45.584299,taubatÃ©,SP
339974,9e25199f6ef7e7c347120ff175652c3b,12051,taubate,SP,-23.009155,-45.592019,taubate,SP
339975,9e25199f6ef7e7c347120ff175652c3b,12051,taubate,SP,-23.009019,-45.584925,taubate,SP


In [62]:
customers_copy = customers_df.copy()
customers_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   customer_id               99441 non-null  object  
 1   customer_unique_id        99441 non-null  object  
 2   customer_zip_code_prefix  99441 non-null  object  
 3   customer_city             99441 non-null  category
 4   customer_state            99441 non-null  category
dtypes: category(2), object(3)
memory usage: 2.7+ MB


In [70]:
customers_copy.customer_zip_code_prefix = customers_copy.customer_zip_code_prefix.str.zfill(5)
customers_copy.customer_zip_code_prefix.sample(15)

45098    17290
24998    86870
44125    86350
51123    29560
47106    28621
75530    59090
81179    36720
11218    27150
97414    20511
94633    13468
45590    35702
25275    24320
2079     13840
95249    05448
33135    17800
Name: customer_zip_code_prefix, dtype: object

In [72]:
with_customer = pd.merge(customers_copy, geolocation_df_copy, left_on='customer_zip_code_prefix', 
                         right_on='seller_zip_code_prefix', how='left') #.drop(columns=['customer_zip_code_prefix'])
with_customer

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state,seller_zip_code_prefix
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,-20.509897,-47.397866,franca,SP,14409
1,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,-20.497396,-47.399241,franca,SP,14409
2,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,-20.510459,-47.399553,franca,SP,14409
3,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,-20.480940,-47.394161,franca,SP,14409
4,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,-20.515413,-47.398194,franca,SP,14409
...,...,...,...,...,...,...,...,...,...,...
15083728,274fa6071e5e17fe303b9748641082c8,84732c5050c01db9b23e19ba39899398,06703,cotia,SP,-23.599369,-46.905603,cotia,SP,06703
15083729,274fa6071e5e17fe303b9748641082c8,84732c5050c01db9b23e19ba39899398,06703,cotia,SP,-23.593577,-46.910112,cotia,SP,06703
15083730,274fa6071e5e17fe303b9748641082c8,84732c5050c01db9b23e19ba39899398,06703,cotia,SP,-23.584425,-46.892014,cotia,SP,06703
15083731,274fa6071e5e17fe303b9748641082c8,84732c5050c01db9b23e19ba39899398,06703,cotia,SP,-23.595022,-46.918546,cotia,SP,06703


In [66]:
with_customer.duplicated().sum()

3791133

In [78]:
geolocation_df_copy.duplicated().sum()

261831

In [64]:
test_len = ['abc', 'abcd', 'abcde', 'fgh', 'ijk']
for i in test_len:
    if len(i) > 4:
        print(i)

abcde


In [74]:
count = 0
for i in sellers_df.seller_zip_code_prefix:
    if len(i) > 5:
        count+=1
    else:
        pass
print(str(count))

0
