## Attention

An order might have multiple items.

Each item might be fulfiled by a distinct seller

In [23]:
#Import required libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import plotly.express as px

In [24]:
# The datasets were combined into one Excel file with multiple sheets
#Load workbook
xl = pd.ExcelFile('olist_store_dataset.xlsx', engine='openpyxl')

In [25]:
# list of sheets containing the datasets
xl.sheet_names

['customers_data',
 'geolocation_data',
 'order_items_data',
 'order_payments_data',
 'order_reviews_data',
 'orders_data',
 'products_data',
 'sellers_data',
 'product_categories_data']

### Load the tables from Excel worksheet to a pandas dataframe

In [26]:
# Customers data sheet
customers_df = pd.read_excel(xl, sheet_name='customers_data')
customers_df.head(2)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP


In [27]:
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB


In [32]:
# change column datatypes
customers_df['customer_zip_code_prefix'] = customers_df.customer_zip_code_prefix.astype('str')

# Let's standardize customer_zip_code_prefix digits to 5 for the column
customers_df.customer_zip_code_prefix = customers_df.customer_zip_code_prefix.str.zfill(5)

customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  object
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: object(5)
memory usage: 3.8+ MB


In [30]:
customers_df.customer_zip_code_prefix.sample(5)

97060    14780
83565    04205
9133     30668
88922    58701
73754    15084
Name: customer_zip_code_prefix, dtype: object

In [31]:
# geolocation data sheet
geolocation_df = pd.read_excel(xl, sheet_name='geolocation_data')
geolocation_df

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.644820,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP
...,...,...,...,...,...
1000158,99950,-28.068639,-52.010705,tapejara,RS
1000159,99900,-27.877125,-52.224882,getulio vargas,RS
1000160,99950,-28.071855,-52.014716,tapejara,RS
1000161,99980,-28.388932,-51.846871,david canabarro,RS


In [11]:
geolocation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   geolocation_zip_code_prefix  1000163 non-null  int64  
 1   geolocation_lat              1000163 non-null  float64
 2   geolocation_lng              1000163 non-null  float64
 3   geolocation_city             1000163 non-null  object 
 4   geolocation_state            1000163 non-null  object 
dtypes: float64(2), int64(1), object(2)
memory usage: 38.2+ MB


In [33]:
# Change data types of listed columns

geolocation_df['geolocation_zip_code_prefix'] = geolocation_df.geolocation_zip_code_prefix.astype(str)


# Let's standardize geolocation_zip_code_prefix digits to 5 for the column
geolocation_df.geolocation_zip_code_prefix = geolocation_df.geolocation_zip_code_prefix.str.zfill(5)

In [34]:
geolocation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   geolocation_zip_code_prefix  1000163 non-null  object 
 1   geolocation_lat              1000163 non-null  float64
 2   geolocation_lng              1000163 non-null  float64
 3   geolocation_city             1000163 non-null  object 
 4   geolocation_state            1000163 non-null  object 
dtypes: float64(2), object(3)
memory usage: 38.2+ MB


In [35]:
geolocation_df.geolocation_zip_code_prefix.sample(10)

527993    29090
443348    22261
88183     04304
41676     03062
677496    41770
945940    91010
358479    15996
685803    44590
52018     03227
230503    09560
Name: geolocation_zip_code_prefix, dtype: object

In [36]:
# order_items data sheet
order_items_df = pd.read_excel(xl, sheet_name='order_items_data')
order_items_df.head(2)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93


In [37]:
order_items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   order_id             112650 non-null  object        
 1   order_item_id        112650 non-null  int64         
 2   product_id           112650 non-null  object        
 3   seller_id            112650 non-null  object        
 4   shipping_limit_date  112650 non-null  datetime64[ns]
 5   price                112650 non-null  float64       
 6   freight_value        112650 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 6.0+ MB


- Change datatypes for some columns
- Drop the shipping_limit_date column

In [40]:
order_items_df['order_item_id'] = order_items_df['order_item_id'].astype('str')
order_items_df['product_id'] = order_items_df['product_id'].astype('str')
order_items_df['seller_id'] = order_items_df['seller_id'].astype('str')

order_items_df = order_items_df.drop(columns=['shipping_limit_date'])

In [41]:
order_items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   order_id       112650 non-null  object 
 1   order_item_id  112650 non-null  object 
 2   product_id     112650 non-null  object 
 3   seller_id      112650 non-null  object 
 4   price          112650 non-null  float64
 5   freight_value  112650 non-null  float64
dtypes: float64(2), object(4)
memory usage: 5.2+ MB


In [42]:
# order_payments data sheet
order_payments_df = pd.read_excel(xl, sheet_name='order_payments_data')
order_payments_df.head(2)

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39


In [43]:
order_payments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              103886 non-null  object 
 1   payment_sequential    103886 non-null  int64  
 2   payment_type          103886 non-null  object 
 3   payment_installments  103886 non-null  int64  
 4   payment_value         103886 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB


In [44]:
order_payments_df.duplicated().any()

False

In [45]:
# order_payments_df contains duplicate rows
order_payments_df[order_payments_df.duplicated()]

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value


In [46]:
# Change datatype
order_payments_df['order_id'] = order_payments_df['order_id'].astype('str')

# drop irrelevant columns
order_payments_df = order_payments_df.drop(columns=['payment_sequential', 'payment_installments'])
order_payments_df.info()

# drop duplicates
order_payments_df = order_payments_df.drop_duplicates()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   order_id       103886 non-null  object 
 1   payment_type   103886 non-null  object 
 2   payment_value  103886 non-null  float64
dtypes: float64(1), object(2)
memory usage: 2.4+ MB


In [47]:
order_payments_df.duplicated().any()

False

In [48]:
# order_reviews data sheet
order_reviews_df = pd.read_excel(xl, sheet_name='order_reviews_data')
order_reviews_df.head(2)

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10,2018-03-11 03:05:13


In [49]:
order_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   review_id                99224 non-null  object        
 1   order_id                 99224 non-null  object        
 2   review_score             99224 non-null  int64         
 3   review_comment_title     11567 non-null  object        
 4   review_comment_message   40974 non-null  object        
 5   review_creation_date     99224 non-null  datetime64[ns]
 6   review_answer_timestamp  99224 non-null  datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(4)
memory usage: 5.3+ MB


In [50]:
# drop irrelevant columns
order_reviews_df = order_reviews_df.drop(columns=['review_creation_date','review_comment_message',
                                                  'review_comment_title','review_answer_timestamp',
                                                 ])
order_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   review_id     99224 non-null  object
 1   order_id      99224 non-null  object
 2   review_score  99224 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 2.3+ MB


In [51]:
# orders data sheet
orders_df = pd.read_excel(xl, sheet_name='orders_data')
orders_df.head(2)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13


In [52]:
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  object        
 1   customer_id                    99441 non-null  object        
 2   order_status                   99441 non-null  object        
 3   order_purchase_timestamp       99441 non-null  datetime64[ns]
 4   order_approved_at              99281 non-null  datetime64[ns]
 5   order_delivered_carrier_date   97658 non-null  datetime64[ns]
 6   order_delivered_customer_date  96476 non-null  datetime64[ns]
 7   order_estimated_delivery_date  99441 non-null  datetime64[ns]
dtypes: datetime64[ns](5), object(3)
memory usage: 6.1+ MB


In [53]:
# drop irrelevant columns
orders_df = orders_df.drop(columns = ['order_delivered_carrier_date', 
                                      'order_estimated_delivery_date'
                                     ])
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 6 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  object        
 1   customer_id                    99441 non-null  object        
 2   order_status                   99441 non-null  object        
 3   order_purchase_timestamp       99441 non-null  datetime64[ns]
 4   order_approved_at              99281 non-null  datetime64[ns]
 5   order_delivered_customer_date  96476 non-null  datetime64[ns]
dtypes: datetime64[ns](3), object(3)
memory usage: 4.6+ MB


In [54]:
# products data sheet
products_df = pd.read_excel(xl, sheet_name='products_data')
products_df.head(2)

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0


In [55]:
products_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id                  32951 non-null  object 
 1   product_category_name       32341 non-null  object 
 2   product_name_lenght         32341 non-null  float64
 3   product_description_lenght  32341 non-null  float64
 4   product_photos_qty          32341 non-null  float64
 5   product_weight_g            32949 non-null  float64
 6   product_length_cm           32949 non-null  float64
 7   product_height_cm           32949 non-null  float64
 8   product_width_cm            32949 non-null  float64
dtypes: float64(7), object(2)
memory usage: 2.3+ MB


In [56]:
# drop irrelevant columns
products_df = products_df.drop(columns=['product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm'])
products_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   product_id             32951 non-null  object
 1   product_category_name  32341 non-null  object
dtypes: object(2)
memory usage: 515.0+ KB


In [57]:
# sellers data sheet
sellers_df = pd.read_excel(xl, sheet_name='sellers_data')
sellers_df.sample(10)

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
1675,3442033154be05bac582e173ef54fd12,5413,sao paulo,SP
1700,615c3462099ffa940d37b17dfda19594,24855,rio de janeiro,RJ
2026,3a4127e1ba7208ea37044c5092aae170,8226,sao paulo,SP
1612,fd435faa3c0422b60440ea3480d0e77c,58073,joao pessoa,PB
2741,827f8f69dfa529c561901c4f2e0f332f,81880,curitiba,PR
2384,325f3178fb58e2a9778334621eecdbf9,6790,taboao da serra,SP
1190,edf3fabebcc20f7463cc9c53da932ea8,8320,sao paulo,SP
2285,282f23a9769b2690c5dda22e316f9941,31573,belo horizonte,MG
1064,a154d7316f158bb42e6fa18bbe3afd3a,3804,sao paulo,SP
2063,a353b1083c9863d756d9404695016d64,29150,cariacica,ES


In [58]:
sellers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seller_id               3095 non-null   object
 1   seller_zip_code_prefix  3095 non-null   int64 
 2   seller_city             3095 non-null   object
 3   seller_state            3095 non-null   object
dtypes: int64(1), object(3)
memory usage: 96.8+ KB


In [59]:
# change data types of columns
sellers_df['seller_zip_code_prefix'] = sellers_df['seller_zip_code_prefix'].astype('str')
sellers_df['seller_id'] = sellers_df['seller_id'].astype('str')

# Let's standardize seller_zip_code_prefix digits to 5 for the column
sellers_df.seller_zip_code_prefix = sellers_df.seller_zip_code_prefix.str.zfill(5)

sellers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seller_id               3095 non-null   object
 1   seller_zip_code_prefix  3095 non-null   object
 2   seller_city             3095 non-null   object
 3   seller_state            3095 non-null   object
dtypes: object(4)
memory usage: 96.8+ KB


In [60]:
# product_categories data sheet
product_categories_df = pd.read_excel(xl, sheet_name='product_categories_data')
product_categories_df

Unnamed: 0,product_category_name,product_category_name_english
0,beleza_saude,health_beauty
1,informatica_acessorios,computers_accessories
2,automotivo,auto
3,cama_mesa_banho,bed_bath_table
4,moveis_decoracao,furniture_decor
...,...,...
66,flores,flowers
67,artes_e_artesanato,arts_and_craftmanship
68,fraldas_higiene,diapers_and_hygiene
69,fashion_roupa_infanto_juvenil,fashion_childrens_clothes


## Using the schema below, let's merge the tables.

>1.	An order might have multiple items.
2.	Each item might be fulfilled by a distinct seller.
3.	All text identifying stores and partners were replaced by the names of Game of Thrones great houses.


<img src='schema.png' alt='Table schema' width='750px'>

In [61]:
#  orders_df + order_reviews = orders
orders = pd.merge(orders_df, order_reviews_df, on='order_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99992 entries, 0 to 99991
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99992 non-null  object        
 1   customer_id                    99992 non-null  object        
 2   order_status                   99992 non-null  object        
 3   order_purchase_timestamp       99992 non-null  datetime64[ns]
 4   order_approved_at              99831 non-null  datetime64[ns]
 5   order_delivered_customer_date  97005 non-null  datetime64[ns]
 6   review_id                      99224 non-null  object        
 7   review_score                   99224 non-null  float64       
dtypes: datetime64[ns](3), float64(1), object(4)
memory usage: 6.9+ MB


In [62]:
orders.duplicated().any()

False

In [63]:
# orders + order_payments_df = orders
orders = pd.merge(orders, order_payments_df, on='order_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103857 entries, 0 to 103856
Data columns (total 10 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       103857 non-null  object        
 1   customer_id                    103857 non-null  object        
 2   order_status                   103857 non-null  object        
 3   order_purchase_timestamp       103857 non-null  datetime64[ns]
 4   order_approved_at              103691 non-null  datetime64[ns]
 5   order_delivered_customer_date  100736 non-null  datetime64[ns]
 6   review_id                      103060 non-null  object        
 7   review_score                   103060 non-null  float64       
 8   payment_type                   103856 non-null  object        
 9   payment_value                  103856 non-null  float64       
dtypes: datetime64[ns](3), float64(2), object(5)
memory usage: 8.7+ MB


In [64]:
order_payments_df.query("order_id == '8ca5bdac5ebe8f2d6fc9171d5ebc906a'")

Unnamed: 0,order_id,payment_type,payment_value
752,8ca5bdac5ebe8f2d6fc9171d5ebc906a,voucher,15.0
37465,8ca5bdac5ebe8f2d6fc9171d5ebc906a,credit_card,59.08
83047,8ca5bdac5ebe8f2d6fc9171d5ebc906a,voucher,25.0


In [65]:
# orders + order_items_df = orders
orders = pd.merge(orders, order_items_df, on='order_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118432 entries, 0 to 118431
Data columns (total 15 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       118432 non-null  object        
 1   customer_id                    118432 non-null  object        
 2   order_status                   118432 non-null  object        
 3   order_purchase_timestamp       118432 non-null  datetime64[ns]
 4   order_approved_at              118265 non-null  datetime64[ns]
 5   order_delivered_customer_date  115044 non-null  datetime64[ns]
 6   review_id                      117438 non-null  object        
 7   review_score                   117438 non-null  float64       
 8   payment_type                   118429 non-null  object        
 9   payment_value                  118429 non-null  float64       
 10  order_item_id                  117618 non-null  object        
 11  

In [66]:
# orders + products = orders
orders = pd.merge(orders, products_df, on='product_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118432 entries, 0 to 118431
Data columns (total 16 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       118432 non-null  object        
 1   customer_id                    118432 non-null  object        
 2   order_status                   118432 non-null  object        
 3   order_purchase_timestamp       118432 non-null  datetime64[ns]
 4   order_approved_at              118265 non-null  datetime64[ns]
 5   order_delivered_customer_date  115044 non-null  datetime64[ns]
 6   review_id                      117438 non-null  object        
 7   review_score                   117438 non-null  float64       
 8   payment_type                   118429 non-null  object        
 9   payment_value                  118429 non-null  float64       
 10  order_item_id                  117618 non-null  object        
 11  

In [67]:
# orders + sellers_df = orders
orders = pd.merge(orders, sellers_df, on='seller_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118432 entries, 0 to 118431
Data columns (total 19 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       118432 non-null  object        
 1   customer_id                    118432 non-null  object        
 2   order_status                   118432 non-null  object        
 3   order_purchase_timestamp       118432 non-null  datetime64[ns]
 4   order_approved_at              118265 non-null  datetime64[ns]
 5   order_delivered_customer_date  115044 non-null  datetime64[ns]
 6   review_id                      117438 non-null  object        
 7   review_score                   117438 non-null  float64       
 8   payment_type                   118429 non-null  object        
 9   payment_value                  118429 non-null  float64       
 10  order_item_id                  117618 non-null  object        
 11  

### Fetch Seller Latitude and Longitude values from the geolocation table

In [68]:
orders['seller_lat'] = pd.merge(orders, geolocation_df, 
                                        left_on=['seller_zip_code_prefix', 'seller_state', 'seller_city'], 
                                        right_on=['geolocation_zip_code_prefix', 'geolocation_state', 'geolocation_city'], 
                                         how='left')['geolocation_lat']

In [76]:
orders['seller_lng'] = pd.merge(orders, geolocation_df, 
                                         left_on=['seller_zip_code_prefix', 'seller_state', 'seller_city'], 
                                         right_on=['geolocation_zip_code_prefix', 'geolocation_state', 'geolocation_city'], 
                                         how='left')['geolocation_lng']

In [79]:
orders_part = orders.to_csv('partly_merged.csv', index=False)

In [74]:
# EXPERIMENTAL. PLEASE IGNORE!

# # Convert columns to string, then combine to form a new id column
# geolocation_df['geo_multikey'] = geolocation_df['geolocation_zip_code_prefix'].astype(str).replace(' ', '') + "" + geolocation_df['geolocation_state'].astype(str).replace(' ', '') + "" + geolocation_df['geolocation_city'].astype(str).replace(' ', '')  #+"" #.str.replace('\n', '')
# orders_copy['seller_multikey'] = orders_copy['seller_zip_code_prefix'].astype(str).replace(' ', '') + "" + orders_copy['seller_state'].astype(str).replace(' ', '') + "" + orders_copy['seller_city'].astype(str).replace(' ', '')

# # Ensure that there are no white space in between characters in the resulting string
# geolocation_df['geo_multikey'].replace(' ', '', regex =True, inplace=True)
# orders_copy['seller_multikey'].replace(' ', '', regex =True, inplace=True)

In [None]:
# EXPERIMENTAL. PLEASE IGNORE!

# # Repeat step as above
# orders_copy['seller_lat'] = pd.merge(orders_copy, geolocation_df, left_on='seller_multikey', right_on='geo_multikey', how='left')['geolocation_lat']
# orders_copy['seller_lng'] = pd.merge(orders_copy, geolocation_df, left_on='seller_multikey', right_on='geo_multikey', how='left')['geolocation_lng']

# orders_copy['seller_lat'].replace(' ', '', regex =True, inplace=True)
# orders_copy['seller_lng'].replace(' ', '', regex =True, inplace=True)

In [80]:
orders.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_customer_date,review_id,review_score,payment_type,payment_value,...,product_id,seller_id,price,freight_value,product_category_name,seller_zip_code_prefix,seller_city,seller_state,seller_lat,seller_lng
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-10 21:25:13,a54f0611adc9ed256b57ede6b6eb5114,4.0,credit_card,18.12,...,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,29.99,8.72,utilidades_domesticas,9350,maua,SP,-23.680114,-46.452454
1,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-10 21:25:13,a54f0611adc9ed256b57ede6b6eb5114,4.0,voucher,2.0,...,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,29.99,8.72,utilidades_domesticas,9350,maua,SP,-23.675223,-46.441038
2,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-10 21:25:13,a54f0611adc9ed256b57ede6b6eb5114,4.0,voucher,18.59,...,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,29.99,8.72,utilidades_domesticas,9350,maua,SP,-23.679951,-46.448247
3,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-08-07 15:27:45,8d5266042046a06655c8db133d120ba5,4.0,boleto,141.46,...,595fac2a385ac33a80bd5114aec74eb8,289cdb325fb7e7f891c38608bf9e0962,118.7,22.76,perfumaria,31570,belo horizonte,SP,-23.678662,-46.442578
4,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-17 18:06:29,e73b67b67587f7644d5bd1a52deb1b01,5.0,credit_card,179.12,...,aa4383b373c6aca5d8797843e5594415,4869f7a5dfa277a7dca6462dcf3b52b2,159.9,19.22,automotivo,14840,guariba,SP,-23.671205,-46.441246


### Also, fetch Customer Latitude and Longitude values from the geolocation table

In [81]:
customers_df['customer_lat'] = pd.merge(customers_df, geolocation_df, 
                                         left_on=['customer_zip_code_prefix', 'customer_state', 'customer_city'], right_on=['geolocation_zip_code_prefix', 'geolocation_state', 'geolocation_city'], 
                                         how='left')['geolocation_lat']

In [82]:
customers_df['customer_lng'] = pd.merge(customers_df, geolocation_df, 
                                         left_on=['customer_zip_code_prefix', 'customer_state', 'customer_city'], right_on=['geolocation_zip_code_prefix', 'geolocation_state', 'geolocation_city'], 
                                         how='left')['geolocation_lng']

In [None]:
# EXPERIMENTAL. PLEASE IGNORE!

# customers_df['customer_multikey'] = customers_df['customer_zip_code_prefix'] + str(customers_df['customer_state']) + str(customers_df['customer_city'])
# customers_df['customer_lat'] = pd.merge(orders, geolocation_df, left_on='customer_multikey', right_on = 'geo_multikey')['geolocation_lat']
# customers_df['customer_lng'] = pd.merge(orders, geolocation_df, left_on='customer_multikey', right_on = 'geo_multikey')['geolocation_lng']

In [83]:
customers_df.head()

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,customer_lat,customer_lng
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,-20.509897,-47.397866
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP,-20.497396,-47.399241
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP,-20.510459,-47.399553
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP,-20.48094,-47.394161
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP,-20.515413,-47.398194


### Finally, merge customers_df with orders using customer_id column

In [84]:
# orders + customers_df = sales_df
sales_df = pd.merge(orders, customers_df, on = 'customer_id', how='right')
sales_df.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_customer_date,review_id,review_score,payment_type,payment_value,...,seller_city,seller_state,seller_lat,seller_lng,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,customer_lat,customer_lng
0,00e7ee1b050b8499577073aeb2a297a1,06b8999e2fba1a1fbc88172c00ba8bc7,delivered,2017-05-16 15:05:35,2017-05-16 15:22:12,2017-05-25 10:35:35,88b8b52d46df026a9d1ad2136a59b30b,4.0,credit_card,146.87,...,itaquaquecetuba,SP,-23.652802,-46.768284,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,-20.509897,-47.397866
1,29150127e6685892b6eab3eec79f59c7,18955e83d337fd6b2def6b18a428ac77,delivered,2018-01-12 20:48:24,2018-01-12 20:58:32,2018-01-29 12:41:19,02fc48a9efa3e3d0f1a8ea26507eeec3,5.0,credit_card,335.48,...,itajai,SC,-18.925991,-48.290849,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP,-20.497396,-47.399241
2,b2059ed67ce144a36e2aa97d2c9e9ad2,4e7b3e00288586ebd08712fdd0374a03,delivered,2018-05-19 16:07:45,2018-05-20 16:19:10,2018-06-14 17:58:51,5ad6695d76ee186dc473c42706984d87,5.0,credit_card,157.73,...,itaquaquecetuba,SP,-22.372746,-46.935023,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP,-20.510459,-47.399553
3,951670f92359f4fe4a63112aa7306eba,b2b6027bc5c5109e529d4dc6358b12c3,delivered,2018-03-13 16:06:38,2018-03-13 17:29:19,2018-03-28 16:04:25,059a801bb31f6aab2266e672cab87bc5,5.0,credit_card,173.3,...,itaquaquecetuba,SP,-23.644491,-46.556242,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP,-20.48094,-47.394161
4,6b7d50bd145f6fc7f33cebabd7e49d0f,4f2d8ab171c80ec8364f7c12e35b23ad,delivered,2018-07-29 09:51:30,2018-07-29 10:10:09,2018-08-09 20:55:48,8490879d58d6c5d7773f2739a03f089a,5.0,credit_card,252.25,...,ibitinga,SP,-23.494428,-46.364439,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP,-20.515413,-47.398194


### Copy the sales_df table for further cleaning

In [85]:
sales_df_clean = sales_df.copy()

In [86]:
# Export to csv for visual cleaning
sales_df_clean.to_csv('dirty_sales_data.csv', index=False)

In [87]:
sales_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118432 entries, 0 to 118431
Data columns (total 27 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       118432 non-null  object        
 1   customer_id                    118432 non-null  object        
 2   order_status                   118432 non-null  object        
 3   order_purchase_timestamp       118432 non-null  datetime64[ns]
 4   order_approved_at              118265 non-null  datetime64[ns]
 5   order_delivered_customer_date  115044 non-null  datetime64[ns]
 6   review_id                      117438 non-null  object        
 7   review_score                   117438 non-null  float64       
 8   payment_type                   118429 non-null  object        
 9   payment_value                  118429 non-null  float64       
 10  order_item_id                  117618 non-null  object        
 11  

In [88]:
sales_df_clean.describe()

Unnamed: 0,review_score,payment_value,price,freight_value,seller_lat,seller_lng,customer_lat,customer_lng
count,117438.0,118429.0,117618.0,117618.0,118406.0,118406.0,118430.0,118430.0
mean,4.015753,173.575062,120.628165,20.019567,-22.753694,-47.338053,-22.00148,-45.743073
std,1.400427,268.325853,184.12144,15.809956,2.230947,2.369884,4.218093,3.383411
min,1.0,0.0,0.85,0.0,-30.08092,-67.884719,-32.103825,-63.017772
25%,4.0,61.5,39.9,13.08,-23.624601,-48.829262,-23.559614,-47.391128
50%,5.0,108.73,74.9,16.28,-22.890283,-47.209417,-22.9083,-46.353037
75%,5.0,189.69,134.9,21.18,-21.754732,-46.477467,-20.261674,-43.237025
max,5.0,13664.08,6735.0,409.68,-2.484892,-38.289328,0.08643,-34.8211


### Additional Data Quality Issues
- Product category column not in English
- '_' in item and category names
- City names are in lower case

### Product category column not in English

Replace product category name column with the translated one in the products_category_translation csv file

In [89]:
sales_df_clean.product_category_name.sample(15)

82863                    cama_mesa_banho
29000                       beleza_saude
113466                             bebes
59572                   eletrodomesticos
97766                          papelaria
12184                         cool_stuff
105711                     esporte_lazer
39370              utilidades_domesticas
70920     fashion_underwear_e_moda_praia
53193                         cool_stuff
80928                 relogios_presentes
93031                         cool_stuff
33577             informatica_acessorios
29674                         cool_stuff
65210                         automotivo
Name: product_category_name, dtype: object

In [90]:
for i in sales_df_clean.product_category_name:
    assert (str(i).islower())

In [91]:
# Some products categories were not translated to English, while others are not available
print(sales_df_clean.product_category_name.nunique(dropna=False))
print(product_categories_df.product_category_name.nunique(dropna=False))

74
71


In [92]:
missing_cat = {'product_category_name' : ['portateis_cozinha_e_preparadores_de_alimentos', 'pc_gamer'],
    'product_category_name_english' : ['kitchen_equipment', 'pc_gamer',]}

df = pd.DataFrame(data=missing_cat)

# Concatenate with the product_categories_df dataframe
product_categories_df = pd.concat([product_categories_df, df], ignore_index=True)
product_categories_df

Unnamed: 0,product_category_name,product_category_name_english
0,beleza_saude,health_beauty
1,informatica_acessorios,computers_accessories
2,automotivo,auto
3,cama_mesa_banho,bed_bath_table
4,moveis_decoracao,furniture_decor
...,...,...
69,fashion_roupa_infanto_juvenil,fashion_childrens_clothes
70,seguros_e_servicos,security_and_services
71,portateis_cozinha_e_preparadores_de_alimentos,Kitchen Equipment
72,pc_gamer,PC Gamer


In [93]:
# Next, replace the Portugese category names with English version
# EXPERIMENTAL. UNCOMMENT ONLY IF NECESSARY!

# sales_df_clean = sales_df_clean.replace(sales_df_clean.product_category_name.unique(), product_categories_df.product_category_name_english.unique())
# sales_df_clean.product_category_name.unique()

sales_df_clean = sales_df_clean.replace(sales_df_clean.product_category_name, product_categories_df.product_category_name_english)
sales_df_clean.product_category_name.unique()

array(['health_beauty', 'computers_accessories', 'auto', 'bed_bath_table',
       'furniture_decor', 'sports_leisure', 'perfumery', 'housewares',
       'telephony', 'watches_gifts', 'food_drink', 'baby', 'stationery',
       'tablets_printing_image', 'toys', 'fixed_telephony',
       'garden_tools', 'fashion_bags_accessories', 'small_appliances',
       'consoles_games', 'audio', 'fashion_shoes', 'cool_stuff',
       'luggage_accessories', 'air_conditioning',
       'construction_tools_construction',
       'kitchen_dining_laundry_garden_furniture',
       'costruction_tools_garden', 'fashion_male_clothing', 'pet_shop',
       'office_furniture', 'market_place', 'electronics',
       'home_appliances', 'party_supplies', 'home_confort',
       'costruction_tools_tools', 'agro_industry_and_commerce',
       'furniture_mattress_and_upholstery', 'books_technical',
       'home_construction', 'musical_instruments',
       'furniture_living_room', 'construction_tools_lights',
       'indust

### '_' in item and category names

### City names are in lower case

Replace all underscores in named columns of the Dataframe to have a cleaner data, then, convert to title case

In [95]:
# Replace all underscores with white space
named_columns = ['order_status', 'payment_type', 'product_category_name', 'seller_city', 'customer_city',]
sales_df_clean[named_columns] = sales_df_clean[named_columns].astype(str)
sales_df_clean[named_columns] = sales_df_clean[named_columns].replace('_', ' ', regex=True)

# Next, convert to title case
for col in named_columns:
    sales_df_clean[col] = sales_df_clean[col].str.title()

In [97]:
sales_df_clean.customer_city

0                        Franca
1         Sao Bernardo Do Campo
2                     Sao Paulo
3               Mogi Das Cruzes
4                      Campinas
                  ...          
118427                Sao Paulo
118428          Taboao Da Serra
118429                Fortaleza
118430                   Canoas
118431                    Cotia
Name: customer_city, Length: 118432, dtype: object

In [96]:
sales_df_clean[named_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118432 entries, 0 to 118431
Data columns (total 5 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   order_status           118432 non-null  object
 1   payment_type           118432 non-null  object
 2   product_category_name  118432 non-null  object
 3   seller_city            118432 non-null  object
 4   customer_city          118432 non-null  object
dtypes: object(5)
memory usage: 5.4+ MB


In [63]:
sales_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118432 entries, 0 to 118431
Data columns (total 26 columns):
 #   Column                         Non-Null Count   Dtype   
---  ------                         --------------   -----   
 0   order_id                       118432 non-null  object  
 1   customer_id                    118432 non-null  object  
 2   order_status                   118432 non-null  category
 3   order_approved_at              118432 non-null  object  
 4   order_delivered_customer_date  118432 non-null  object  
 5   review_id                      118432 non-null  object  
 6   review_score                   117438 non-null  category
 7   payment_type                   118432 non-null  category
 8   payment_value                  118432 non-null  object  
 9   order_item_id                  118432 non-null  object  
 10  product_id                     118432 non-null  object  
 11  seller_id                      118432 non-null  object  
 12  price           

In [110]:
# This row contains bad data
# 'sports_leisure' across multiple columns
sales_df_clean[sales_df_clean.order_id == '6e98de3a85c84ead6689189b825d35b5']

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_customer_date,review_id,review_score,payment_type,payment_value,...,seller_city,seller_state,seller_lat,seller_lng,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,customer_lat,customer_lng
52,6e98de3a85c84ead6689189b825d35b5,f34a6e874087ec1f0e3dab9fdf659c5d,Canceled,2018-03-15 10:07:02,2018-03-15 10:29:33,sports_leisure,e76cda681b681b65fffa28dfc0030641,1.0,Credit Card,73.16,...,Sports Leisure,sports_leisure,-23.309687,-45.963977,233896de79986082f1f479f1f85281cb,38300,Ituiutaba,MG,-20.499905,-47.40206


In [112]:
# drop the record for that particular order since it was also cancelled
sales_df_clean = sales_df_clean[~sales_df_clean.order_id.str.contains('6e98de3a85c84ead6689189b825d35b5')]

In [None]:
# EXPERIMENT. IGNORE!

# sales_df_clean['payment_value'] = sales_df_clean['payment_value'].str.replace('sports_leisure', str(0), regex=True)
# sales_df_clean['payment_value'] = sales_df_clean['payment_value'].astype('float64')

In [197]:
# Add date dimension
# This will enable us apply filters using different time-based factors such as year, month, quarter, etc. 
# Please ignore warning in the output

sales_df_clean['day'] = sales_df_clean['order_purchase_timestamp'].dt.day_name()
sales_df_clean['week'] = sales_df_clean['order_purchase_timestamp'].dt.week
sales_df_clean['quarter'] = sales_df_clean['order_purchase_timestamp'].dt.quarter
sales_df_clean['year'] = sales_df_clean['order_purchase_timestamp'].dt.year
sales_df_clean['half_year'] = (sales_df_clean['quarter'] + 1) // 2
sales_df_clean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales_df_clean['day'] = sales_df_clean['order_purchase_timestamp'].dt.day_name()
  sales_df_clean['week'] = sales_df_clean['order_purchase_timestamp'].dt.week
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales_df_clean['week'] = sales_df_clean['order_purchase_timestamp'].dt.week
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ret

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_customer_date,review_id,review_score,payment_type,payment_value,...,customer_zip_code_prefix,customer_city,customer_state,customer_lat,customer_lng,day,week,quarter,year,half_year
0,00e7ee1b050b8499577073aeb2a297a1,06b8999e2fba1a1fbc88172c00ba8bc7,Delivered,2017-05-16 15:05:35,2017-05-16 15:22:12,2017-05-25 10:35:35,88b8b52d46df026a9d1ad2136a59b30b,4.0,Credit Card,146.87,...,14409,Franca,SP,-20.509897,-47.397866,Tuesday,20,2,2017,1
1,29150127e6685892b6eab3eec79f59c7,18955e83d337fd6b2def6b18a428ac77,Delivered,2018-01-12 20:48:24,2018-01-12 20:58:32,2018-01-29 12:41:19,02fc48a9efa3e3d0f1a8ea26507eeec3,5.0,Credit Card,335.48,...,09790,Sao Bernardo Do Campo,SP,-20.497396,-47.399241,Friday,2,1,2018,1
2,b2059ed67ce144a36e2aa97d2c9e9ad2,4e7b3e00288586ebd08712fdd0374a03,Delivered,2018-05-19 16:07:45,2018-05-20 16:19:10,2018-06-14 17:58:51,5ad6695d76ee186dc473c42706984d87,5.0,Credit Card,157.73,...,01151,Sao Paulo,SP,-20.510459,-47.399553,Saturday,20,2,2018,1
3,951670f92359f4fe4a63112aa7306eba,b2b6027bc5c5109e529d4dc6358b12c3,Delivered,2018-03-13 16:06:38,2018-03-13 17:29:19,2018-03-28 16:04:25,059a801bb31f6aab2266e672cab87bc5,5.0,Credit Card,173.3,...,08775,Mogi Das Cruzes,SP,-20.48094,-47.394161,Tuesday,11,1,2018,1
4,6b7d50bd145f6fc7f33cebabd7e49d0f,4f2d8ab171c80ec8364f7c12e35b23ad,Delivered,2018-07-29 09:51:30,2018-07-29 10:10:09,2018-08-09 20:55:48,8490879d58d6c5d7773f2739a03f089a,5.0,Credit Card,252.25,...,13056,Campinas,SP,-20.515413,-47.398194,Sunday,30,3,2018,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118427,6760e20addcf0121e9d58f2f1ff14298,17ddf5dd5d51696bb3d7c6291687be6f,Delivered,2018-04-07 15:48:17,2018-04-07 16:08:45,2018-04-13 20:06:37,36e2cdbaa9f639b57c53b37ac798fee8,4.0,Credit Card,88.78,...,03937,Sao Paulo,SP,-21.721842,-43.35388,Saturday,14,2,2018,1
118428,9ec0c8947d973db4f4e8dcf1fbfa8f1b,e7b71a9017aa05c9a7fd292d714858e8,Delivered,2018-04-04 08:20:22,2018-04-04 08:35:12,2018-04-11 18:54:45,b273b431c3aedb4eed18643309652940,5.0,Credit Card,129.06,...,06764,Taboao Da Serra,SP,-21.718059,-43.350766,Wednesday,14,2,2018,1
118429,fed4434add09a6f332ea398efd656a5c,5e28dfe12db7fb50a4b2f691faecea5e,Delivered,2018-04-08 20:11:50,2018-04-08 20:30:03,2018-05-09 19:03:15,fa4f16891e6b2edd1354668d07f5648b,1.0,Credit Card,56.04,...,60115,Fortaleza,CE,-21.719544,-43.352861,Sunday,14,2,2018,1
118430,e31ec91cea1ecf97797787471f98a8c2,56b18e2166679b8a959d72dd06da27f9,Delivered,2017-11-03 21:08:33,2017-11-03 21:31:20,2017-11-16 19:58:39,0bcdc9e450ea500811a8d39ee993cd47,5.0,Credit Card,711.07,...,92120,Canoas,RS,-21.720405,-43.353035,Friday,44,4,2017,2


In [198]:
# OPTIONAL
# Now, let's export our cleaned data

sales_df_clean.to_csv('cleaned_sales_data.csv', index=False, )

In [66]:
# Distribution of variables
sales_df_clean.order_id.nunique()

99441

Total unique customers till date

In [67]:
sales_df_clean.customer_unique_id.nunique()

96096

In [None]:
sales_df_clean[]

### 1: Sales Volume by location

In [119]:
sales_df_clean.payment_type.unique()

array(['Credit Card', 'Debit Card', 'Voucher', 'Boleto', 'Not Defined',
       'Sports Leisure'], dtype=object)

In [121]:
sales_df_clean.query("payment_type == 'Sports Leisure'")

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_customer_date,review_id,review_score,payment_type,payment_value,...,seller_city,seller_state,seller_lat,seller_lng,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,customer_lat,customer_lng
25202,bfbd0f9bdef84302105ad712db648a6c,86dc2ffce2dfff336de2f386a786e574,Delivered,2016-09-15 12:16:38,2016-09-15 12:16:38,2016-11-09 07:47:38,6916ca4502d6d3bfd39818759d55d536,1.0,Sports Leisure,sports_leisure,...,Curitiba,PR,-22.715892,-47.6691,830d5b7aaa3b6f1e9ad63703bec97d23,14600,Sao Joaquim Da Barra,SP,-21.792986,-46.588419
25203,bfbd0f9bdef84302105ad712db648a6c,86dc2ffce2dfff336de2f386a786e574,Delivered,2016-09-15 12:16:38,2016-09-15 12:16:38,2016-11-09 07:47:38,6916ca4502d6d3bfd39818759d55d536,1.0,Sports Leisure,sports_leisure,...,Curitiba,PR,-22.718858,-47.670388,830d5b7aaa3b6f1e9ad63703bec97d23,14600,Sao Joaquim Da Barra,SP,-21.792986,-46.588419
25204,bfbd0f9bdef84302105ad712db648a6c,86dc2ffce2dfff336de2f386a786e574,Delivered,2016-09-15 12:16:38,2016-09-15 12:16:38,2016-11-09 07:47:38,6916ca4502d6d3bfd39818759d55d536,1.0,Sports Leisure,sports_leisure,...,Curitiba,PR,-22.707719,-47.668962,830d5b7aaa3b6f1e9ad63703bec97d23,14600,Sao Joaquim Da Barra,SP,-21.792986,-46.588419


In [126]:
sales_df_clean.query("order_id == '4637ca194b6387e2d538dc89b124b0ee'")

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_customer_date,review_id,review_score,payment_type,payment_value,...,seller_city,seller_state,seller_lat,seller_lng,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,customer_lat,customer_lng
4474,4637ca194b6387e2d538dc89b124b0ee,a73c1f73f5772cf801434bf984b0b1a7,Canceled,2018-09-03 14:14:25,sports_leisure,sports_leisure,9efef4d70190c1bdb9b12872cc8144fc,3.0,Not Defined,0.0,...,Sports Leisure,sports_leisure,-22.902236,-47.075296,968fac81e2c44fb6c1e3ac2a45e6a102,4685,Sao Paulo,SP,-27.587699,-48.497507


In [128]:
product_categories_df.query("product_category_name_english == 'sports_leisure'")

Unnamed: 0,product_category_name,product_category_name_english
5,esporte_lazer,sports_leisure


In [132]:
mask = np.column_stack([sales_df[col].astype(str).str.contains(r'sports_leisure', na=False) for col in sales_df])
sales_df.loc[mask.any(axis=1)]

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_customer_date,review_id,review_score,payment_type,payment_value,...,seller_city,seller_state,seller_lat,seller_lng,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,customer_lat,customer_lng


In [134]:
mask = np.column_stack([orders[col].astype(str).str.contains(r'esporte_lazer', na=False) for col in orders])
orders.loc[mask.any(axis=1)]

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_customer_date,review_id,review_score,payment_type,payment_value,...,product_id,seller_id,price,freight_value,product_category_name,seller_zip_code_prefix,seller_city,seller_state,seller_lat,seller_lng
49,948097deef559c742e7ce321e5e58919,8644be24d48806bc3a88fd59fb47ceb1,delivered,2017-08-04 17:10:39,2017-08-04 17:25:11,2017-08-12 14:08:40,eba492bfeacda5b918f67f0900030dff,5.0,credit_card,95.67,...,cd935d283d47f1050c505e1c39c48b67,a3a38f4affed601eb87a97788c949667,69.90,25.77,esporte_lazer,89204,joinville,SC,-23.681725,-46.441499
53,d17dc4a904426827ca80f2ccb3a6be56,569cf68214806a39acc0f39344aea67f,delivered,2017-05-14 20:28:25,2017-05-14 20:42:45,2017-05-25 09:14:31,bcab37e37778893d858b3d159849a56d,4.0,credit_card,54.82,...,ba4bfbf74dbe7ab37e263b9326da0523,f8db351d8c4c4c22c6835c19a46f01b0,36.90,17.92,esporte_lazer,13324,salto,SP,-23.687932,-46.448392
55,5820a1100976432c7968a52da59e9364,2b56e94c2f66f2d97cfa63356f69cee8,delivered,2018-07-29 11:24:17,2018-07-29 11:44:19,2018-08-02 22:09:11,7fe86452ea24bb5085343b5e9d684a3e,5.0,credit_card,52.24,...,1deda1acffb44ed38494667d7e49a9f3,f52c2422904463fdd7741f99045fecb6,33.90,18.34,esporte_lazer,09230,santo andre/sao paulo,SP,-23.677941,-46.439934
59,9faeb9b2746b9d7526aef5acb08e2aa0,79183cd650e2bb0d475b0067d45946ac,delivered,2018-07-26 14:39:59,2018-07-26 14:55:10,2018-07-31 22:26:55,25b502beeef8b3aff87449ccdc001f53,1.0,credit_card,151.04,...,f48eb5c2fde13ca63664f0bb05f55346,f7ba60f8c3f99e7ee4042fdef03b70c4,60.00,15.52,esporte_lazer,09628,sao bernardo do campo,SP,-23.680258,-46.449900
60,9faeb9b2746b9d7526aef5acb08e2aa0,79183cd650e2bb0d475b0067d45946ac,delivered,2018-07-26 14:39:59,2018-07-26 14:55:10,2018-07-31 22:26:55,25b502beeef8b3aff87449ccdc001f53,1.0,credit_card,151.04,...,f48eb5c2fde13ca63664f0bb05f55346,f7ba60f8c3f99e7ee4042fdef03b70c4,60.00,15.52,esporte_lazer,09628,sao bernardo do campo,SP,-23.680838,-46.449040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118395,71c4b7b8844a031b7db8ca41c41d4aae,dce0014426964924f9564758e4fadcbf,delivered,2017-09-03 20:52:34,2017-09-05 04:10:34,2017-09-14 15:49:47,7bb4ee5964c58afcb9dd6d43591c421a,5.0,boleto,49.59,...,bd5556852920e0d5ebb6044894391ca6,e9779976487b77c6d4ac45f75ec7afe9,34.49,15.10,esporte_lazer,11701,praia grande,SP,-21.758716,-48.831782
118399,e8fd20068b9f7e6ec07068bb7537f781,609b9fb8cad4fe0c7b376f77c8ab76ad,delivered,2017-08-10 21:21:07,2017-08-10 21:35:26,2017-08-23 15:36:29,f5453b3d33358ed459eb4312392b96a8,4.0,credit_card,748.24,...,0df37da38a30a713453b03053d60d3f7,218d46b86c1881d022bce9c68a7d4b15,356.00,18.12,esporte_lazer,14070,ribeirao preto,SP,-21.760595,-48.847152
118400,e8fd20068b9f7e6ec07068bb7537f781,609b9fb8cad4fe0c7b376f77c8ab76ad,delivered,2017-08-10 21:21:07,2017-08-10 21:35:26,2017-08-23 15:36:29,f5453b3d33358ed459eb4312392b96a8,4.0,credit_card,748.24,...,0df37da38a30a713453b03053d60d3f7,218d46b86c1881d022bce9c68a7d4b15,356.00,18.12,esporte_lazer,14070,ribeirao preto,SP,-21.744743,-48.817306
118403,e6cc57f923c4dab2222b8c9aa8742eea,2f4a42f9bb4b9a8cd402fa549df5c7fd,delivered,2018-02-07 12:09:45,2018-02-07 12:31:13,2018-03-05 18:56:39,c03690b720704c98edf718aedc425a68,5.0,credit_card,128.54,...,7b35ccd93a2184646c03b70326626923,4d6d651bd7684af3fffabd5f08d12e5a,113.00,15.54,esporte_lazer,17209,jau,SP,-21.757985,-48.826776


In [220]:
sales_df_clean[sales_df_clean.payment_value.str.contains('[a-zA-Z]', na =True)]

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_customer_date,review_id,review_score,payment_type,payment_value,...,customer_zip_code_prefix,customer_city,customer_state,customer_lat,customer_lng,day,week,quarter,year,half_year
0,00e7ee1b050b8499577073aeb2a297a1,06b8999e2fba1a1fbc88172c00ba8bc7,Delivered,2017-05-16 15:05:35,2017-05-16 15:22:12,2017-05-25 10:35:35,88b8b52d46df026a9d1ad2136a59b30b,4.0,Credit Card,146.87,...,14409,Franca,SP,-20.509897,-47.397866,Tuesday,20,2,2017,1
1,29150127e6685892b6eab3eec79f59c7,18955e83d337fd6b2def6b18a428ac77,Delivered,2018-01-12 20:48:24,2018-01-12 20:58:32,2018-01-29 12:41:19,02fc48a9efa3e3d0f1a8ea26507eeec3,5.0,Credit Card,335.48,...,09790,Sao Bernardo Do Campo,SP,-20.497396,-47.399241,Friday,2,1,2018,1
2,b2059ed67ce144a36e2aa97d2c9e9ad2,4e7b3e00288586ebd08712fdd0374a03,Delivered,2018-05-19 16:07:45,2018-05-20 16:19:10,2018-06-14 17:58:51,5ad6695d76ee186dc473c42706984d87,5.0,Credit Card,157.73,...,01151,Sao Paulo,SP,-20.510459,-47.399553,Saturday,20,2,2018,1
3,951670f92359f4fe4a63112aa7306eba,b2b6027bc5c5109e529d4dc6358b12c3,Delivered,2018-03-13 16:06:38,2018-03-13 17:29:19,2018-03-28 16:04:25,059a801bb31f6aab2266e672cab87bc5,5.0,Credit Card,173.3,...,08775,Mogi Das Cruzes,SP,-20.48094,-47.394161,Tuesday,11,1,2018,1
4,6b7d50bd145f6fc7f33cebabd7e49d0f,4f2d8ab171c80ec8364f7c12e35b23ad,Delivered,2018-07-29 09:51:30,2018-07-29 10:10:09,2018-08-09 20:55:48,8490879d58d6c5d7773f2739a03f089a,5.0,Credit Card,252.25,...,13056,Campinas,SP,-20.515413,-47.398194,Sunday,30,3,2018,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118427,6760e20addcf0121e9d58f2f1ff14298,17ddf5dd5d51696bb3d7c6291687be6f,Delivered,2018-04-07 15:48:17,2018-04-07 16:08:45,2018-04-13 20:06:37,36e2cdbaa9f639b57c53b37ac798fee8,4.0,Credit Card,88.78,...,03937,Sao Paulo,SP,-21.721842,-43.35388,Saturday,14,2,2018,1
118428,9ec0c8947d973db4f4e8dcf1fbfa8f1b,e7b71a9017aa05c9a7fd292d714858e8,Delivered,2018-04-04 08:20:22,2018-04-04 08:35:12,2018-04-11 18:54:45,b273b431c3aedb4eed18643309652940,5.0,Credit Card,129.06,...,06764,Taboao Da Serra,SP,-21.718059,-43.350766,Wednesday,14,2,2018,1
118429,fed4434add09a6f332ea398efd656a5c,5e28dfe12db7fb50a4b2f691faecea5e,Delivered,2018-04-08 20:11:50,2018-04-08 20:30:03,2018-05-09 19:03:15,fa4f16891e6b2edd1354668d07f5648b,1.0,Credit Card,56.04,...,60115,Fortaleza,CE,-21.719544,-43.352861,Sunday,14,2,2018,1
118430,e31ec91cea1ecf97797787471f98a8c2,56b18e2166679b8a959d72dd06da27f9,Delivered,2017-11-03 21:08:33,2017-11-03 21:31:20,2017-11-16 19:58:39,0bcdc9e450ea500811a8d39ee993cd47,5.0,Credit Card,711.07,...,92120,Canoas,RS,-21.720405,-43.353035,Friday,44,4,2017,2


In [255]:
sales_df3 = sales_df_clean.copy()
# sales_df3['payment_value'] = sales_df3['payment_value'].str.replace('sports_leisure', str(0))
# sales_df3['payment_value'] = sales_df3['payment_value'].astype('float64')

In [256]:
sales_df3.payment_value.info()

<class 'pandas.core.series.Series'>
Int64Index: 118431 entries, 0 to 118431
Series name: payment_value
Non-Null Count   Dtype 
--------------   ----- 
118431 non-null  object
dtypes: object(1)
memory usage: 1.8+ MB


In [257]:
sales_df3[sales_df3.payment_value == 'sports_leisure']

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_customer_date,review_id,review_score,payment_type,payment_value,...,customer_zip_code_prefix,customer_city,customer_state,customer_lat,customer_lng,day,week,quarter,year,half_year
25202,bfbd0f9bdef84302105ad712db648a6c,86dc2ffce2dfff336de2f386a786e574,Delivered,2016-09-15 12:16:38,2016-09-15 12:16:38,2016-11-09 07:47:38,6916ca4502d6d3bfd39818759d55d536,1.0,Sports Leisure,sports_leisure,...,14600,Sao Joaquim Da Barra,SP,-21.792986,-46.588419,Thursday,37,3,2016,2
25203,bfbd0f9bdef84302105ad712db648a6c,86dc2ffce2dfff336de2f386a786e574,Delivered,2016-09-15 12:16:38,2016-09-15 12:16:38,2016-11-09 07:47:38,6916ca4502d6d3bfd39818759d55d536,1.0,Sports Leisure,sports_leisure,...,14600,Sao Joaquim Da Barra,SP,-21.792986,-46.588419,Thursday,37,3,2016,2
25204,bfbd0f9bdef84302105ad712db648a6c,86dc2ffce2dfff336de2f386a786e574,Delivered,2016-09-15 12:16:38,2016-09-15 12:16:38,2016-11-09 07:47:38,6916ca4502d6d3bfd39818759d55d536,1.0,Sports Leisure,sports_leisure,...,14600,Sao Joaquim Da Barra,SP,-21.792986,-46.588419,Thursday,37,3,2016,2


In [246]:
sales_df3.payment_value.value_counts(dropna=False)

NaN    118428
0.0         3
Name: payment_value, dtype: int64

In [2]:
retouch = pd.read_csv('cleaned_sales_data.csv', encoding='utf-8')

  retouch = pd.read_csv('cleaned_sales_data.csv', encoding='utf-8')


In [3]:
retouch.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_customer_date,review_id,review_score,payment_type,payment_value,...,customer_zip_code_prefix,customer_city,customer_state,customer_lat,customer_lng,day,week,quarter,year,half_year
0,00e7ee1b050b8499577073aeb2a297a1,06b8999e2fba1a1fbc88172c00ba8bc7,Delivered,2017-05-16 15:05:35,2017-05-16 15:22:12,2017-05-25 10:35:35,88b8b52d46df026a9d1ad2136a59b30b,4.0,Credit Card,146.87,...,14409,Franca,SP,-20.509897,-47.397866,Tuesday,20,2,2017,1
1,29150127e6685892b6eab3eec79f59c7,18955e83d337fd6b2def6b18a428ac77,Delivered,2018-01-12 20:48:24,2018-01-12 20:58:32,2018-01-29 12:41:19,02fc48a9efa3e3d0f1a8ea26507eeec3,5.0,Credit Card,335.48,...,9790,Sao Bernardo Do Campo,SP,-20.497396,-47.399241,Friday,2,1,2018,1
2,b2059ed67ce144a36e2aa97d2c9e9ad2,4e7b3e00288586ebd08712fdd0374a03,Delivered,2018-05-19 16:07:45,2018-05-20 16:19:10,2018-06-14 17:58:51,5ad6695d76ee186dc473c42706984d87,5.0,Credit Card,157.73,...,1151,Sao Paulo,SP,-20.510459,-47.399553,Saturday,20,2,2018,1
3,951670f92359f4fe4a63112aa7306eba,b2b6027bc5c5109e529d4dc6358b12c3,Delivered,2018-03-13 16:06:38,2018-03-13 17:29:19,2018-03-28 16:04:25,059a801bb31f6aab2266e672cab87bc5,5.0,Credit Card,173.3,...,8775,Mogi Das Cruzes,SP,-20.48094,-47.394161,Tuesday,11,1,2018,1
4,6b7d50bd145f6fc7f33cebabd7e49d0f,4f2d8ab171c80ec8364f7c12e35b23ad,Delivered,2018-07-29 09:51:30,2018-07-29 10:10:09,2018-08-09 20:55:48,8490879d58d6c5d7773f2739a03f089a,5.0,Credit Card,252.25,...,13056,Campinas,SP,-20.515413,-47.398194,Sunday,30,3,2018,2


In [5]:
retouch.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118431 entries, 0 to 118430
Data columns (total 32 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   order_id                       118431 non-null  object
 1   customer_id                    118431 non-null  object
 2   order_status                   118431 non-null  object
 3   order_purchase_timestamp       118431 non-null  object
 4   order_approved_at              118431 non-null  object
 5   order_delivered_customer_date  118431 non-null  object
 6   review_id                      118431 non-null  object
 7   review_score                   118431 non-null  object
 8   payment_type                   118431 non-null  object
 9   payment_value                  118431 non-null  object
 10  order_item_id                  118431 non-null  object
 11  product_id                     118431 non-null  object
 12  seller_id                      118431 non-nu

In [22]:
mask = np.column_stack([retouch[col].astype(str).str.contains(r'sports_leisure', na=False) for col in retouch])
retouch.loc[mask.any(axis=1)].query("order_delivered_customer_date == 'sports_leisure'")

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_customer_date,review_id,review_score,payment_type,payment_value,...,customer_zip_code_prefix,customer_city,customer_state,customer_lat,customer_lng,day,week,quarter,year,half_year
52,b5172d4a722c0ebb34c29bbbb7a3cb42,c132855c926907970dcf6f2bf0b33a24,Invoiced,2018-04-17 13:14:13,2018-04-18 05:31:51,sports_leisure,f588cda1416895d04f9960ba794df07b,1.0,Boleto,143.25,...,18740,Taquarituba,SP,-20.474103,-47.395517,Tuesday,16,2,2018,1
83,ad380680e87dea0f2abf5cd5bace626c,4c7241af24b5344cb01fe687643de4fe,Shipped,2018-01-21 15:54:02,2018-01-22 13:52:41,sports_leisure,614c979929f4c737a7b5034897a30267,3.0,Credit Card,74.38,...,60336,Fortaleza,CE,-20.48577,-47.402501,Sunday,3,1,2018,1
94,cc07a8fdd3a8e94d683c8142a117dbc1,f6529ffebe6b3440d45d89604a4239ac,Shipped,2018-01-11 17:11:11,2018-01-12 17:49:45,sports_leisure,b3e872bf70b6b54f82b468147ad1319d,1.0,Voucher,137.6,...,26272,Nova Iguacu,RJ,-20.482137,-47.396822,Thursday,2,1,2018,1
118,4c33ec562d9dc5655e160913aa86eb53,ae76a4650235ab18764708174f1da31e,Shipped,2017-08-11 23:47:48,2017-08-12 00:03:51,sports_leisure,18a80894de129917815e652bde9bf45a,1.0,Credit Card,58.59,...,2983,Sao Paulo,SP,-20.48396,-47.403324,Friday,32,3,2017,2
133,ff536d93ae4214b4d51c2894ccfc569f,8891eb5ca0e28df961b2b5b8f3c0eb23,Shipped,2017-12-21 09:01:00,2017-12-22 12:33:26,sports_leisure,1a6be4c794d5e3da242a364f6a98163f,3.0,Credit Card,47.27,...,2632,Sao Paulo,SP,-20.50207,-47.404241,Thursday,51,4,2017,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118199,8c0dd54ff6a273a02de12494c6bf299f,6a65b5d6d4f4c9dfda93798cf014ee47,Shipped,2018-07-17 22:35:46,2018-07-18 21:22:25,sports_leisure,291cd4cf382eaa53812b04a26d53288a,5.0,Credit Card,20.86,...,3959,Sao Paulo,SP,-20.858427,-41.098155,Tuesday,29,3,2018,2
118235,e5e59fe47003046e2ed48ebb641564b2,9ce70b8dafe76524f3d5e5e68cbf29f5,Shipped,2018-04-27 13:52:12,2018-04-27 14:09:00,sports_leisure,f34466252980873915d4ab7d133997b2,5.0,Credit Card,98.7,...,14093,Ribeirao Preto,SP,-22.748748,-42.865895,Friday,17,2,2018,1
118243,12d9fe1a3efa51a7fae337cba819adc1,2a6fc1968c4a29c9c258bb1994afbc7f,Shipped,2017-02-08 13:07:19,2017-02-09 02:50:42,sports_leisure,ca2c0ff5ed267bbf35e45b952067fd6c,4.0,Boleto,39.42,...,27210,Volta Redonda,RJ,-22.749635,-42.870444,Wednesday,6,1,2017,1
118315,67384817c871ae183b24f9a0a6eb9bbc,f83083fcbc51d12f8279b5713c2d4b4d,Shipped,2018-01-25 12:12:37,2018-01-25 12:31:27,sports_leisure,7784917a4ab456dafb63e1cda0453477,1.0,Credit Card,554.96,...,77807,Araguaina,TO,-21.720679,-43.346939,Thursday,4,1,2018,1
