## Attention

An order might have multiple items.

Each item might be fulfiled by a distinct seller

In [158]:
#Import required libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from millify import millify

In [5]:
# The datasets were combined into one Excel file with multiple sheets
#Load workbook
xl = pd.ExcelFile('olist_store_dataset.xlsx', engine='openpyxl')

In [6]:
# list of sheets containing the datasets
xl.sheet_names

['customers_data',
 'geolocation_data',
 'order_items_data',
 'order_payments_data',
 'order_reviews_data',
 'orders_data',
 'products_data',
 'sellers_data',
 'product_categories_data']

### Load the tables from Excel worksheet to a pandas dataframe

In [7]:
# Customers data sheet
customers_df = pd.read_excel(xl, sheet_name='customers_data')
customers_df.head(2)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP


In [11]:
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  object
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: object(5)
memory usage: 3.8+ MB


In [12]:
# change column datatypes
customers_df['customer_zip_code_prefix'] = customers_df.customer_zip_code_prefix.astype('str')

# Let's standardize customer_zip_code_prefix digits to 5 for the column
customers_df.customer_zip_code_prefix = customers_df.customer_zip_code_prefix.str.zfill(5)

customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  object
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: object(5)
memory usage: 3.8+ MB


In [13]:
customers_df.customer_zip_code_prefix.sample(5)

90414    13403
27546    12935
93696    22451
78682    06850
23506    12280
Name: customer_zip_code_prefix, dtype: object

In [14]:
# geolocation data sheet
geolocation_df = pd.read_excel(xl, sheet_name='geolocation_data')
geolocation_df

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.644820,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP
...,...,...,...,...,...
1000158,99950,-28.068639,-52.010705,tapejara,RS
1000159,99900,-27.877125,-52.224882,getulio vargas,RS
1000160,99950,-28.071855,-52.014716,tapejara,RS
1000161,99980,-28.388932,-51.846871,david canabarro,RS


In [15]:
geolocation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   geolocation_zip_code_prefix  1000163 non-null  int64  
 1   geolocation_lat              1000163 non-null  float64
 2   geolocation_lng              1000163 non-null  float64
 3   geolocation_city             1000163 non-null  object 
 4   geolocation_state            1000163 non-null  object 
dtypes: float64(2), int64(1), object(2)
memory usage: 38.2+ MB


In [16]:
# Change data types of listed columns

geolocation_df['geolocation_zip_code_prefix'] = geolocation_df.geolocation_zip_code_prefix.astype(str)


# Let's standardize geolocation_zip_code_prefix digits to 5 for the column
geolocation_df.geolocation_zip_code_prefix = geolocation_df.geolocation_zip_code_prefix.str.zfill(5)

In [17]:
geolocation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   geolocation_zip_code_prefix  1000163 non-null  object 
 1   geolocation_lat              1000163 non-null  float64
 2   geolocation_lng              1000163 non-null  float64
 3   geolocation_city             1000163 non-null  object 
 4   geolocation_state            1000163 non-null  object 
dtypes: float64(2), object(3)
memory usage: 38.2+ MB


In [18]:
geolocation_df.geolocation_zip_code_prefix.sample(10)

922101    89037
669942    40110
174362    06721
249730    11045
368755    17120
41999     03067
323670    13820
639676    37930
458001    22770
702059    48490
Name: geolocation_zip_code_prefix, dtype: object

In [19]:
# order_items data sheet
order_items_df = pd.read_excel(xl, sheet_name='order_items_data')
order_items_df.head(2)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93


In [20]:
order_items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   order_id             112650 non-null  object        
 1   order_item_id        112650 non-null  int64         
 2   product_id           112650 non-null  object        
 3   seller_id            112650 non-null  object        
 4   shipping_limit_date  112650 non-null  datetime64[ns]
 5   price                112650 non-null  float64       
 6   freight_value        112650 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 6.0+ MB


- Change datatypes for some columns
- Drop the shipping_limit_date column

In [21]:
order_items_df['order_item_id'] = order_items_df['order_item_id'].astype('str')
order_items_df['product_id'] = order_items_df['product_id'].astype('str')
order_items_df['seller_id'] = order_items_df['seller_id'].astype('str')

order_items_df = order_items_df.drop(columns=['shipping_limit_date'])

In [22]:
order_items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   order_id       112650 non-null  object 
 1   order_item_id  112650 non-null  object 
 2   product_id     112650 non-null  object 
 3   seller_id      112650 non-null  object 
 4   price          112650 non-null  float64
 5   freight_value  112650 non-null  float64
dtypes: float64(2), object(4)
memory usage: 5.2+ MB


In [23]:
# order_payments data sheet
order_payments_df = pd.read_excel(xl, sheet_name='order_payments_data')
order_payments_df.head(2)

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39


In [24]:
order_payments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              103886 non-null  object 
 1   payment_sequential    103886 non-null  int64  
 2   payment_type          103886 non-null  object 
 3   payment_installments  103886 non-null  int64  
 4   payment_value         103886 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB


In [25]:
order_payments_df.duplicated().any()

False

In [26]:
# order_payments_df contains duplicate rows
order_payments_df[order_payments_df.duplicated()]

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value


In [27]:
# Change datatype
order_payments_df['order_id'] = order_payments_df['order_id'].astype('str')

# drop irrelevant columns
order_payments_df = order_payments_df.drop(columns=['payment_sequential', 'payment_installments'])
order_payments_df.info()

# drop duplicates
order_payments_df = order_payments_df.drop_duplicates()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   order_id       103886 non-null  object 
 1   payment_type   103886 non-null  object 
 2   payment_value  103886 non-null  float64
dtypes: float64(1), object(2)
memory usage: 2.4+ MB


In [28]:
order_payments_df.duplicated().any()

False

In [29]:
# order_reviews data sheet
order_reviews_df = pd.read_excel(xl, sheet_name='order_reviews_data')
order_reviews_df.head(2)

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10,2018-03-11 03:05:13


In [30]:
order_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   review_id                99224 non-null  object        
 1   order_id                 99224 non-null  object        
 2   review_score             99224 non-null  int64         
 3   review_comment_title     11567 non-null  object        
 4   review_comment_message   40974 non-null  object        
 5   review_creation_date     99224 non-null  datetime64[ns]
 6   review_answer_timestamp  99224 non-null  datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(4)
memory usage: 5.3+ MB


In [31]:
# drop irrelevant columns
order_reviews_df = order_reviews_df.drop(columns=['review_creation_date','review_comment_message',
                                                  'review_comment_title','review_answer_timestamp',
                                                 ])
order_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   review_id     99224 non-null  object
 1   order_id      99224 non-null  object
 2   review_score  99224 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 2.3+ MB


In [32]:
# orders data sheet
orders_df = pd.read_excel(xl, sheet_name='orders_data')
orders_df.head(2)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13


In [33]:
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  object        
 1   customer_id                    99441 non-null  object        
 2   order_status                   99441 non-null  object        
 3   order_purchase_timestamp       99441 non-null  datetime64[ns]
 4   order_approved_at              99281 non-null  datetime64[ns]
 5   order_delivered_carrier_date   97658 non-null  datetime64[ns]
 6   order_delivered_customer_date  96476 non-null  datetime64[ns]
 7   order_estimated_delivery_date  99441 non-null  datetime64[ns]
dtypes: datetime64[ns](5), object(3)
memory usage: 6.1+ MB


In [34]:
# drop irrelevant columns
orders_df = orders_df.drop(columns = ['order_delivered_carrier_date', 
                                      'order_estimated_delivery_date'
                                     ])
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 6 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  object        
 1   customer_id                    99441 non-null  object        
 2   order_status                   99441 non-null  object        
 3   order_purchase_timestamp       99441 non-null  datetime64[ns]
 4   order_approved_at              99281 non-null  datetime64[ns]
 5   order_delivered_customer_date  96476 non-null  datetime64[ns]
dtypes: datetime64[ns](3), object(3)
memory usage: 4.6+ MB


In [35]:
# products data sheet
products_df = pd.read_excel(xl, sheet_name='products_data')
products_df.head(2)

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0


In [36]:
products_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id                  32951 non-null  object 
 1   product_category_name       32341 non-null  object 
 2   product_name_lenght         32341 non-null  float64
 3   product_description_lenght  32341 non-null  float64
 4   product_photos_qty          32341 non-null  float64
 5   product_weight_g            32949 non-null  float64
 6   product_length_cm           32949 non-null  float64
 7   product_height_cm           32949 non-null  float64
 8   product_width_cm            32949 non-null  float64
dtypes: float64(7), object(2)
memory usage: 2.3+ MB


In [37]:
# drop irrelevant columns
products_df = products_df.drop(columns=['product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm'])
products_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   product_id             32951 non-null  object
 1   product_category_name  32341 non-null  object
dtypes: object(2)
memory usage: 515.0+ KB


In [38]:
# sellers data sheet
sellers_df = pd.read_excel(xl, sheet_name='sellers_data')
sellers_df.sample(10)

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
2612,4a1f694197d05fe70026b016a7316b41,13920,pedreira,SP
1379,1987cd4be10fe09ee50454f9c354d1c6,37570,ouro fino,MG
100,9c4d31c7e46ab03a43fc06e3142afd4e,20785,rio de janeiro,RJ
1317,e88c9b79e592e370d6bd852eeefbf057,85501,pato branco,PR
1482,804287717b9156fb7a787acd9af4fac1,87030,maringa,PR
2499,6b803197e03abd5056b4313306b4f29d,1123,sao paulo,SP
839,ad97a199236354e53fcd91a5a913e9a2,88350,brusque,SC
333,32f83ffe11cd40f7adcf4eef171f52d9,97502,uruguaiana,RS
1732,44ed138eca6214d572ce1d813fb0049b,92010,canoas,RS
1564,dd533b429f380718b70ad9922c294bae,14781,barretos,SP


In [39]:
sellers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seller_id               3095 non-null   object
 1   seller_zip_code_prefix  3095 non-null   int64 
 2   seller_city             3095 non-null   object
 3   seller_state            3095 non-null   object
dtypes: int64(1), object(3)
memory usage: 96.8+ KB


In [40]:
# change data types of columns
sellers_df['seller_zip_code_prefix'] = sellers_df['seller_zip_code_prefix'].astype('str')
sellers_df['seller_id'] = sellers_df['seller_id'].astype('str')

# Let's standardize seller_zip_code_prefix digits to 5 for the column
sellers_df.seller_zip_code_prefix = sellers_df.seller_zip_code_prefix.str.zfill(5)

sellers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seller_id               3095 non-null   object
 1   seller_zip_code_prefix  3095 non-null   object
 2   seller_city             3095 non-null   object
 3   seller_state            3095 non-null   object
dtypes: object(4)
memory usage: 96.8+ KB


In [41]:
# product_categories data sheet
product_categories_df = pd.read_excel(xl, sheet_name='product_categories_data')
product_categories_df

Unnamed: 0,product_category_name,product_category_name_english
0,beleza_saude,health_beauty
1,informatica_acessorios,computers_accessories
2,automotivo,auto
3,cama_mesa_banho,bed_bath_table
4,moveis_decoracao,furniture_decor
...,...,...
66,flores,flowers
67,artes_e_artesanato,arts_and_craftmanship
68,fraldas_higiene,diapers_and_hygiene
69,fashion_roupa_infanto_juvenil,fashion_childrens_clothes


## Using the schema below, let's merge the tables.

>1.	An order might have multiple items.
2.	Each item might be fulfilled by a distinct seller.
3.	All text identifying stores and partners were replaced by the names of Game of Thrones great houses.


<img src='schema.png' alt='Table schema' width='750px'>

In [42]:
#  orders_df + order_reviews = orders
orders = pd.merge(orders_df, order_reviews_df, on='order_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99992 entries, 0 to 99991
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99992 non-null  object        
 1   customer_id                    99992 non-null  object        
 2   order_status                   99992 non-null  object        
 3   order_purchase_timestamp       99992 non-null  datetime64[ns]
 4   order_approved_at              99831 non-null  datetime64[ns]
 5   order_delivered_customer_date  97005 non-null  datetime64[ns]
 6   review_id                      99224 non-null  object        
 7   review_score                   99224 non-null  float64       
dtypes: datetime64[ns](3), float64(1), object(4)
memory usage: 6.9+ MB


In [43]:
orders.duplicated().any()

False

In [44]:
# orders + order_payments_df = orders
orders = pd.merge(orders, order_payments_df, on='order_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103857 entries, 0 to 103856
Data columns (total 10 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       103857 non-null  object        
 1   customer_id                    103857 non-null  object        
 2   order_status                   103857 non-null  object        
 3   order_purchase_timestamp       103857 non-null  datetime64[ns]
 4   order_approved_at              103691 non-null  datetime64[ns]
 5   order_delivered_customer_date  100736 non-null  datetime64[ns]
 6   review_id                      103060 non-null  object        
 7   review_score                   103060 non-null  float64       
 8   payment_type                   103856 non-null  object        
 9   payment_value                  103856 non-null  float64       
dtypes: datetime64[ns](3), float64(2), object(5)
memory usage: 8.7+ MB


In [45]:
# orders + order_items_df = orders
orders = pd.merge(orders, order_items_df, on='order_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118432 entries, 0 to 118431
Data columns (total 15 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       118432 non-null  object        
 1   customer_id                    118432 non-null  object        
 2   order_status                   118432 non-null  object        
 3   order_purchase_timestamp       118432 non-null  datetime64[ns]
 4   order_approved_at              118265 non-null  datetime64[ns]
 5   order_delivered_customer_date  115044 non-null  datetime64[ns]
 6   review_id                      117438 non-null  object        
 7   review_score                   117438 non-null  float64       
 8   payment_type                   118429 non-null  object        
 9   payment_value                  118429 non-null  float64       
 10  order_item_id                  117618 non-null  object        
 11  

In [46]:
# orders + products = orders
orders = pd.merge(orders, products_df, on='product_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118432 entries, 0 to 118431
Data columns (total 16 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       118432 non-null  object        
 1   customer_id                    118432 non-null  object        
 2   order_status                   118432 non-null  object        
 3   order_purchase_timestamp       118432 non-null  datetime64[ns]
 4   order_approved_at              118265 non-null  datetime64[ns]
 5   order_delivered_customer_date  115044 non-null  datetime64[ns]
 6   review_id                      117438 non-null  object        
 7   review_score                   117438 non-null  float64       
 8   payment_type                   118429 non-null  object        
 9   payment_value                  118429 non-null  float64       
 10  order_item_id                  117618 non-null  object        
 11  

### Fetch Seller Latitude and Longitude values from the geolocation table

In [47]:
sellers_df['seller_lat'] = pd.merge(sellers_df, geolocation_df, 
                                        left_on=['seller_zip_code_prefix', 'seller_state', 'seller_city'], 
                                        right_on=['geolocation_zip_code_prefix', 'geolocation_state', 'geolocation_city'], 
                                         how='left')['geolocation_lat']

In [48]:
sellers_df['seller_lng'] = pd.merge(sellers_df, geolocation_df, 
                                         left_on=['seller_zip_code_prefix', 'seller_state', 'seller_city'], 
                                         right_on=['geolocation_zip_code_prefix', 'geolocation_state', 'geolocation_city'], 
                                         how='left')['geolocation_lng']

In [49]:
# orders + sellers_df = orders
orders = pd.merge(orders, sellers_df, on='seller_id', how='left')
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118432 entries, 0 to 118431
Data columns (total 21 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       118432 non-null  object        
 1   customer_id                    118432 non-null  object        
 2   order_status                   118432 non-null  object        
 3   order_purchase_timestamp       118432 non-null  datetime64[ns]
 4   order_approved_at              118265 non-null  datetime64[ns]
 5   order_delivered_customer_date  115044 non-null  datetime64[ns]
 6   review_id                      117438 non-null  object        
 7   review_score                   117438 non-null  float64       
 8   payment_type                   118429 non-null  object        
 9   payment_value                  118429 non-null  float64       
 10  order_item_id                  117618 non-null  object        
 11  

In [50]:
orders.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_customer_date,review_id,review_score,payment_type,payment_value,...,product_id,seller_id,price,freight_value,product_category_name,seller_zip_code_prefix,seller_city,seller_state,seller_lat,seller_lng
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-10 21:25:13,a54f0611adc9ed256b57ede6b6eb5114,4.0,credit_card,18.12,...,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,29.99,8.72,utilidades_domesticas,9350,maua,SP,-23.656364,-46.611549
1,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-10 21:25:13,a54f0611adc9ed256b57ede6b6eb5114,4.0,voucher,2.0,...,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,29.99,8.72,utilidades_domesticas,9350,maua,SP,-23.656364,-46.611549
2,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-10 21:25:13,a54f0611adc9ed256b57ede6b6eb5114,4.0,voucher,18.59,...,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,29.99,8.72,utilidades_domesticas,9350,maua,SP,-23.656364,-46.611549
3,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-08-07 15:27:45,8d5266042046a06655c8db133d120ba5,4.0,boleto,141.46,...,595fac2a385ac33a80bd5114aec74eb8,289cdb325fb7e7f891c38608bf9e0962,118.7,22.76,perfumaria,31570,belo horizonte,SP,-23.659474,-46.614125
4,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-17 18:06:29,e73b67b67587f7644d5bd1a52deb1b01,5.0,credit_card,179.12,...,aa4383b373c6aca5d8797843e5594415,4869f7a5dfa277a7dca6462dcf3b52b2,159.9,19.22,automotivo,14840,guariba,SP,-23.987049,-46.234482


### Also, fetch Customer Latitude and Longitude values from the geolocation table

In [51]:
customers_df['customer_lat'] = pd.merge(customers_df, geolocation_df, 
                                         left_on=['customer_zip_code_prefix', 'customer_state', 'customer_city'], right_on=['geolocation_zip_code_prefix', 'geolocation_state', 'geolocation_city'], 
                                         how='left')['geolocation_lat']

In [52]:
customers_df['customer_lng'] = pd.merge(customers_df, geolocation_df, 
                                         left_on=['customer_zip_code_prefix', 'customer_state', 'customer_city'], right_on=['geolocation_zip_code_prefix', 'geolocation_state', 'geolocation_city'], 
                                         how='left')['geolocation_lng']

In [53]:
customers_df.head()

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,customer_lat,customer_lng
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,-20.509897,-47.397866
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP,-20.497396,-47.399241
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP,-20.510459,-47.399553
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP,-20.48094,-47.394161
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP,-20.515413,-47.398194


### Finally, merge customers_df with orders using customer_id column

In [54]:
# orders + customers_df = sales_df
sales_df = pd.merge(orders, customers_df, on = 'customer_id', how='right')
sales_df.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_customer_date,review_id,review_score,payment_type,payment_value,...,seller_city,seller_state,seller_lat,seller_lng,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,customer_lat,customer_lng
0,00e7ee1b050b8499577073aeb2a297a1,06b8999e2fba1a1fbc88172c00ba8bc7,delivered,2017-05-16 15:05:35,2017-05-16 15:22:12,2017-05-25 10:35:35,88b8b52d46df026a9d1ad2136a59b30b,4.0,credit_card,146.87,...,itaquaquecetuba,SP,-16.330851,-48.947623,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,-20.509897,-47.397866
1,29150127e6685892b6eab3eec79f59c7,18955e83d337fd6b2def6b18a428ac77,delivered,2018-01-12 20:48:24,2018-01-12 20:58:32,2018-01-29 12:41:19,02fc48a9efa3e3d0f1a8ea26507eeec3,5.0,credit_card,335.48,...,itajai,SC,-16.442126,-39.07801,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP,-20.497396,-47.399241
2,b2059ed67ce144a36e2aa97d2c9e9ad2,4e7b3e00288586ebd08712fdd0374a03,delivered,2018-05-19 16:07:45,2018-05-20 16:19:10,2018-06-14 17:58:51,5ad6695d76ee186dc473c42706984d87,5.0,credit_card,157.73,...,itaquaquecetuba,SP,-16.330851,-48.947623,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP,-20.510459,-47.399553
3,951670f92359f4fe4a63112aa7306eba,b2b6027bc5c5109e529d4dc6358b12c3,delivered,2018-03-13 16:06:38,2018-03-13 17:29:19,2018-03-28 16:04:25,059a801bb31f6aab2266e672cab87bc5,5.0,credit_card,173.3,...,itaquaquecetuba,SP,-16.330851,-48.947623,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP,-20.48094,-47.394161
4,6b7d50bd145f6fc7f33cebabd7e49d0f,4f2d8ab171c80ec8364f7c12e35b23ad,delivered,2018-07-29 09:51:30,2018-07-29 10:10:09,2018-08-09 20:55:48,8490879d58d6c5d7773f2739a03f089a,5.0,credit_card,252.25,...,ibitinga,SP,-23.98232,-46.236891,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP,-20.515413,-47.398194


### Copy the sales_df table for further cleaning

In [55]:
sales_df_clean = sales_df.copy()

In [56]:
sales_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118432 entries, 0 to 118431
Data columns (total 27 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       118432 non-null  object        
 1   customer_id                    118432 non-null  object        
 2   order_status                   118432 non-null  object        
 3   order_purchase_timestamp       118432 non-null  datetime64[ns]
 4   order_approved_at              118265 non-null  datetime64[ns]
 5   order_delivered_customer_date  115044 non-null  datetime64[ns]
 6   review_id                      117438 non-null  object        
 7   review_score                   117438 non-null  float64       
 8   payment_type                   118429 non-null  object        
 9   payment_value                  118429 non-null  float64       
 10  order_item_id                  117618 non-null  object        
 11  

In [51]:
# Export to csv for visual cleaning
sales_df_clean.to_csv('dirty_sales_data.csv', index=False)

In [57]:
sales_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118432 entries, 0 to 118431
Data columns (total 27 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       118432 non-null  object        
 1   customer_id                    118432 non-null  object        
 2   order_status                   118432 non-null  object        
 3   order_purchase_timestamp       118432 non-null  datetime64[ns]
 4   order_approved_at              118265 non-null  datetime64[ns]
 5   order_delivered_customer_date  115044 non-null  datetime64[ns]
 6   review_id                      117438 non-null  object        
 7   review_score                   117438 non-null  float64       
 8   payment_type                   118429 non-null  object        
 9   payment_value                  118429 non-null  float64       
 10  order_item_id                  117618 non-null  object        
 11  

In [58]:
sales_df_clean.describe()

Unnamed: 0,review_score,payment_value,price,freight_value,seller_lat,seller_lng,customer_lat,customer_lng
count,117438.0,118429.0,117618.0,117618.0,117610.0,117610.0,118430.0,118430.0
mean,4.015753,173.575062,120.628165,20.019567,-22.550835,-45.646617,-22.00148,-45.743073
std,1.400427,268.325853,184.12144,15.809956,3.682994,3.572776,4.218093,3.383411
min,1.0,0.0,0.85,0.0,-28.48623,-54.265283,-32.103825,-63.017772
25%,4.0,61.5,39.9,13.08,-23.978509,-47.89148,-23.559614,-47.391128
50%,5.0,108.73,74.9,16.28,-22.965496,-46.619714,-22.9083,-46.353037
75%,5.0,189.69,134.9,21.18,-22.248724,-43.177365,-20.261674,-43.237025
max,5.0,13664.08,6735.0,409.68,-12.867805,-38.289328,0.08643,-34.8211


### Additional Data Quality Issues
- Product category column not in English
- '_' in item and category names
- City names are in lower case

### Product category column not in English

Replace product category name column with the translated one in the products_category_translation csv file

In [59]:
sales_df_clean.product_category_name.sample(15)

65130                     moveis_decoracao
80605                         beleza_saude
78449                           perfumaria
36583                    moveis_escritorio
68836          fashion_bolsas_e_acessorios
98682                utilidades_domesticas
48813                           perfumaria
33341                     moveis_decoracao
92880                         beleza_saude
44194                    moveis_escritorio
9966                         esporte_lazer
17520                     moveis_decoracao
3289                       cama_mesa_banho
41043    construcao_ferramentas_iluminacao
98868                        esporte_lazer
Name: product_category_name, dtype: object

In [60]:
for i in sales_df_clean.product_category_name:
    assert (str(i).islower())

In [61]:
# Some products categories were not translated to English, while others are not available
print(sales_df_clean.product_category_name.nunique(dropna=False))
print(product_categories_df.product_category_name.nunique(dropna=False))

74
71


In [62]:
missing_cat = {'product_category_name' : ['portateis_cozinha_e_preparadores_de_alimentos', 'pc_gamer', np.nan],
    'product_category_name_english' : ['kitchen_equipment', 'pc_gamer', 'not_available']}

df = pd.DataFrame(data=missing_cat)

# Concatenate with the product_categories_df dataframe
product_categories_df = pd.concat([product_categories_df, df], ignore_index=True)
product_categories_df

Unnamed: 0,product_category_name,product_category_name_english
0,beleza_saude,health_beauty
1,informatica_acessorios,computers_accessories
2,automotivo,auto
3,cama_mesa_banho,bed_bath_table
4,moveis_decoracao,furniture_decor
...,...,...
69,fashion_roupa_infanto_juvenil,fashion_childrens_clothes
70,seguros_e_servicos,security_and_services
71,portateis_cozinha_e_preparadores_de_alimentos,kitchen_equipment
72,pc_gamer,pc_gamer


In [63]:
# Next, replace the Portugese category names with English version
sales_df_clean = pd.merge(sales_df_clean, product_categories_df, on='product_category_name', how='left').drop(columns=['product_category_name'])
sales_df_clean

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_customer_date,review_id,review_score,payment_type,payment_value,...,seller_state,seller_lat,seller_lng,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,customer_lat,customer_lng,product_category_name_english
0,00e7ee1b050b8499577073aeb2a297a1,06b8999e2fba1a1fbc88172c00ba8bc7,delivered,2017-05-16 15:05:35,2017-05-16 15:22:12,2017-05-25 10:35:35,88b8b52d46df026a9d1ad2136a59b30b,4.0,credit_card,146.87,...,SP,-16.330851,-48.947623,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,-20.509897,-47.397866,office_furniture
1,29150127e6685892b6eab3eec79f59c7,18955e83d337fd6b2def6b18a428ac77,delivered,2018-01-12 20:48:24,2018-01-12 20:58:32,2018-01-29 12:41:19,02fc48a9efa3e3d0f1a8ea26507eeec3,5.0,credit_card,335.48,...,SC,-16.442126,-39.078010,290c77bc529b7ac935b93aa66c333dc3,09790,sao bernardo do campo,SP,-20.497396,-47.399241,housewares
2,b2059ed67ce144a36e2aa97d2c9e9ad2,4e7b3e00288586ebd08712fdd0374a03,delivered,2018-05-19 16:07:45,2018-05-20 16:19:10,2018-06-14 17:58:51,5ad6695d76ee186dc473c42706984d87,5.0,credit_card,157.73,...,SP,-16.330851,-48.947623,060e732b5b29e8181a18229c7b0b2b5e,01151,sao paulo,SP,-20.510459,-47.399553,office_furniture
3,951670f92359f4fe4a63112aa7306eba,b2b6027bc5c5109e529d4dc6358b12c3,delivered,2018-03-13 16:06:38,2018-03-13 17:29:19,2018-03-28 16:04:25,059a801bb31f6aab2266e672cab87bc5,5.0,credit_card,173.30,...,SP,-16.330851,-48.947623,259dac757896d24d7702b9acbbff3f3c,08775,mogi das cruzes,SP,-20.480940,-47.394161,office_furniture
4,6b7d50bd145f6fc7f33cebabd7e49d0f,4f2d8ab171c80ec8364f7c12e35b23ad,delivered,2018-07-29 09:51:30,2018-07-29 10:10:09,2018-08-09 20:55:48,8490879d58d6c5d7773f2739a03f089a,5.0,credit_card,252.25,...,SP,-23.982320,-46.236891,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP,-20.515413,-47.398194,home_confort
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118427,6760e20addcf0121e9d58f2f1ff14298,17ddf5dd5d51696bb3d7c6291687be6f,delivered,2018-04-07 15:48:17,2018-04-07 16:08:45,2018-04-13 20:06:37,36e2cdbaa9f639b57c53b37ac798fee8,4.0,credit_card,88.78,...,SP,-23.486861,-46.730143,1a29b476fee25c95fbafc67c5ac95cf8,03937,sao paulo,SP,-21.721842,-43.353880,books_general_interest
118428,9ec0c8947d973db4f4e8dcf1fbfa8f1b,e7b71a9017aa05c9a7fd292d714858e8,delivered,2018-04-04 08:20:22,2018-04-04 08:35:12,2018-04-11 18:54:45,b273b431c3aedb4eed18643309652940,5.0,credit_card,129.06,...,SP,-16.419576,-39.084272,d52a67c98be1cf6a5c84435bd38d095d,06764,taboao da serra,SP,-21.718059,-43.350766,sports_leisure
118429,fed4434add09a6f332ea398efd656a5c,5e28dfe12db7fb50a4b2f691faecea5e,delivered,2018-04-08 20:11:50,2018-04-08 20:30:03,2018-05-09 19:03:15,fa4f16891e6b2edd1354668d07f5648b,1.0,credit_card,56.04,...,SP,-16.416062,-39.085346,e9f50caf99f032f0bf3c55141f019d99,60115,fortaleza,CE,-21.719544,-43.352861,health_beauty
118430,e31ec91cea1ecf97797787471f98a8c2,56b18e2166679b8a959d72dd06da27f9,delivered,2017-11-03 21:08:33,2017-11-03 21:31:20,2017-11-16 19:58:39,0bcdc9e450ea500811a8d39ee993cd47,5.0,credit_card,711.07,...,SP,-23.987049,-46.234482,73c2643a0a458b49f58cea58833b192e,92120,canoas,RS,-21.720405,-43.353035,watches_gifts


In [64]:
sales_df_clean.info() #['product_category_name']

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118432 entries, 0 to 118431
Data columns (total 27 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       118432 non-null  object        
 1   customer_id                    118432 non-null  object        
 2   order_status                   118432 non-null  object        
 3   order_purchase_timestamp       118432 non-null  datetime64[ns]
 4   order_approved_at              118265 non-null  datetime64[ns]
 5   order_delivered_customer_date  115044 non-null  datetime64[ns]
 6   review_id                      117438 non-null  object        
 7   review_score                   117438 non-null  float64       
 8   payment_type                   118429 non-null  object        
 9   payment_value                  118429 non-null  float64       
 10  order_item_id                  117618 non-null  object        
 11  

### '_' in item and category names

### City names are in lower case

Replace all underscores in named columns of the Dataframe to have a cleaner data, then, convert to title case

In [65]:
# Replace all underscores with white space
named_columns = ['order_status', 'payment_type', 'product_category_name_english', 'seller_city', 'customer_city',]
# sales_df_clean[named_columns] = sales_df_clean[named_columns].astype('str')
sales_df_clean[named_columns] = sales_df_clean[named_columns].replace('_', ' ', regex=True)

# Next, convert to title case
for col in named_columns:
    sales_df_clean[col] = sales_df_clean[col].str.title()

In [66]:
sales_df_clean.customer_city

0                        Franca
1         Sao Bernardo Do Campo
2                     Sao Paulo
3               Mogi Das Cruzes
4                      Campinas
                  ...          
118427                Sao Paulo
118428          Taboao Da Serra
118429                Fortaleza
118430                   Canoas
118431                    Cotia
Name: customer_city, Length: 118432, dtype: object

In [67]:
sales_df_clean[named_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118432 entries, 0 to 118431
Data columns (total 5 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   order_status                   118432 non-null  object
 1   payment_type                   118429 non-null  object
 2   product_category_name_english  118432 non-null  object
 3   seller_city                    117617 non-null  object
 4   customer_city                  118432 non-null  object
dtypes: object(5)
memory usage: 5.4+ MB


In [68]:
# Add date dimension
# This will enable us apply filters using different time-based factors such as year, month, quarter, etc. 
# Please ignore warning in the output

sales_df_clean['day'] = sales_df_clean['order_purchase_timestamp'].dt.day_name()
sales_df_clean['week'] = sales_df_clean['order_purchase_timestamp'].dt.week
sales_df_clean['quarter'] = sales_df_clean['order_purchase_timestamp'].dt.quarter
sales_df_clean['year'] = sales_df_clean['order_purchase_timestamp'].dt.year
sales_df_clean['half_year'] = (sales_df_clean['quarter'] + 1) // 2
sales_df_clean

  sales_df_clean['week'] = sales_df_clean['order_purchase_timestamp'].dt.week


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_customer_date,review_id,review_score,payment_type,payment_value,...,customer_city,customer_state,customer_lat,customer_lng,product_category_name_english,day,week,quarter,year,half_year
0,00e7ee1b050b8499577073aeb2a297a1,06b8999e2fba1a1fbc88172c00ba8bc7,Delivered,2017-05-16 15:05:35,2017-05-16 15:22:12,2017-05-25 10:35:35,88b8b52d46df026a9d1ad2136a59b30b,4.0,Credit Card,146.87,...,Franca,SP,-20.509897,-47.397866,Office Furniture,Tuesday,20,2,2017,1
1,29150127e6685892b6eab3eec79f59c7,18955e83d337fd6b2def6b18a428ac77,Delivered,2018-01-12 20:48:24,2018-01-12 20:58:32,2018-01-29 12:41:19,02fc48a9efa3e3d0f1a8ea26507eeec3,5.0,Credit Card,335.48,...,Sao Bernardo Do Campo,SP,-20.497396,-47.399241,Housewares,Friday,2,1,2018,1
2,b2059ed67ce144a36e2aa97d2c9e9ad2,4e7b3e00288586ebd08712fdd0374a03,Delivered,2018-05-19 16:07:45,2018-05-20 16:19:10,2018-06-14 17:58:51,5ad6695d76ee186dc473c42706984d87,5.0,Credit Card,157.73,...,Sao Paulo,SP,-20.510459,-47.399553,Office Furniture,Saturday,20,2,2018,1
3,951670f92359f4fe4a63112aa7306eba,b2b6027bc5c5109e529d4dc6358b12c3,Delivered,2018-03-13 16:06:38,2018-03-13 17:29:19,2018-03-28 16:04:25,059a801bb31f6aab2266e672cab87bc5,5.0,Credit Card,173.30,...,Mogi Das Cruzes,SP,-20.480940,-47.394161,Office Furniture,Tuesday,11,1,2018,1
4,6b7d50bd145f6fc7f33cebabd7e49d0f,4f2d8ab171c80ec8364f7c12e35b23ad,Delivered,2018-07-29 09:51:30,2018-07-29 10:10:09,2018-08-09 20:55:48,8490879d58d6c5d7773f2739a03f089a,5.0,Credit Card,252.25,...,Campinas,SP,-20.515413,-47.398194,Home Confort,Sunday,30,3,2018,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118427,6760e20addcf0121e9d58f2f1ff14298,17ddf5dd5d51696bb3d7c6291687be6f,Delivered,2018-04-07 15:48:17,2018-04-07 16:08:45,2018-04-13 20:06:37,36e2cdbaa9f639b57c53b37ac798fee8,4.0,Credit Card,88.78,...,Sao Paulo,SP,-21.721842,-43.353880,Books General Interest,Saturday,14,2,2018,1
118428,9ec0c8947d973db4f4e8dcf1fbfa8f1b,e7b71a9017aa05c9a7fd292d714858e8,Delivered,2018-04-04 08:20:22,2018-04-04 08:35:12,2018-04-11 18:54:45,b273b431c3aedb4eed18643309652940,5.0,Credit Card,129.06,...,Taboao Da Serra,SP,-21.718059,-43.350766,Sports Leisure,Wednesday,14,2,2018,1
118429,fed4434add09a6f332ea398efd656a5c,5e28dfe12db7fb50a4b2f691faecea5e,Delivered,2018-04-08 20:11:50,2018-04-08 20:30:03,2018-05-09 19:03:15,fa4f16891e6b2edd1354668d07f5648b,1.0,Credit Card,56.04,...,Fortaleza,CE,-21.719544,-43.352861,Health Beauty,Sunday,14,2,2018,1
118430,e31ec91cea1ecf97797787471f98a8c2,56b18e2166679b8a959d72dd06da27f9,Delivered,2017-11-03 21:08:33,2017-11-03 21:31:20,2017-11-16 19:58:39,0bcdc9e450ea500811a8d39ee993cd47,5.0,Credit Card,711.07,...,Canoas,RS,-21.720405,-43.353035,Watches Gifts,Friday,44,4,2017,2


In [190]:
# OPTIONAL
# Now, let's export our cleaned data

sales_df_clean.to_csv('cleaned_sales_data.csv', index=False, )

In [69]:
sales_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118432 entries, 0 to 118431
Data columns (total 32 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       118432 non-null  object        
 1   customer_id                    118432 non-null  object        
 2   order_status                   118432 non-null  object        
 3   order_purchase_timestamp       118432 non-null  datetime64[ns]
 4   order_approved_at              118265 non-null  datetime64[ns]
 5   order_delivered_customer_date  115044 non-null  datetime64[ns]
 6   review_id                      117438 non-null  object        
 7   review_score                   117438 non-null  float64       
 8   payment_type                   118429 non-null  object        
 9   payment_value                  118429 non-null  float64       
 10  order_item_id                  117618 non-null  object        
 11  

In [70]:
sales_df_clean.query("year == 2018")['payment_value'].sum() #groupby('year')['payment_value'].sum()

11157621.08

In [71]:
sales_df_clean.groupby(by=['product_category_name_english']).sum()['payment_value'].sort_values()

product_category_name_english
Security And Services            324.51
Fashion Childrens Clothes        785.67
Cds Dvds Musicals               1199.43
Home Comfort 2                  1710.54
Pc Gamer                        2174.43
                                ...    
Watches Gifts                1429639.26
Furniture Decor              1442659.16
Computers Accessories        1598913.33
Health Beauty                1661358.66
Bed Bath Table               1740639.37
Name: payment_value, Length: 74, dtype: float64

In [156]:
total_order = sales_df_clean.groupby('order_id')[['order_id','payment_value', 'price', 'freight_value']] #.sum()

In [157]:
total_order.payment_value == total_order.price + total_order.freight_value

IndexError: Column(s) ['order_id', 'payment_value', 'price', 'freight_value'] already selected

In [159]:
total_order_value = millify(sales_df_clean.payment_value.sum())
total_order_value

'21M'

In [160]:
# Average worth of order + shipping across all categories. This includes canceled orders too.
average_order_value = millify(sales_df_clean.payment_value.mean())
average_order_value

'174'

In [162]:
this_year = sales_df_clean.year.max()
last_year = this_year - 1

2018 2017


In [164]:
# Value of orders already delivered to customers this year
total_cleared_order_value = sales_df_clean.query("order_status == 'Delivered'").query("year == @this_year")[['payment_value']].sum()
total_cleared_order_value

payment_value    10870392.98
dtype: float64

In [77]:
# Value of orders that were canceled
total_canceled_order = sales_df_clean.query("order_status == 'Canceled'").groupby('order_id')[['order_id','payment_value', 'price', 'freight_value']]
total_canceled_order.sum()

Unnamed: 0_level_0,payment_value,price,freight_value
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00310b0c75bb13015ec4d82d341865a4,55.28,39.90,15.38
00ae7a8b4936674ebb701d4a23719a79,37.16,27.90,9.26
00b1cb0320190ca0daa2c88b35206009,0.00,0.00,0.00
00d0ffd14774da775ac832ba8520510f,134.49,0.00,0.00
00ff0cf5583758e6964723e42f111bf4,170.15,154.90,15.25
...,...,...,...
fc3c882665c98c9b737a7b1b3aa6c553,37.78,28.90,8.88
fd4c3a2912e854eedd463b329540da4b,1395.76,0.00,0.00
fdbbb1715d0c62c714e2a8178b95dd54,69.14,57.99,11.15
fe9aa3b22b4d65ccbaffb57984bc12fb,47.68,39.90,7.78


In [84]:
# YoY Growth
# formula = (revenue this year - revenue last year)/revenue last year * 100

def YOY_growth(df, this_year, last_year):
    revenue_this_year = df.query("year == @this_year").payment_value.sum()
    revenue_last_year = df.query("year == @last_year").payment_value.sum()
#     print(revenue_this_year, revenue_last_year)
    
    difference = revenue_this_year - revenue_last_year
    yoy = (difference / revenue_last_year) * 100
    return yoy

In [87]:
YOY_growth(sales_df_clean, 2018, 2017)

19.69445372944715

In [95]:
sales_df_clean.order_status.unique()

array(['Delivered', 'Canceled', 'Invoiced', 'Shipped', 'Processing',
       'Unavailable', 'Approved', 'Created'], dtype=object)

In [121]:
fig = go.Figure(go.Indicator(
    mode = "gauge+number",
    number = {'prefix': "$", 'font': {'size': 50}},
    value = 80000,
    align = 'center',
    domain = {'x': [0,1], 'y': [0,1]},
    gauge = {
        'axis': {'range': [None, 90000], 'tickwidth': 0, 'tickcolor': "#e05628"},
        'bar': {'color': "#e05628"},
        'bgcolor': 'rgb(255,255,255, 0)',
        'borderwidth': 0.5,
        'bordercolor': "white",
#         'steps': [
#             {'range': [0, 33], 'color': 'red'},
#             {'range': [33, 66], 'color': 'yellow'},
#             {'range': [66,100], 'color': 'green'}],
        }))

fig.update_layout(font = {'color': "black", 'family': "sans serif"})

fig.show()

In [194]:
target_sales = 70000
current_sales = 30000
delta = target_sales - current_sales
values = [current_sales, delta]
colors = ["#e05628", "#C7C9CE"]

fig6 = go.Figure(data = go.Pie(values = values, 
#                                labels = labels, 
                               hole = 0.8,
                               marker_colors = colors))
fig6.update_traces(hoverinfo='label+percent',
                   textinfo='percent', 
                   textfont_size=20)
fig6.update_layout(
                   title_text = 'Population Economically Active',
                   title_font = dict(size=25,family='sans serif', 
                                     color='darkred',))
fig6.add_annotation(x= 0.5, y = 0.5,
                    text = '${}'.format(target_sales),
                    font = dict(size = 50,family='sana serif', 
                                color='black'),
                    showarrow = False)
fig6.show()

In [192]:
target_this_year()

800000


In [178]:
this_year = sales_df_clean.year.max()
last_year = this_year - 1

def total_cleared_order():
    '''Value of orders already delivered to customers THIS year'''
    total_cleared_order = sales_df_clean.query("order_status == 'Delivered'").query("year == @this_year")['payment_value'].sum()
    return int(total_cleared_order)

def YoY_growth(this_year, last_year):
    revenue_this_year = sales_df_clean.query("year == @this_year").payment_value.sum()
    revenue_last_year = sales_df_clean.query("year == @last_year").payment_value.sum()    
    difference = revenue_this_year - revenue_last_year
    yoy_growth = (difference / revenue_last_year) * 100
    return int(yoy_growth)

In [180]:
YoY_growth(this_year=last_year, last_year=last_year-1)

12014

In [187]:
def target_this_year():
    target_sales = 80000
    current_sales = total_cleared_order()
    delta = target_sales - current_sales
    values = [current_sales, delta]
    colors = ["#e05628", "#e05628"]

    data = go.Pie(values = values,hole = 0.8, marker_colors = colors)
    fig_target_sales = go.Figure(data=data)
    fig_target_sales.update_traces(hoverinfo='label+percent',
                    textinfo = 'none', #'percent', 
                    textfont_size=20,  
                    rotation= 45,
                    showlegend = False,)

    fig_target_sales.add_annotation(x= 0.5, y = 0.5,
                        text = '${}'.format(target_sales),
                        font = dict(size = 50,family='sana serif', 
                                    color='black'),
                        showarrow = False)
    fig_target_sales.show()
target_this_year()