# DataWarehouse com dados de Eccomerce

Os scripts abaixo realizam a modelagem de dados para criar um data warehouse.

Primeiro, é feito uma analise bem preliminar dos dados, junto a testes de criação de outras colunas. Por último, os dados são modelados para a criação de um data warehouse.

## Importando Bibliotecas

In [0]:
from pyspark.sql.functions import col,isnan, when, count, sum, udf, to_timestamp, datediff, max, min, lit, avg, row_number
from pyspark.sql.functions import year, month, hour, dayofmonth, dayofweek, date_format
from pyspark.sql.window import Window
from pyspark.sql.utils import AnalysisException
from pyspark.sql.types import StringType, IntegerType, DoubleType, TimestampType, StructType
from pyspark.ml.feature import QuantileDiscretizer

import datetime
import pandas as pd
import math


### Importando os dados

In [0]:
path = "/FileStore/shared_uploads/luannrs@hotmail.com/"

# CSV
orders_items_df = spark.read.csv(path+'olist_order_items_dataset.csv',header=True)
orders_df = spark.read.csv(path+'olist_orders_dataset.csv',header=True)
orders_payments_df = spark.read.csv(path+'olist_order_payments_dataset.csv',header=True)
orders_reviews_df = spark.read.option("multiLine",True).csv(path+'olist_order_reviews_dataset.csv',header=True, escape="\"")

customers_df = spark.read.csv(path+'olist_customers_dataset.csv',header=True)
geolocations_df = spark.read.csv(path+'olist_geolocation_dataset.csv',header=True)
sellers_df = spark.read.csv(path+'olist_sellers_dataset.csv',header=True)
products_df = spark.read.csv(path+'olist_products_dataset.csv',header=True)
product_translations_df = spark.read.csv(path+'product_category_name_translation.csv',header=True)

### Verificando características dos dados

Verifica primeiras linhas de cada tabela

In [0]:
orders_items_df.limit(10).toPandas()

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.9,18.14
5,00048cc3ae777c65dbb7d2a0634bc1ea,1,ef92defde845ab8450f9d70c526ef70f,6426d21aca402a131fc0a5d0960a3c90,2017-05-23 03:55:27,21.9,12.69
6,00054e8431b9d7675808bcb819fb4a32,1,8d4f2bb7e93e6710a28f34fa83ee7d28,7040e82f899a04d1b434b795a43b4617,2017-12-14 12:10:31,19.9,11.85
7,000576fe39319847cbb9d288c5617fa6,1,557d850972a7d6f792fd18ae1400d9b6,5996cddab893a4652a15592fb58ab8db,2018-07-10 12:30:45,810.0,70.75
8,0005a1a1728c9d785b8e2b08b904576c,1,310ae3c140ff94b03219ad0adc3c778f,a416b6a846a11724393025641d4edd5e,2018-03-26 18:31:29,145.95,11.65
9,0005f50442cb953dcd1d21e1fb923495,1,4535b0e1091c278dfd193e5a1d63b39f,ba143b05f0110f0dc71ad71b4466ce92,2018-07-06 14:10:56,53.99,11.4


In [0]:
print((orders_items_df.count(), len(orders_items_df.columns)))

(112652, 7)


In [0]:
orders_df.limit(10).toPandas()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00
5,a4591c265e18cb1dcee52889e2d8acc3,503740e9ca751ccdda7ba28e9ab8f608,delivered,2017-07-09 21:57:05,2017-07-09 22:10:13,2017-07-11 14:58:04,2017-07-26 10:57:55,2017-08-01 00:00:00
6,136cce7faa42fdb2cefd53fdc79a6098,ed0271e0b7da060a393796590e7b737a,invoiced,2017-04-11 12:22:08,2017-04-13 13:25:17,,,2017-05-09 00:00:00
7,6514b8ad8028c9f2cc2374ded245783f,9bdf08b4b3b52b5526ff42d37d47f222,delivered,2017-05-16 13:10:30,2017-05-16 13:22:11,2017-05-22 10:07:46,2017-05-26 12:55:51,2017-06-07 00:00:00
8,76c6e866289321a7c93b82b54852dc33,f54a9f0e6b351c431402b8461ea51999,delivered,2017-01-23 18:29:09,2017-01-25 02:50:47,2017-01-26 14:16:31,2017-02-02 14:08:10,2017-03-06 00:00:00
9,e69bfb5eb88e0ed6a785585b27e16dbf,31ad1d1b63eb9962463f764d4e6e0c9d,delivered,2017-07-29 11:55:02,2017-07-29 12:05:32,2017-08-10 19:45:24,2017-08-16 17:14:30,2017-08-23 00:00:00


In [0]:
print((orders_df.count(), len(orders_df.columns)))

(99443, 8)


In [0]:
orders_payments_df.limit(10).toPandas()

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45
5,298fcdf1f73eb413e4d26d01b25bc1cd,1,credit_card,2,96.12
6,771ee386b001f06208a7419e4fc1bbd7,1,credit_card,1,81.16
7,3d7239c394a212faae122962df514ac7,1,credit_card,3,51.84
8,1f78449c87a54faf9e96e88ba1491fa9,1,credit_card,6,341.09
9,0573b5e23cbd798006520e1d5b4c6714,1,boleto,1,51.95


In [0]:
print((orders_payments_df.count(), len(orders_payments_df.columns)))

(103887, 5)


In [0]:
orders_reviews_df.limit(10).toPandas()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53
5,15197aa66ff4d0650b5434f1b46cda19,b18dcdf73be66366873cd26c5724d1dc,1,,,2018-04-13 00:00:00,2018-04-16 00:39:37
6,07f9bee5d1b850860defd761afa7ff16,e48aa0d2dcec3a2e87348811bcfdf22b,5,,,2017-07-16 00:00:00,2017-07-18 19:30:34
7,7c6400515c67679fbee952a7525281ef,c31a859e34e3adac22f376954e19b39d,5,,,2018-08-14 00:00:00,2018-08-14 21:36:06
8,a3f6f7f6f433de0aefbb97da197c554c,9c214ac970e84273583ab523dfafd09b,5,,,2017-05-17 00:00:00,2017-05-18 12:05:37
9,8670d52e15e00043ae7de4c01cc2fe06,b9bf720beb4ab3728760088589c62129,4,recomendo,aparelho eficiente. no site a marca do aparelh...,2018-05-22 00:00:00,2018-05-23 16:45:47


In [0]:
print((orders_reviews_df.count(), len(orders_reviews_df.columns)))

(99224, 7)


In [0]:
customers_df.limit(10).toPandas()

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP
5,879864dab9bc3047522c92c82e1212b8,4c93744516667ad3b8f1fb645a3116a4,89254,jaragua do sul,SC
6,fd826e7cf63160e536e0908c76c3f441,addec96d2e059c80c30fe6871d30d177,4534,sao paulo,SP
7,5e274e7a0c3809e14aba7ad5aae0d407,57b2a98a409812fe9618067b6b8ebe4f,35182,timoteo,MG
8,5adf08e34b2e993982a47070956c5c65,1175e95fb47ddff9de6b2b06188f7e0d,81560,curitiba,PR
9,4b7139f34592b3a31687243a302fa75b,9afe194fb833f79e300e37e580171f22,30575,belo horizonte,MG


In [0]:
print((customers_df.count(), len(customers_df.columns)))

(99442, 5)


In [0]:
geolocations_df.limit(10).toPandas()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.54562128115268,-46.63929204800168,sao paulo,SP
1,1046,-23.54608112703553,-46.64482029837157,sao paulo,SP
2,1046,-23.54612896641469,-46.64295148361138,sao paulo,SP
3,1041,-23.5443921648681,-46.63949930627844,sao paulo,SP
4,1035,-23.541577961711493,-46.64160722329613,sao paulo,SP
5,1012,-23.547762303364262,-46.63536053788448,são paulo,SP
6,1047,-23.54627311241268,-46.64122516971552,sao paulo,SP
7,1013,-23.546923208436723,-46.6342636964915,sao paulo,SP
8,1029,-23.543769055769133,-46.63427784085132,sao paulo,SP
9,1011,-23.547639550320632,-46.63603162315495,sao paulo,SP


In [0]:
print((geolocations_df.count(), len(geolocations_df.columns)))

(1000163, 5)


In [0]:
sellers_df.limit(10).toPandas()

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP
5,c240c4061717ac1806ae6ee72be3533b,20920,rio de janeiro,RJ
6,e49c26c3edfa46d227d5121a6b6e4d37,55325,brejao,PE
7,1b938a7ec6ac5061a66a3766e0e75f90,16304,penapolis,SP
8,768a86e36ad6aae3d03ee3c6433d61df,1529,sao paulo,SP
9,ccc4bbb5f32a6ab2b7066a4130f114e3,80310,curitiba,PR


In [0]:
print((sellers_df.count(), len(sellers_df.columns)))

(3096, 4)


In [0]:
products_df.limit(10).toPandas()

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40,287,1,225,16,10,14
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44,276,1,1000,30,18,20
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46,250,1,154,18,9,15
3,cef67bcfe19066a932b7673e239eb23d,bebes,27,261,1,371,26,4,26
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37,402,4,625,20,17,13
5,41d3672d4792049fa1779bb35283ed13,instrumentos_musicais,60,745,1,200,38,5,11
6,732bd381ad09e530fe0a5f457d81becb,cool_stuff,56,1272,4,18350,70,24,44
7,2548af3e6e77a690cf3eb6368e9ab61e,moveis_decoracao,56,184,2,900,40,8,40
8,37cc742be07708b53a98702e77a21a02,eletrodomesticos,57,163,1,400,27,13,17
9,8c92109888e8cdf9d66dc7e463025574,brinquedos,36,1156,1,600,17,10,12


In [0]:
print((products_df.count(), len(products_df.columns)))

(32952, 9)


In [0]:
product_translations_df.limit(10).toPandas()

Unnamed: 0,product_category_name,product_category_name_english
0,beleza_saude,health_beauty
1,informatica_acessorios,computers_accessories
2,automotivo,auto
3,cama_mesa_banho,bed_bath_table
4,moveis_decoracao,furniture_decor
5,esporte_lazer,sports_leisure
6,perfumaria,perfumery
7,utilidades_domesticas,housewares
8,telefonia,telephony
9,relogios_presentes,watches_gifts


In [0]:
print((product_translations_df.count(), len(product_translations_df.columns)))

(74, 2)


### Analise bem preliminar e testes em novas colunas

Junta todas as tabelas em uma só

In [0]:
df= customers_df.join( orders_df,["customer_id"], how='inner')
df= df.join(orders_reviews_df, ["order_id"], how='inner')
df= df.join(orders_items_df, ["order_id"], how='inner')
df= df.join(products_df, ["product_id"], how='inner')
df= df.join(orders_payments_df, ["order_id"], how='inner')
df= df.join(sellers_df, ['seller_id'], how='inner')
print((df.count(), len(df.columns)))

(117329, 39)


In [0]:
df.printSchema()

root
 |-- seller_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: string (nullable = true)
 |-- order_approved_at: string (nullable = true)
 |-- order_delivered_carrier_date: string (nullable = true)
 |-- order_delivered_customer_date: string (nullable = true)
 |-- order_estimated_delivery_date: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- review_score: string (nullable = true)
 |-- review_comment_title: string (nullable = true)
 |-- review_comment_message: string (nullable = true)
 |-- review_creation_date: string (nullable = true)
 |-- review_answer_timestamp: string (nullable = true)
 |-

Verifica se existem valores duplicados

In [0]:
df \
    .groupby(df.columns) \
    .count() \
    .where('count > 1') \
    .sort('count', ascending=False) \
    .show()

+---------+--------+----------+-----------+------------------+------------------------+-------------+--------------+------------+------------------------+-----------------+----------------------------+-----------------------------+-----------------------------+---------+------------+--------------------+----------------------+--------------------+-----------------------+-------------+-------------------+-----+-------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+------------------+------------+--------------------+-------------+----------------------+-----------+------------+-----+
|seller_id|order_id|product_id|customer_id|customer_unique_id|customer_zip_code_prefix|customer_city|customer_state|order_status|order_purchase_timestamp|order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|review_id|review_score|review_co

Verifica quantidade de linhas nulas para cada coluna

In [0]:

df_count_nulls = df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns])
cols = []

for c in df.columns:
    count_null_values = df_count_nulls.where(col(c) > 0).count()
    if count_null_values > 0:
        cols.append(c)

df_count_nulls.select(cols).toPandas()


Out[284]: '\ndf_count_nulls = df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns])\ncols = []\n\nfor c in df.columns:\n    count_null_values = df_count_nulls.where(col(c) > 0).count()\n    if count_null_values > 0:\n        cols.append(c)\n\ndf_count_nulls.select(cols).toPandas()\n'

Exclui linhas nulas, baseado nas colunas da tabela de pedidos

In [0]:
df = df.dropna(subset = orders_df.columns)

Diminui a quantidade de categorias para facilitar as analises

In [0]:
def reduce_categories(x):
    if x in ['moveis_escritorio', 'moveis_decoracao', 'moveis_sala', 'moveis_cozinha_area_de_servico_jantar_e_jardim', 'cama_mesa_banho', 'casa_conforto', 'casa_conforto_2', 'casa_construcao', 'ferramentas_jardim', 'moveis_quarto', 'moveis_colchao_e_estofado']:
        return 'Mobília'
    
    elif x in ['automotivo', 'informatica_acessorios', 'instrumentos_musicais', 'consoles_games', 'relogios_presentes', 'climatizacao', 'telefonia', 'eletronicos', 'telefonia_fixa', 'tablets_impressao_imagem', 'pcs', 'portateis_casa_forno_e_cafe', 'eletroportateis', 'audio', 'sinalizacao_e_seguranca', 'seguros_e_servicos']:
        return 'Eletrônicos'
    
    elif x in ['fashion_roupa_feminina', 'fashion_roupa_masculina', 'fashion_bolsas_e_acessorios', 'fashion_calcados', 'fashion_esporte', 'fashion_underwear_e_moda_praia', 'fashion_roupa_infanto_juvenil', 'bebes', 'cool_stuff', ]:
        return 'Fashion'
    
    elif x in ['utilidades_domesticas', 'casa_conforto', 'eletrodomesticos', 'eletrodomesticos_2', 'flores', 'construcao_ferramentas_jardim', 'ferramentas_jardim', 'construcao_ferramentas_iluminacao', 'construcao_ferramentas_ferramentas', 'malas_acessorios', 'la_cuisine', 'pet_shop', 'market_place']:
        return 'Acessórios Doméstico'
    
    elif x in ['esporte_lazer', 'brinquedos', 'cds_dvds_musicais', 'musica', 'dvds_blu_ray', 'cine_foto', 'artigos_de_festas', 'artigos_de_natal', 'artes_e_artesanato', 'artes']:
        return 'Entreterimento'
    
    elif x in ['beleza_saude', 'perfumaria', 'fraldas_higiene']:
        return 'Produtos de Beleza e Higiene'
    
    elif x in ['alimentos_bebidas', 'bebidas', 'alimentos']:
        return 'Comidas e Bebidas'
    
    elif x in ['livros_interesse_geral', 'livros_tecnicos', 'livros_importados', 'papelaria']:
        return 'Livros e artigos de papelaria'
    
    elif x in ['construcao_ferramentas_construcao', 'construcao_ferramentas_seguranca', 'industria_comercio_e_negocios', 'agro_industria_e_comercio']:
        return 'Industria e Construção'
    else:
        return x
    
    
reduce_categories_udf = udf(reduce_categories, StringType())

In [0]:
df = df.withColumn("product_category", reduce_categories_udf(col("product_category_name")))
df.select("product_category").distinct().limit(80).toPandas()

Unnamed: 0,product_category
0,pc_gamer
1,Fashion
2,
3,Industria e Construção
4,portateis_cozinha_e_preparadores_de_alimentos
5,Entreterimento
6,Eletrônicos
7,Mobília
8,Produtos de Beleza e Higiene
9,Livros e artigos de papelaria


Cria colunas para facilitar analise das colunas das datas

In [0]:
df = df.withColumn('order_estimated_delivery_date', to_timestamp(col("order_estimated_delivery_date")))
df = df.withColumn('order_purchase_timestamp', to_timestamp(col("order_purchase_timestamp"))) 
df = df.withColumn('order_delivered_customer_date', to_timestamp(col("order_delivered_customer_date"))) 
df = df.withColumn('order_delivered_carrier_date', to_timestamp(col("order_delivered_carrier_date"))) 
df = df.withColumn('shipping_limit_date', to_timestamp(col("shipping_limit_date"))) 

df.select(['order_estimated_delivery_date', 'order_purchase_timestamp','order_delivered_customer_date',
           'order_delivered_carrier_date', 'shipping_limit_date']).limit(10).toPandas()

Unnamed: 0,order_estimated_delivery_date,order_purchase_timestamp,order_delivered_customer_date,order_delivered_carrier_date,shipping_limit_date
0,2017-10-18,2017-10-02 10:56:33,2017-10-10 21:25:13,2017-10-04 19:55:00,2017-10-06 11:07:15
1,2017-10-18,2017-10-02 10:56:33,2017-10-10 21:25:13,2017-10-04 19:55:00,2017-10-06 11:07:15
2,2017-10-18,2017-10-02 10:56:33,2017-10-10 21:25:13,2017-10-04 19:55:00,2017-10-06 11:07:15
3,2018-08-13,2018-07-24 20:41:37,2018-08-07 15:27:45,2018-07-26 14:31:00,2018-07-30 03:24:27
4,2018-09-04,2018-08-08 08:38:49,2018-08-17 18:06:29,2018-08-08 13:50:00,2018-08-13 08:55:23
5,2017-12-15,2017-11-18 19:28:06,2017-12-02 00:28:42,2017-11-22 13:39:59,2017-11-23 19:45:59
6,2018-02-26,2018-02-13 21:18:39,2018-02-16 18:17:02,2018-02-14 19:46:34,2018-02-19 20:31:37
7,2017-08-01,2017-07-09 21:57:05,2017-07-26 10:57:55,2017-07-11 14:58:04,2017-07-13 22:10:13
8,2017-06-07,2017-05-16 13:10:30,2017-05-26 12:55:51,2017-05-22 10:07:46,2017-05-22 13:22:11
9,2017-03-06,2017-01-23 18:29:09,2017-02-02 14:08:10,2017-01-26 14:16:31,2017-01-27 18:29:09


In [0]:
def get_arrival_status(x):
    if x == 0:
        return 'Atrasado'
    if x:
        if x >= 0:
            return 'Em tempo'
        else:
            return 'Atrasado'
    else:
        return None
    
get_arrival_status_udf = udf(get_arrival_status, StringType())


In [0]:
df = df.withColumn('estimated_days', datediff(col('order_estimated_delivery_date'),col('order_purchase_timestamp'))) 
df = df.withColumn('arrival_days', datediff(col('order_delivered_customer_date'),col('order_purchase_timestamp'))) 
df = df.withColumn('shipping_days', datediff(col('order_delivered_customer_date'),col('order_delivered_carrier_date')))
df = df.withColumn('seller_to_carrier_status', datediff(col('shipping_limit_date'),col('order_delivered_carrier_date'))) 
df = df.withColumn('arrival_status',datediff(col('order_estimated_delivery_date'),col('order_delivered_customer_date'))) 

df = df.withColumn('seller_to_carrier_status', get_arrival_status_udf(col('seller_to_carrier_status'))) 
df = df.withColumn('arrival_status',get_arrival_status_udf(col('arrival_status'))) 

df.select(['estimated_days','arrival_days', 'shipping_days',  'seller_to_carrier_status', 'arrival_status',
          'order_estimated_delivery_date', 'order_purchase_timestamp']).limit(10).toPandas()

Unnamed: 0,estimated_days,arrival_days,shipping_days,seller_to_carrier_status,arrival_status,order_estimated_delivery_date,order_purchase_timestamp
0,16,8,6,Em tempo,Em tempo,2017-10-18,2017-10-02 10:56:33
1,16,8,6,Em tempo,Em tempo,2017-10-18,2017-10-02 10:56:33
2,16,8,6,Em tempo,Em tempo,2017-10-18,2017-10-02 10:56:33
3,20,14,12,Em tempo,Em tempo,2018-08-13,2018-07-24 20:41:37
4,27,9,9,Em tempo,Em tempo,2018-09-04,2018-08-08 08:38:49
5,27,14,10,Em tempo,Em tempo,2017-12-15,2017-11-18 19:28:06
6,13,3,2,Em tempo,Em tempo,2018-02-26,2018-02-13 21:18:39
7,23,17,15,Em tempo,Em tempo,2017-08-01,2017-07-09 21:57:05
8,22,10,4,Atrasado,Em tempo,2017-06-07,2017-05-16 13:10:30
9,42,10,7,Em tempo,Em tempo,2017-03-06,2017-01-23 18:29:09


In [0]:
def get_duration_status(x):
    if x:
        if x in range(0, 8):
            return 'Muito Rápido'
        
        elif x in range(8, 16):
            return 'Rápido'
        
        elif x in range(16, 25):
            return 'Duração OK'
        
        elif x > 24:
            return 'Devagar'
        
        else:
            return 'Muito Devagar'
    else:
        return None

get_duration_status_udf = udf(get_duration_status, StringType())
df = df.withColumn('estimated_delivery_rate', get_duration_status_udf(col('estimated_days')))
df = df.withColumn('arrival_delivery_rate', get_duration_status_udf(col('arrival_days'))) 
df = df.withColumn('shipping_delivery_rate', get_duration_status_udf(col('shipping_days')))

df.select(['estimated_days','arrival_days', 'shipping_days', 'estimated_delivery_rate',
          'arrival_delivery_rate', 'shipping_delivery_rate' ]).limit(10).toPandas()


Unnamed: 0,estimated_days,arrival_days,shipping_days,estimated_delivery_rate,arrival_delivery_rate,shipping_delivery_rate
0,16,8,6,Duração OK,Rápido,Muito Rápido
1,16,8,6,Duração OK,Rápido,Muito Rápido
2,16,8,6,Duração OK,Rápido,Muito Rápido
3,20,14,12,Duração OK,Rápido,Rápido
4,27,9,9,Devagar,Rápido,Rápido
5,27,14,10,Devagar,Rápido,Rápido
6,13,3,2,Rápido,Muito Rápido,Muito Rápido
7,23,17,15,Duração OK,Duração OK,Rápido
8,22,10,4,Duração OK,Rápido,Muito Rápido
9,42,10,7,Devagar,Rápido,Muito Rápido


Cria colunas para facilitar analise das notas

In [0]:
df = df.withColumn('review_score', col("review_score").cast("int") )
get_review_status_udf = udf(lambda x: 'Satisfeito' if x >= 4 else 'Não Satisfeito', StringType())
df = df.withColumn('review_score_status', get_review_status_udf(col('review_score')))

df.select(['review_score', 'review_score_status'  ]).limit(10).toPandas()

Unnamed: 0,review_score,review_score_status
0,4,Satisfeito
1,4,Satisfeito
2,4,Satisfeito
3,4,Satisfeito
4,5,Satisfeito
5,5,Satisfeito
6,5,Satisfeito
7,4,Satisfeito
8,5,Satisfeito
9,1,Não Satisfeito


In [0]:
max_date_df = df.select( max("order_purchase_timestamp")).first()
max_date = max_date_df.asDict()["max(order_purchase_timestamp)"]
print(max_date)

2018-08-29 15:00:37


Realiza segmentação de clientes

In [0]:
rfm_table = df.groupby('customer_unique_id').agg(max('order_purchase_timestamp').alias('Recency'),
                                                 count('product_id').alias('Frequancy'),
                                                 sum('payment_value').alias('Monetary'))
rfm_table = rfm_table.withColumn('Recency', datediff( lit(max_date),col('Recency'))) 
rfm_table.limit(10).toPandas()

Unnamed: 0,customer_unique_id,Recency,Frequancy,Monetary
0,969cdc8af5b07074766e79c1e6d76c24,114,2,269.32
1,f2a9bc9a1db05c873e1419654f747f9e,29,1,122.42
2,d0c5e56e04e886e73c79b3b71f0d6f0b,417,1,79.24
3,5f03b965e26e79a371d229cfeeb578d7,435,1,54.0
4,14a188558af6cd5bc222ca773d395a7f,175,2,245.15
5,ef1c2fafea5285a4bfb61386bc5e3154,9,2,471.68
6,d339ed835c9d8fd6b1e3c5cb60850bc5,565,2,524.2
7,39d6e50625a51a618c2cf02a026231c1,384,1,66.74
8,8d2fa65d968da66afc05f2e28250c9dc,176,1,150.91
9,d04921557f1cde4963c30a11dfb1719d,6,1,155.98


In [0]:
qd =  QuantileDiscretizer(numBuckets=4, inputCols=["Recency", "Frequancy", "Monetary"],
                          outputCols=["r_score", "f_score", "m_score"])
rfm_table = qd.fit(rfm_table).transform(rfm_table)
rfm_table.limit(10).toPandas()

Unnamed: 0,customer_unique_id,Recency,Frequancy,Monetary,r_score,f_score,m_score
0,969cdc8af5b07074766e79c1e6d76c24,114,2,269.32,1.0,1.0,3.0
1,f2a9bc9a1db05c873e1419654f747f9e,29,1,122.42,0.0,1.0,2.0
2,d0c5e56e04e886e73c79b3b71f0d6f0b,417,1,79.24,3.0,1.0,1.0
3,5f03b965e26e79a371d229cfeeb578d7,435,1,54.0,3.0,1.0,0.0
4,14a188558af6cd5bc222ca773d395a7f,175,2,245.15,1.0,1.0,3.0
5,ef1c2fafea5285a4bfb61386bc5e3154,9,2,471.68,0.0,1.0,3.0
6,d339ed835c9d8fd6b1e3c5cb60850bc5,565,2,524.2,3.0,1.0,3.0
7,39d6e50625a51a618c2cf02a026231c1,384,1,66.74,3.0,1.0,1.0
8,8d2fa65d968da66afc05f2e28250c9dc,176,1,150.91,1.0,1.0,2.0
9,d04921557f1cde4963c30a11dfb1719d,6,1,155.98,0.0,1.0,2.0


In [0]:
rfm_table = rfm_table.withColumn('r_score', (-1)*(col("r_score") - 4))
rfm_table = rfm_table.withColumn('f_score', col('f_score') + 1 )
rfm_table = rfm_table.withColumn('m_score', col('m_score') + 1 )

rfm_table.limit(10).toPandas()

Unnamed: 0,customer_unique_id,Recency,Frequancy,Monetary,r_score,f_score,m_score
0,969cdc8af5b07074766e79c1e6d76c24,114,2,269.32,3.0,2.0,4.0
1,f2a9bc9a1db05c873e1419654f747f9e,29,1,122.42,4.0,2.0,3.0
2,d0c5e56e04e886e73c79b3b71f0d6f0b,417,1,79.24,1.0,2.0,2.0
3,5f03b965e26e79a371d229cfeeb578d7,435,1,54.0,1.0,2.0,1.0
4,14a188558af6cd5bc222ca773d395a7f,175,2,245.15,3.0,2.0,4.0
5,ef1c2fafea5285a4bfb61386bc5e3154,9,2,471.68,4.0,2.0,4.0
6,d339ed835c9d8fd6b1e3c5cb60850bc5,565,2,524.2,1.0,2.0,4.0
7,39d6e50625a51a618c2cf02a026231c1,384,1,66.74,1.0,2.0,2.0
8,8d2fa65d968da66afc05f2e28250c9dc,176,1,150.91,3.0,2.0,3.0
9,d04921557f1cde4963c30a11dfb1719d,6,1,155.98,4.0,2.0,3.0


In [0]:
def customer_segmantation(rfm_score):
    
    if rfm_score == 444:
        return 'VIP'
    
    elif rfm_score >= 433 and rfm_score < 444:
        return 'Muito Leal'

    elif  rfm_score >=421 and rfm_score< 433:
        return 'Lealdade em Potencial'

    elif rfm_score>=344 and rfm_score < 421:
        return 'Novo Cliente'

    elif rfm_score>=323 and rfm_score<344:
        return 'Cliente em Potencial'

    elif rfm_score>=224 and rfm_score<311:
        return 'Alto Risco de Rotatividade' 

    else:
        return 'Cliente Perdido' 

In [0]:
customer_segmantation_udf = udf(customer_segmantation,  StringType())

rfm_table = rfm_table.withColumn('rfm_score', (100 * col("r_score") + 10 * col("f_score") + col("m_score")) )
rfm_table = rfm_table.withColumn('customer_segmantation', customer_segmantation_udf(col("rfm_score")) )

rfm_table.limit(20).toPandas()

Unnamed: 0,customer_unique_id,Recency,Frequancy,Monetary,r_score,f_score,m_score,rfm_score,customer_segmantation
0,969cdc8af5b07074766e79c1e6d76c24,114,2,269.32,3.0,2.0,4.0,324.0,Cliente em Potencial
1,f2a9bc9a1db05c873e1419654f747f9e,29,1,122.42,4.0,2.0,3.0,423.0,Lealdade em Potencial
2,d0c5e56e04e886e73c79b3b71f0d6f0b,417,1,79.24,1.0,2.0,2.0,122.0,Cliente Perdido
3,5f03b965e26e79a371d229cfeeb578d7,435,1,54.0,1.0,2.0,1.0,121.0,Cliente Perdido
4,14a188558af6cd5bc222ca773d395a7f,175,2,245.15,3.0,2.0,4.0,324.0,Cliente em Potencial
5,ef1c2fafea5285a4bfb61386bc5e3154,9,2,471.68,4.0,2.0,4.0,424.0,Lealdade em Potencial
6,d339ed835c9d8fd6b1e3c5cb60850bc5,565,2,524.2,1.0,2.0,4.0,124.0,Cliente Perdido
7,39d6e50625a51a618c2cf02a026231c1,384,1,66.74,1.0,2.0,2.0,122.0,Cliente Perdido
8,8d2fa65d968da66afc05f2e28250c9dc,176,1,150.91,3.0,2.0,3.0,323.0,Cliente em Potencial
9,d04921557f1cde4963c30a11dfb1719d,6,1,155.98,4.0,2.0,3.0,423.0,Lealdade em Potencial


### Modelagem de dados para DataWarehouse

Esse datawarehouse usam as colunas novas testadas. Além disso, diferente do que foi feito nos testes, nenhuma linha foi excluída

#### Cria dimensão Tempo

In [0]:
start_date = '2016-01-01'
end_date = datetime.date.today().strftime("%Y-%m-%d")

dates = pd.date_range(start=start_date, end=end_date, freq = '1H')
datetimes = [date.to_pydatetime() for date in dates]

dw_time = spark.createDataFrame(datetimes, TimestampType())
dw_time = dw_time.withColumnRenamed("value", "full_date")

dw_time.limit(20).toPandas()

Unnamed: 0,full_date
0,2016-01-01 00:00:00
1,2016-01-01 01:00:00
2,2016-01-01 02:00:00
3,2016-01-01 03:00:00
4,2016-01-01 04:00:00
5,2016-01-01 05:00:00
6,2016-01-01 06:00:00
7,2016-01-01 07:00:00
8,2016-01-01 08:00:00
9,2016-01-01 09:00:00


In [0]:
def get_month_name (x):
    if x == 1:
        return "Janeiro"
    elif x == 2:
        return "Fevereiro"
    elif x == 3:
        return "Março"
    elif x == 4:
        return "Abril"
    elif x == 5:
        return "Maio"
    elif x == 6:
        return "Junho"
    elif x == 7:
        return "Julho"
    elif x == 8:
        return "Agosto"
    elif x == 9:
        return "Setembro"
    elif x == 10:
        return "Outubro"
    elif x == 11:
        return "Novembro"
    elif x == 12:
        return "Dezembro"

get_month_name_udf = udf(get_month_name, StringType())

def get_year_month (year, month):
    month = str(month) if month >= 10 else "0" + str(month) 
    return str(year) + '-' + month

get_year_month_udf = udf(get_year_month, StringType())

def get_is_weekend (x):
    if x in [1,7]:
        return 1
    else:
        return 0
    
get_is_weekend_udf = udf(get_is_weekend, IntegerType())

def get_day_of_week (x):
    if x == 1:
        return 'Domingo'
    elif x == 2:
        return 'Segunda-Feira'
    elif x == 3:
        return 'Terça-Feira'
    elif x == 4:
        return 'Quarta-Feira'
    elif x == 5:
        return 'Quinta-feira'
    elif x == 6:
        return 'Sexta-Feira'
    elif x == 7:
        return 'Sábado'
    
get_day_of_week_udf = udf(get_day_of_week, StringType())

In [0]:
dw_time = dw_time.select( "full_date", year("full_date").alias('year'), month("full_date").alias('month'), 
                         dayofmonth("full_date").alias('day'), hour("full_date").alias("hour") ,
                         dayofweek("full_date").alias('dayofweek'))

dw_time = dw_time.withColumn("year_month", get_year_month_udf(col("year"),col("month")))
dw_time = dw_time.withColumn("is_weekend", get_is_weekend_udf(col('dayofweek')))
dw_time = dw_time.withColumn("day_of_week", get_day_of_week_udf(col('dayofweek')))
dw_time = dw_time.withColumn("month_name", get_month_name_udf(col("month")))

windowSpec  = Window.partitionBy("full_date").orderBy("full_date")
dw_time = dw_time.withColumn("IDSK1", row_number().over(windowSpec))
windowSpec  = Window.partitionBy("IDSK1").orderBy("full_date")
dw_time = dw_time.withColumn("IDSK", row_number().over(windowSpec))

dw_time = dw_time.select(["IDSK", "full_date", "year", "month", "day", "hour", "year_month", "month_name" , "is_weekend",
                          "day_of_week"])

dw_time.limit(27).toPandas()

Unnamed: 0,IDSK,full_date,year,month,day,hour,year_month,month_name,is_weekend,day_of_week
0,1,2016-01-01 00:00:00,2016,1,1,0,2016-01,Janeiro,0,Sexta-Feira
1,2,2016-01-01 01:00:00,2016,1,1,1,2016-01,Janeiro,0,Sexta-Feira
2,3,2016-01-01 02:00:00,2016,1,1,2,2016-01,Janeiro,0,Sexta-Feira
3,4,2016-01-01 03:00:00,2016,1,1,3,2016-01,Janeiro,0,Sexta-Feira
4,5,2016-01-01 04:00:00,2016,1,1,4,2016-01,Janeiro,0,Sexta-Feira
5,6,2016-01-01 05:00:00,2016,1,1,5,2016-01,Janeiro,0,Sexta-Feira
6,7,2016-01-01 06:00:00,2016,1,1,6,2016-01,Janeiro,0,Sexta-Feira
7,8,2016-01-01 07:00:00,2016,1,1,7,2016-01,Janeiro,0,Sexta-Feira
8,9,2016-01-01 08:00:00,2016,1,1,8,2016-01,Janeiro,0,Sexta-Feira
9,10,2016-01-01 09:00:00,2016,1,1,9,2016-01,Janeiro,0,Sexta-Feira


#### Carrega dados que já estão no DataWarehouse para atualização

In [0]:
path = "/FileStore/shared_uploads/luannrs@hotmail.com/dw_data/"

# CSV
try:
    dw_itens_old = spark.read.csv(path+'dw_itens.csv',header=True)
    dw_orders_old = spark.read.csv(path+'dw_orders.csv',header=True)
    dw_payments_old = spark.read.csv(path+'dw_payments.csv',header=True)
    dw_reviews_old = spark.read.option("multiLine",True).csv(path+'dw_reviews.csv',header=True)
    
    dw_customers_old = spark.read.csv(path+'dw_customers.csv',header=True)
    dw_sellers_old = spark.read.csv(path+'dw_sellers.csv',header=True)
    dw_products_old = spark.read.csv(path+'dw_products.csv',header=True)
except AnalysisException:
    dw_itens_old = spark.createDataFrame([], StructType([]))
    dw_orders_old = spark.createDataFrame([], StructType([]))
    dw_payments_old = spark.createDataFrame([], StructType([]))
    dw_reviews_old  = spark.createDataFrame([], StructType([]))

    dw_customers_old = spark.createDataFrame([], StructType([]))
    dw_sellers_old = spark.createDataFrame([], StructType([]))
    dw_products_old = spark.createDataFrame([], StructType([]))

In [0]:
print(dw_products_old.count(), len(dw_products_old.columns))
print(dw_sellers_old.count(), len(dw_sellers_old.columns))
print(dw_customers_old.count(), len(dw_customers_old.columns))

print(dw_orders_old.count(), len(dw_orders_old.columns))
print(dw_payments_old.count(), len(dw_payments_old.columns))
print(dw_reviews_old.count(), len(dw_reviews_old.columns))
print(dw_itens_old.count(), len(dw_itens_old.columns))


32953 12
3097 9
99442 9
99443 6
103886 14
99223 16
112652 17


#### Muda tipo das colunas que deveriam ser diferente de String

In [0]:
if dw_products_old.count() > 0:
    dw_products_old = dw_products_old.withColumn("IDSK", col("IDSK").cast(IntegerType()) )
    dw_products_old = dw_products_old.withColumn("product_name_lenght", col("product_name_lenght").cast(IntegerType()))
    dw_products_old = dw_products_old.withColumn("product_description_lenght", col("product_description_lenght").cast(IntegerType()))
    dw_products_old = dw_products_old.withColumn("product_photos_qty", col("product_photos_qty").cast(IntegerType()))
    dw_products_old = dw_products_old.withColumn("product_weight_g", col("product_weight_g").cast(DoubleType()))
    dw_products_old = dw_products_old.withColumn("product_length_cm", col("product_length_cm").cast(DoubleType()))
    dw_products_old = dw_products_old.withColumn("product_height_cm", col("product_height_cm").cast(DoubleType()))
    dw_products_old = dw_products_old.withColumn("product_width_cm", col("product_width_cm").cast(DoubleType()))

if dw_sellers_old.count() > 0:
    dw_sellers_old = dw_sellers_old.withColumn("IDSK", col("IDSK").cast(IntegerType()) )
    dw_sellers_old = dw_sellers_old.withColumn("geolocation_lat", col("geolocation_lat").cast(DoubleType()) )
    dw_sellers_old = dw_sellers_old.withColumn("geolocation_lng", col("geolocation_lng").cast(DoubleType()) )

if dw_customers_old.count() > 0:
    dw_customers_old = dw_customers_old.withColumn("IDSK", col("IDSK").cast(IntegerType()) )
    dw_customers_old = dw_customers_old.withColumn("geolocation_lat", col("geolocation_lat").cast(DoubleType()) )
    dw_customers_old = dw_customers_old.withColumn("geolocation_lng", col("geolocation_lng").cast(DoubleType()) )

if dw_orders_old.count() > 0:
    dw_orders_old = dw_orders_old.withColumn("IDSK", col("IDSK").cast(IntegerType()) )

if dw_payments_old.count() > 0:
    dw_payments_old = dw_payments_old.withColumn("customer_id", col("customer_id").cast(IntegerType()) )
    dw_payments_old = dw_payments_old.withColumn("order_id", col("order_id").cast(IntegerType()) )
    dw_payments_old = dw_payments_old.withColumn("payment_sequential", col("payment_sequential").cast(IntegerType()) )
    dw_payments_old = dw_payments_old.withColumn("payment_installments", col("payment_installments").cast(IntegerType()) )
    dw_payments_old = dw_payments_old.withColumn("payment_value", col("payment_value").cast(DoubleType()) )

    dw_payments_old = dw_payments_old.withColumn('order_purchase_timestamp_id', col('order_purchase_timestamp_id').cast(IntegerType())) 
    dw_payments_old = dw_payments_old.withColumn('order_approved_at_id', col('order_approved_at_id').cast(IntegerType())) 
    dw_payments_old = dw_payments_old.withColumn('order_delivered_carrier_date_id', col('order_delivered_carrier_date_id').cast(IntegerType())) 
    dw_payments_old = dw_payments_old.withColumn('order_delivered_customer_date_id', col('order_delivered_customer_date_id').cast(IntegerType())) 
    dw_payments_old = dw_payments_old.withColumn('order_estimated_delivery_date_id', col('order_estimated_delivery_date_id').cast(IntegerType()))

    dw_payments_old = dw_payments_old.withColumn('estimated_days', col('estimated_days').cast(IntegerType()))
    dw_payments_old = dw_payments_old.withColumn('arrival_days', col('arrival_days').cast(IntegerType()))
    dw_payments_old = dw_payments_old.withColumn('shipping_days', col('shipping_days').cast(IntegerType()))

if dw_reviews_old.count() > 0:
    dw_reviews_old = dw_reviews_old.withColumn("customer_id", col("customer_id").cast(IntegerType()) )
    dw_reviews_old = dw_reviews_old.withColumn("order_id", col("order_id").cast(IntegerType()) )
    dw_reviews_old = dw_reviews_old.withColumn("review_score", col("review_score").cast(IntegerType()) )
    dw_reviews_old = dw_reviews_old.withColumn("review_creation_date_id", col("review_creation_date_id").cast(IntegerType()) )
    dw_reviews_old = dw_reviews_old.withColumn("review_answer_timestamp_id" , col("review_answer_timestamp_id" ).cast(IntegerType()) )

    dw_reviews_old = dw_reviews_old.withColumn('order_purchase_timestamp_id', col('order_purchase_timestamp_id').cast(IntegerType())) 
    dw_reviews_old = dw_reviews_old.withColumn('order_approved_at_id', col('order_approved_at_id').cast(IntegerType())) 
    dw_reviews_old = dw_reviews_old.withColumn('order_delivered_carrier_date_id', col('order_delivered_carrier_date_id').cast(IntegerType())) 
    dw_reviews_old = dw_reviews_old.withColumn('order_delivered_customer_date_id', col('order_delivered_customer_date_id').cast(IntegerType())) 
    dw_reviews_old = dw_reviews_old.withColumn('order_estimated_delivery_date_id', col('order_estimated_delivery_date_id').cast(IntegerType()))

    dw_reviews_old = dw_reviews_old.withColumn('estimated_days', col('estimated_days').cast(IntegerType()))
    dw_reviews_old = dw_reviews_old.withColumn('arrival_days', col('arrival_days').cast(IntegerType()))
    dw_reviews_old = dw_reviews_old.withColumn('shipping_days', col('shipping_days').cast(IntegerType()))

if dw_itens_old.count() > 0:
    dw_itens_old = dw_itens_old.withColumn("order_item_id", col("order_item_id").cast(IntegerType()) )
    dw_itens_old = dw_itens_old.withColumn("customer_id", col("customer_id").cast(IntegerType()) )
    dw_itens_old = dw_itens_old.withColumn("order_id", col("order_id").cast(IntegerType()) )
    dw_itens_old = dw_itens_old.withColumn("seller_id", col("order_id").cast(IntegerType()) )
    dw_itens_old = dw_itens_old.withColumn("product_id", col("order_id").cast(IntegerType()) )
    
    dw_itens_old = dw_itens_old.withColumn('order_purchase_timestamp_id', col('order_purchase_timestamp_id').cast(IntegerType())) 
    dw_itens_old = dw_itens_old.withColumn('order_approved_at_id', col('order_approved_at_id').cast(IntegerType())) 
    dw_itens_old = dw_itens_old.withColumn('order_delivered_carrier_date_id', col('order_delivered_carrier_date_id').cast(IntegerType())) 
    dw_itens_old = dw_itens_old.withColumn('order_delivered_customer_date_id', col('order_delivered_customer_date_id').cast(IntegerType())) 
    dw_itens_old = dw_itens_old.withColumn('order_estimated_delivery_date_id', col('order_estimated_delivery_date_id').cast(IntegerType()))
    dw_itens_old = dw_itens_old.withColumn('shipping_limit_date_id', col('shipping_limit_date_id').cast(IntegerType()))

    dw_itens_old = dw_itens_old.withColumn('estimated_days', col('estimated_days').cast(IntegerType()))
    dw_itens_old = dw_itens_old.withColumn('arrival_days', col('arrival_days').cast(IntegerType()))
    dw_itens_old = dw_itens_old.withColumn('shipping_days', col('shipping_days').cast(IntegerType()))

#### Cria funções auxiliares para criação do DataWarehouse

In [0]:
def verify_new_str_value (idsk, value, new_value):
    value = str(value)
    new_value = str(new_value)

    if not idsk:
        return new_value
    else:
        return value

def verify_new_double_value (idsk, value, new_value):
    try:
        value = float(value)
    except:
        value = None
    
    try:
        new_value = float(new_value)
    except:
        new_value = None

    if not idsk:
        return new_value
    else:
        return value
    
def verify_new_int_value (idsk, value, new_value):
    try:
        value = int(value)
    except:
        value = None
    
    try:
        new_value = int(new_value)
    except:
        new_value = None

    if not idsk:
        return new_value
    else:
        return value

verify_new_value_str_udf = udf(verify_new_str_value, StringType())
verify_new_value_int_udf = udf(verify_new_int_value, IntegerType())
verify_new_value_double_udf = udf(verify_new_double_value, DoubleType())

def verify_INICIO (inicio):
    if not inicio:
        return datetime.date.today().strftime("%Y-%m-%d")
    else:
        return inicio
    
verify_INICIO_udf = udf(verify_INICIO, StringType())    


In [0]:


def verify_new_product_FIM (fim, idsk, product_category, product_category_new, product_name_lenght, product_name_lenght_new,
                            product_description_lenght, product_description_lenght_new, product_photos_qty, product_photos_qty_new,
                            product_weight_g, product_weight_g_new, product_length_cm, product_length_cm_new, product_height_cm,
                            product_height_cm_new, product_width_cm, product_width_cm_new):
    
    product_category = product_category if product_category and product_category != 'None' else None
    product_category_new = product_category_new if product_category_new and product_category_new != 'None' else None

    product_name_lenght = int(product_name_lenght) if product_name_lenght else None
    product_name_lenght_new = int(product_name_lenght_new) if product_name_lenght_new else None
    product_description_lenght = int(product_description_lenght) if product_description_lenght else None
    product_description_lenght_new = int(product_description_lenght_new) if product_description_lenght_new else None
    product_photos_qty = int(product_photos_qty) if product_photos_qty else None
    product_photos_qty_new = int(product_photos_qty_new) if product_photos_qty_new else None

    product_weight_g = float(product_weight_g) if product_weight_g else None
    product_weight_g_new = float(product_weight_g_new) if product_weight_g_new else None
    product_length_cm= float(product_length_cm) if product_length_cm else None
    product_length_cm_new = float(product_length_cm_new) if product_length_cm_new else None
    product_height_cm = float(product_height_cm) if product_height_cm else None
    product_height_cm_new = float(product_height_cm_new) if product_height_cm_new else None
    product_width_cm = float(product_width_cm) if product_width_cm else None
    product_width_cm_new = float(product_width_cm_new) if product_width_cm_new else None
    
    if ( product_category != product_category_new or product_name_lenght != product_name_lenght_new or product_description_lenght != product_description_lenght_new or product_photos_qty != product_photos_qty_new or product_weight_g != product_weight_g_new or product_length_cm != product_length_cm_new or product_height_cm != product_height_cm_new or product_width_cm != product_width_cm_new) and idsk:
        return datetime.date.today().strftime("%Y-%m-%d")
    else:
        return None


verify_new_product_FIM_udf = udf(verify_new_product_FIM, StringType())
    


#### Cria dimensão Produto com a característica de "slowly changing dimensions"

In [0]:
dw_products = products_df.select(products_df.columns)

dw_products = dw_products.withColumn("product_category", reduce_categories_udf(col("product_category_name")))
dw_products = dw_products.select(['product_id', "product_category", 'product_name_lenght', 'product_description_lenght', 
                                  'product_photos_qty', 'product_weight_g', 'product_length_cm', 'product_height_cm', 
                                  'product_width_cm']) 

if (dw_products_old.count() > 0):
    cols = dw_products_old.columns
    
    dw_products = dw_products.withColumn("product_name_lenght", col("product_name_lenght").cast(IntegerType()))
    dw_products = dw_products.withColumn("product_description_lenght", col("product_description_lenght").cast(IntegerType()))
    dw_products = dw_products.withColumn("product_photos_qty", col("product_photos_qty").cast(IntegerType()))
    dw_products = dw_products.withColumn("product_weight_g", col("product_weight_g").cast(DoubleType()))
    dw_products = dw_products.withColumn("product_length_cm", col("product_length_cm").cast(DoubleType()))
    dw_products = dw_products.withColumn("product_height_cm", col("product_height_cm").cast(DoubleType()))
    dw_products = dw_products.withColumn("product_width_cm", col("product_width_cm").cast(DoubleType()))

    dw_products_new = dw_products.select(dw_products.columns)
    dw_products = dw_products_old.select(dw_products_old.columns)

    dw_products_new = dw_products_new.withColumn("INICIO", lit(datetime.date.today().strftime("%Y-%m-%d")).cast(StringType()))
    dw_products_new = dw_products_new.withColumn("FIM",lit(None).cast(StringType()) )

    dw_products.write.mode('overwrite').saveAsTable("dw_products")
    dw_products_new.createOrReplaceTempView("dw_products_new")

    dw_products = spark.sql("""insert into dw_products (product_id,product_category,product_name_lenght,product_description_lenght,
                            product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm, INICIO, FIM ,IDSK)
                            select pn.product_id, pn.product_category, pn.product_name_lenght, pn.product_description_lenght,
                            pn.product_photos_qty, pn.product_weight_g, pn.product_length_cm, pn.product_height_cm, pn.product_width_cm, pn.INICIO, pn.FIM ,
                            ( (select max((cast(IDSK as INT))) from dw_products) + ROW_NUMBER() OVER (
                                ORDER BY pn.product_id
                            ) ) as IDSK
                            from dw_products_new pn
                            inner join dw_products p on p.product_id = pn.product_id and p.FIM is null
                            where
                            pn.product_category <> p.product_category
                            or pn.product_name_lenght <> p.product_name_lenght
                            or pn.product_description_lenght <> p.product_description_lenght
                            or pn.product_photos_qty <> p.product_photos_qty
                            or pn.product_weight_g <> p.product_weight_g
                            or pn.product_length_cm <> p.product_length_cm
                            or pn.product_height_cm <> p.product_height_cm
                            or pn.product_width_cm <>  p.product_width_cm               
                            """)
    
    dw_products = spark.sql("""insert into dw_products (product_id,product_category,product_name_lenght,product_description_lenght,
                            product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm, INICIO, FIM ,IDSK)
                            select pn.product_id, pn.product_category, pn.product_name_lenght, pn.product_description_lenght,
                            pn.product_photos_qty, pn.product_weight_g, pn.product_length_cm, pn.product_height_cm, pn.product_width_cm, pn.INICIO, pn.FIM ,
                            ( (select max((cast(IDSK as INT))) from dw_products) + ROW_NUMBER() OVER (
                                ORDER BY pn.product_id
                            ) ) as IDSK
                            from dw_products_new pn
                            where pn.product_id  not in (select product_id from dw_products)                  
                            """)

    dw_products = sqlContext.table("dw_products")

    dw_products_new = dw_products_new.select(col('product_id'),
                                     col('product_category').alias('product_category_new'),
                                     col('product_name_lenght').alias('product_name_lenght_new'),
                                     col('product_description_lenght').alias('product_description_lenght_new'),
                                     col('product_photos_qty').alias('product_photos_qty_new'),
                                     col('product_weight_g').alias('product_weight_g_new'),
                                     col('product_length_cm').alias('product_length_cm_new'),
                                     col('product_height_cm').alias('product_height_cm_new'),
                                     col('product_width_cm').alias('product_width_cm_new'))
    
    
    dw_products = dw_products_new.join(dw_products, ['product_id'], how = 'right')

    dw_products = dw_products.withColumn( "INICIO", verify_INICIO_udf(col("INICIO")) )
    dw_products = dw_products.withColumn( "FIM", verify_new_product_FIM_udf(  col("FIM"), col("IDSK"),
                                                                            col("product_category"), col("product_category_new"), 
                                                                            col("product_name_lenght"), col("product_name_lenght_new"),
                                                                            col("product_description_lenght"), col("product_description_lenght_new"), 
                                                                            col("product_photos_qty"), col("product_photos_qty_new"),
                                                                            col("product_weight_g"), col("product_weight_g_new"), 
                                                                            col("product_length_cm"), col("product_length_cm_new"), 
                                                                            col("product_height_cm"), col("product_height_cm_new"), 
                                                                            col("product_width_cm"), col("product_width_cm_new") ) )

    dw_products = dw_products.withColumn("product_category", verify_new_value_str_udf(col("IDSK"), col("product_category"), col("product_category_new")) )
    dw_products = dw_products.withColumn("product_name_lenght", verify_new_value_int_udf(col("IDSK"), col("product_name_lenght"), col("product_name_lenght_new")) )
    dw_products = dw_products.withColumn("product_description_lenght", verify_new_value_int_udf(col("IDSK"), col("product_description_lenght"), col("product_description_lenght_new")) )
    dw_products = dw_products.withColumn("product_photos_qty", verify_new_value_int_udf(col("IDSK"), col("product_photos_qty"), col("product_photos_qty_new")) )
    dw_products = dw_products.withColumn("product_weight_g", verify_new_value_double_udf(col("IDSK"), col("product_weight_g"), col("product_weight_g_new")) )
    dw_products = dw_products.withColumn("product_length_cm", verify_new_value_double_udf(col("IDSK"), col("product_length_cm"), col("product_length_cm_new")) )
    dw_products = dw_products.withColumn("product_height_cm", verify_new_value_double_udf(col("IDSK"), col("product_height_cm"), col("product_height_cm_new")) )
    dw_products = dw_products.withColumn("product_width_cm", verify_new_value_double_udf(col("IDSK"), col("product_width_cm"), col("product_width_cm_new")) )
    
    dw_products = dw_products.select(cols)
else:
    dw_products = dw_products.withColumn("INICIO", lit(datetime.date(2016,1,1)).cast("timestamp"))
    dw_products = dw_products.withColumn("FIM",lit(None).cast(StringType()) )

    windowSpec  = Window.orderBy("product_id")
    dw_products = dw_products.withColumn("IDSK", row_number().over(windowSpec))

cols = dw_products.columns
dw_products = dw_products.select(cols)  

dw_products.limit(10).toPandas()

Unnamed: 0,product_id,product_category,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,INICIO,FIM,IDSK
0,00066f42aeeb9f3007548bb9d3f33c38,Produtos de Beleza e Higiene,53,596,6,300.0,20.0,16.0,16.0,2016-01-01T00:00:00.000Z,,1
1,00088930e925c41fd95ebfe695fd2655,Eletrônicos,56,752,4,1225.0,55.0,10.0,26.0,2016-01-01T00:00:00.000Z,,2
2,0009406fd7479715e4bef61dd91f2462,Mobília,50,266,2,300.0,45.0,15.0,35.0,2016-01-01T00:00:00.000Z,,3
3,000b8f95fcb9e0096488278317764d19,Acessórios Doméstico,25,364,3,550.0,19.0,24.0,12.0,2016-01-01T00:00:00.000Z,,4
4,000d9be29b5207b54e86aa1b1ac54872,Eletrônicos,48,613,4,250.0,22.0,11.0,15.0,2016-01-01T00:00:00.000Z,,5
5,0011c512eb256aa0dbbb544d8dffcf6e,Eletrônicos,58,177,1,100.0,16.0,15.0,16.0,2016-01-01T00:00:00.000Z,,6
6,00126f27c813603687e6ce486d909d01,Fashion,42,2461,1,700.0,25.0,5.0,15.0,2016-01-01T00:00:00.000Z,,7
7,001795ec6f1b187d37335e1c4704762e,Eletrônicos,53,274,1,600.0,30.0,20.0,20.0,2016-01-01T00:00:00.000Z,,8
8,001b237c0e9bb435f2e54071129237e9,Mobília,42,253,1,6000.0,40.0,4.0,30.0,2016-01-01T00:00:00.000Z,,9
9,001b72dfd63e9833e8c02742adf472e3,Mobília,45,520,3,600.0,26.0,8.0,22.0,2016-01-01T00:00:00.000Z,,10


In [0]:
print((dw_products.count(), len(dw_products.columns)))

(32954, 12)


In [0]:
def verify_new_seller_FIM (fim, idsk, seller_city, seller_city_new, seller_state, seller_state_new, seller_zip_code_prefix, seller_zip_code_prefix_new):

    seller_city = seller_city if seller_city and seller_city != 'None' else None
    seller_city_new = seller_city_new if seller_city_new and seller_city_new != 'None' else None

    seller_state = seller_state if seller_state and seller_state != 'None' else None
    seller_state_new = seller_state_new if seller_state_new and seller_state_new != 'None' else None

    seller_zip_code_prefix = seller_zip_code_prefix if seller_zip_code_prefix and seller_zip_code_prefix != 'None' else None
    seller_zip_code_prefix_new = seller_zip_code_prefix_new if seller_zip_code_prefix_new and seller_zip_code_prefix_new != 'None' else None

    if (seller_city != seller_city_new or seller_state != seller_state_new or seller_zip_code_prefix != seller_zip_code_prefix_new) and idsk:
        return datetime.date.today().strftime("%Y-%m-%d")
    else:
        return None


verify_new_seller_FIM_udf = udf(verify_new_seller_FIM, StringType())

Faz a média entre latitude e longitude para cada código postal. Esses dados serão usados na dimensão Vendedor e Cliente

In [0]:
geolocations_df = geolocations_df.groupby('geolocation_zip_code_prefix').agg(avg('geolocation_lat').alias('geolocation_lat'),
                                                                             avg('geolocation_lng').alias('geolocation_lng'))
geolocations_df.limit(10).toPandas()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng
0,2053,-23.513228,-46.602793
1,2943,-23.476741,-46.724994
2,3442,-23.543788,-46.5399
3,3511,-23.534921,-46.527164
4,3904,-23.578208,-46.518503
5,4319,-23.647404,-46.63657
6,4438,-23.67657,-46.669897
7,5176,-23.477063,-46.711912
8,5163,-23.481597,-46.764806
9,5422,-23.566922,-46.687446


Função para lidar com códigos postais com zero a esquerda

In [0]:
def convert_zip_code (x):
    if x:
        return str(int(float(x))).rjust(5,'0')
    else:
        return x

convert_zip_code_udf = udf(convert_zip_code, StringType())

def verify_string_number (x):
    if x:
        if x.isnumeric():
            return 'STRINGNUMBER' + x
        else:
            return x
    else:
        return x

verify_string_number_udf = udf(verify_string_number, StringType())

def remove_STRINGNUMBER (x):
    if x:
        if 'STRINGNUMBER' in x:
            return x.replace('STRINGNUMBER','')
        else:
            return x
    else:
        return x

remove_STRINGNUMBER_udf = udf(remove_STRINGNUMBER, StringType())

#### Cria dimensão Vendedor com a característica de "slowly changing dimensions"

In [0]:
dw_sellers = sellers_df.select(sellers_df.columns)
geolocations_df = geolocations_df.withColumnRenamed("geolocation_zip_code_prefix", "seller_zip_code_prefix")

dw_sellers = dw_sellers.join(geolocations_df, ["seller_zip_code_prefix"], how='left')
dw_sellers = dw_sellers.select(["seller_id","seller_zip_code_prefix","seller_city","seller_state", "geolocation_lat","geolocation_lng"])
print((dw_sellers.count(), len(dw_sellers.columns)))
if dw_sellers_old.count() > 0:
    cols = dw_sellers_old.columns
    
    dw_sellers = dw_sellers.withColumn("geolocation_lat", col("geolocation_lat").cast(DoubleType()) )
    dw_sellers = dw_sellers.withColumn("geolocation_lng", col("geolocation_lng").cast(DoubleType()) )

    dw_sellers_new = dw_sellers.select(dw_sellers.columns)

    dw_sellers = dw_sellers_old.select(dw_sellers_old.columns)

    dw_sellers_new = dw_sellers_new.withColumn("INICIO", lit(datetime.date.today().strftime("%Y-%m-%d")).cast(StringType()))
    dw_sellers_new = dw_sellers_new.withColumn("FIM",lit(None).cast(StringType()) )

    dw_sellers = dw_sellers.withColumn("seller_city",verify_string_number_udf(col("seller_city")) )
    dw_sellers = dw_sellers.withColumn("seller_state",verify_string_number_udf(col("seller_state")) )
    dw_sellers_new = dw_sellers_new.withColumn("seller_city",verify_string_number_udf(col("seller_city")) )
    dw_sellers_new = dw_sellers_new.withColumn("seller_state",verify_string_number_udf(col("seller_state")) )

    dw_sellers.write.mode("overwrite").saveAsTable("dw_sellers")
    dw_sellers_new.createOrReplaceTempView("dw_sellers_new")

    dw_sellers = spark.sql("""insert into dw_sellers (seller_id, seller_city, seller_state, seller_zip_code_prefix, geolocation_lat,
                           geolocation_lng, INICIO, FIM ,IDSK)
                            select sn.seller_id, sn.seller_city, sn.seller_state, sn.seller_zip_code_prefix, sn.geolocation_lat, sn.geolocation_lng, sn.INICIO, sn.FIM , 
                            ( (select max((cast(IDSK as INT))) from dw_sellers) + ROW_NUMBER() OVER (
                                ORDER BY sn.seller_id
                            ) ) as IDSK
                            from dw_sellers_new sn
                            inner join dw_sellers s on s.seller_id = sn.seller_id and s.FIM is null
                            where
                            sn.seller_city <> s.seller_city
                            or sn.seller_state <> s.seller_state
                            or sn.seller_zip_code_prefix <> lpad( cast( cast( s.seller_zip_code_prefix as INT) as STRING), 5, '0');              
                            """)
    
    dw_sellers = spark.sql("""insert into dw_sellers (seller_id, seller_city, seller_state, seller_zip_code_prefix, geolocation_lat,
                           geolocation_lng, INICIO, FIM ,IDSK)
                            select sn.seller_id, sn.seller_city, sn.seller_state, sn.seller_zip_code_prefix, sn.geolocation_lat, sn.geolocation_lng, sn.INICIO, sn.FIM ,
                            ( (select max((cast(IDSK as INT))) from dw_sellers) + ROW_NUMBER() OVER (
                                ORDER BY sn.seller_id
                            ) ) as IDSK
                            from dw_sellers_new sn
                            where sn.seller_id  not in (select seller_id from dw_sellers)                    
                           """)
    
    dw_sellers = sqlContext.table("dw_sellers")

    dw_sellers = dw_sellers.withColumn("seller_city", remove_STRINGNUMBER_udf(col("seller_city")))
    dw_sellers = dw_sellers.withColumn("seller_state", remove_STRINGNUMBER_udf(col("seller_state")))
    dw_sellers = dw_sellers.withColumn("seller_zip_code_prefix", convert_zip_code_udf(col("seller_zip_code_prefix")))

    dw_sellers_new = dw_sellers_new.withColumn("seller_city", remove_STRINGNUMBER_udf(col("seller_city")))
    dw_sellers_new = dw_sellers_new.withColumn("seller_state", remove_STRINGNUMBER_udf(col("seller_state")))
    dw_sellers_new = dw_sellers_new.withColumn("seller_zip_code_prefix", convert_zip_code_udf(col("seller_zip_code_prefix")))

    dw_sellers_new = dw_sellers_new.select(col('seller_id'),
                                     col('seller_zip_code_prefix').alias('seller_zip_code_prefix_new'),
                                     col('seller_city').alias('seller_city_new'),
                                     col('seller_state').alias('seller_state_new'),
                                     col('geolocation_lat').alias('geolocation_lat_new'),
                                     col('geolocation_lng').alias('geolocation_lng_new'))

    dw_sellers = dw_sellers_new.join(dw_sellers, ['seller_id'], how = 'right')
    print((dw_sellers.count(), len(dw_sellers.columns)))

    dw_sellers = dw_sellers.withColumn( "INICIO", verify_INICIO_udf(col("INICIO")) )
    dw_sellers = dw_sellers.withColumn( "FIM", verify_new_seller_FIM_udf(  col("FIM"), col("IDSK"),
                                                                            col("seller_zip_code_prefix"), col("seller_zip_code_prefix_new"), 
                                                                            col("seller_city"), col("seller_city_new"),
                                                                            col("seller_state"), col("seller_state_new") ) )

    dw_sellers = dw_sellers.withColumn("seller_zip_code_prefix", verify_new_value_str_udf(col("IDSK"), col("seller_zip_code_prefix"), col("seller_zip_code_prefix_new")) )
    dw_sellers = dw_sellers.withColumn("seller_city", verify_new_value_str_udf(col("IDSK"), col("seller_city"), col("seller_city_new")) )
    dw_sellers = dw_sellers.withColumn("seller_state", verify_new_value_str_udf(col("IDSK"), col("seller_state"), col("seller_state_new")) )
    dw_sellers = dw_sellers.withColumn("geolocation_lat", verify_new_value_double_udf(col("IDSK"), col("geolocation_lat"), col("geolocation_lat_new")) )
    dw_sellers = dw_sellers.withColumn("geolocation_lng", verify_new_value_double_udf(col("IDSK"), col("geolocation_lng"), col("geolocation_lng_new")) )
    
    dw_sellers = dw_sellers.select(cols)

    print((dw_sellers.count(), len(dw_sellers.columns)))
else:
    dw_sellers = dw_sellers.withColumn("INICIO", lit(datetime.date(2016,1,1)).cast("timestamp"))
    dw_sellers = dw_sellers.withColumn("FIM",lit(None).cast(StringType()) )

    windowSpec  = Window.orderBy("seller_id")
    dw_sellers = dw_sellers.withColumn("IDSK", row_number().over(windowSpec))

cols = dw_sellers.columns
dw_sellers = dw_sellers.select(cols)

dw_sellers.limit(10).toPandas()

(3096, 6)
(3098, 14)
(3098, 9)


Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state,geolocation_lat,geolocation_lng,INICIO,FIM,IDSK
0,062ce95fa2ad4dfaedfc79260130565f,95913,lajeado,RS,-29.44658,-51.961202,2016-01-01T00:00:00.000Z,,76
1,0b64bcdb0784abc139af04077d49a20e,92420,canoas,RS,-29.88533,-51.178237,2016-01-01T00:00:00.000Z,,147
2,0ea22c1cfbdc755f86b9b54b39c16043,35700,sete lagoas,MG,-19.457995,-44.248128,2016-01-01T00:00:00.000Z,,174
3,2009a095de2a2a41626f6c6d7722678d,15025,sao jose do rio preto,SP,-20.806707,-49.389165,2016-01-01T00:00:00.000Z,,380
4,297d5eccd19fa9a83b2630071ff105e4,80710,curitiba,PR,-25.428323,-49.299818,2016-01-01T00:00:00.000Z,,488
5,4d600e08ecbe08258c79e536c5a42fee,85988,entre rios do oeste,PR,-24.702966,-54.237448,2016-01-01T00:00:00.000Z,,923
6,6eeed17989b0ae47c9f11ece6f38ea90,4123,sao paulo,SP,-23.610422,-46.626594,2016-01-01T00:00:00.000Z,,1357
7,791cfcfe22fe4a771ece27f90017da92,14010,ribeirao preto,SP,-21.17954,-47.808598,2016-01-01T00:00:00.000Z,,1490
8,8e6cc767478edae941d9bd9eb778d77a,38442,araguari,MG,-18.645099,-48.204993,2016-01-01T00:00:00.000Z,,1747
9,9803a40e82e45418ab7fb84091af5231,75901,rio verde,GO,-17.804501,-50.916495,2016-01-01T00:00:00.000Z,,1856


In [0]:
print((dw_sellers.count(), len(dw_sellers.columns)))

(3098, 9)


#### Cria dimensão Cliente

In [0]:
dw_customers = customers_df.select(customers_df.columns)
geolocations_df = geolocations_df.withColumnRenamed("seller_zip_code_prefix", "customer_zip_code_prefix")

dw_customers = dw_customers.join(geolocations_df, ["customer_zip_code_prefix"], how='left')
dw_customers = dw_customers.join(rfm_table, ["customer_unique_id"], how='left')

dw_customers = dw_customers.select(['customer_id', 'customer_unique_id', 'customer_zip_code_prefix', 'customer_city', 
                                    'customer_state', 'geolocation_lat', 'geolocation_lng', 'customer_segmantation'])

if dw_customers_old.count() > 0:
    cols = dw_customers_old.columns
    dw_customers_new = dw_customers.select(dw_customers.columns)
    dw_customers = dw_customers.select(col('customer_id'),
                                       col('customer_unique_id').alias('customer_unique_id_new'),
                                       col('customer_zip_code_prefix').alias('customer_zip_code_prefix_new'),
                                       col('customer_city').alias('customer_city_new'),
                                       col('customer_state').alias('customer_state_new'),
                                       col('geolocation_lat').alias('geolocation_lat_new'),
                                       col('geolocation_lng').alias('geolocation_lng_new'),
                                       col('customer_segmantation').alias('customer_segmantation_new'))
    
    dw_customers = dw_customers.join(dw_customers_old, ['customer_id'], how = 'right')
    
    dw_customers = dw_customers.withColumn("IDSK", col("IDSK").cast(IntegerType()) )

    dw_customers = dw_customers.withColumn("customer_unique_id", verify_new_value_str_udf(col("IDSK"), col("customer_unique_id"), col("customer_unique_id_new")) )
    dw_customers = dw_customers.withColumn("customer_zip_code_prefix", verify_new_value_str_udf(col("IDSK"), col("customer_zip_code_prefix"), col("customer_zip_code_prefix_new")) )
    dw_customers = dw_customers.withColumn("customer_city", verify_new_value_str_udf(col("IDSK"), col("customer_city"), col("customer_city_new")) )
    dw_customers = dw_customers.withColumn("customer_state", verify_new_value_str_udf(col("IDSK"), col("customer_state"), col("customer_state_new")) )
    dw_customers = dw_customers.withColumn("geolocation_lat", verify_new_value_double_udf(col("IDSK"), col("geolocation_lat"), col("geolocation_lat_new")) )
    dw_customers = dw_customers.withColumn("geolocation_lng", verify_new_value_double_udf(col("IDSK"), col("geolocation_lng"), col("geolocation_lng_new")) )
    dw_customers = dw_customers.withColumn("customer_segmantation", verify_new_value_str_udf(col("IDSK"), col("customer_segmantation"), col("customer_segmantation_new")) )

    dw_customers = dw_customers.select(cols)

    dw_customers = dw_customers.withColumn("customer_city",verify_string_number_udf(col("customer_city")) )
    dw_customers = dw_customers.withColumn("customer_state",verify_string_number_udf(col("customer_state")) )
    dw_customers_new = dw_customers_new.withColumn("customer_city",verify_string_number_udf(col("customer_city")) )
    dw_customers_new = dw_customers_new.withColumn("customer_state",verify_string_number_udf(col("customer_state")) )

    dw_customers_new = dw_customers_new.withColumn("geolocation_lat", col("geolocation_lat").cast(DoubleType()) )
    dw_customers_new = dw_customers_new.withColumn("geolocation_lng", col("geolocation_lng").cast(DoubleType()) )

    dw_customers.write.mode("overwrite").saveAsTable("dw_customers")
    dw_customers_new.createOrReplaceTempView("dw_customers_new")

    dw_customers = spark.sql("""insert into dw_customers (customer_id, customer_unique_id, customer_zip_code_prefix, customer_city,
                            customer_state, geolocation_lat, geolocation_lng, customer_segmantation , IDSK)
                            select cn.customer_id, cn.customer_unique_id, cn.customer_zip_code_prefix, cn.customer_city,
                            cn.customer_state, cn.geolocation_lat, cn.geolocation_lng, cn.customer_segmantation ,
                            ( (select max((cast(IDSK as INT))) from dw_customers) + ROW_NUMBER() OVER (
                                ORDER BY cn.customer_id
                            ) ) as IDSK
                            from dw_customers_new cn
                            where cn.customer_id  not in (select customer_id from dw_customers)                  
                            """)

    dw_customers = sqlContext.table("dw_customers")

    dw_customers = dw_customers.withColumn("customer_city", remove_STRINGNUMBER_udf(col("customer_city")))
    dw_customers = dw_customers.withColumn("customer_state", remove_STRINGNUMBER_udf(col("customer_state")))
    dw_customers = dw_customers.withColumn("customer_zip_code_prefix", convert_zip_code_udf(col("customer_zip_code_prefix")))
else:
    windowSpec  = Window.orderBy("customer_id")
    dw_customers = dw_customers.withColumn("IDSK", row_number().over(windowSpec))

cols = dw_customers.columns
dw_customers = dw_customers.select(cols)

dw_customers.limit(10).toPandas()

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,geolocation_lat,geolocation_lng,customer_segmantation,IDSK
0,d8f3e586c5a006d51f26fa3de177ef19,a3d0e18c2fe7a415099f73e5dd10b34b,95020,caxias do sul,RS,-29.166314,-51.178314,Cliente em Potencial,84525
1,d927892e658a6af918b1843a55dc8ccb,507c526dc04e88f65b3348123ad2b6eb,7776,cajamar,SP,-23.340395,-46.844019,Cliente Perdido,84587
2,d962bd2ab8e3cbb417a6d31fcfe313a9,a5840b8c3fc4c197e65b8016cd02d688,26210,nova iguacu,RJ,-22.755877,-43.450113,Cliente Perdido,84670
3,d9b2ad06c6bc33624b15fd1aad224cc0,23f47429f7afd480f23d926525e91762,30170,belo horizonte,MG,-19.927104,-43.944964,Cliente Perdido,84795
4,d9bc09228a6d0e4f885c56ddb7e5a95d,2ae2f7c75a0d113879759e55a02a0d2a,38400,uberlandia,MG,-18.913248,-48.278212,Cliente Perdido,84810
5,d9f57c5a009cd22a41f3483ee6c71674,fa456d6a84c63b41098ceb060829c8c3,11660,caraguatatuba,SP,-23.622809,-45.414136,Lealdade em Potencial,84893
6,da0b203dc40242d27e707a12c147592e,8c532c86e424b28e85997dd891e26d5d,23068,rio de janeiro,RJ,-22.886459,-43.59763,Cliente Perdido,84921
7,da0f1d8609882eb8e79a55c7ca567834,7db0eba759379d3a68e2922bb970c48c,24230,niteroi,RJ,-22.878637,-43.086096,Lealdade em Potencial,84931
8,da14d636fdbf0cf7f7c9da5967eb7f0f,5b3b19201bb81179198a9ce0d26ae56f,28991,saquarema,RJ,-22.912694,-42.476505,Lealdade em Potencial,84943
9,da8254447ce1ed8a5c494ee4a8929380,73bba09416f450b3dd0f5d83eb69d3a6,38120,conceicao das alagoas,MG,-19.92124,-48.375494,Cliente Perdido,85107


In [0]:
print((dw_customers.count(), len(dw_customers.columns)))

(99442, 9)


In [0]:
def datediff_cond (x,y):
    if not x or not y:
        return None
    else:
        date1 = datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S") if type(x) == str else x
        date2 = datetime.datetime.strptime(y, "%Y-%m-%d %H:%M:%S") if type(y) == str else y
        delta = date1 - date2
        return delta.days
    
datediff_cond_udf = udf(datediff_cond, IntegerType())

#### Cria dimensão de Pedidos

In [0]:
dw_orders = orders_df.select(orders_df.columns)

dw_orders = dw_orders.withColumn('estimated_days', datediff_cond_udf(col('order_estimated_delivery_date'),col('order_purchase_timestamp'))) 
dw_orders = dw_orders.withColumn('arrival_days', datediff_cond_udf(col('order_delivered_customer_date'),col('order_purchase_timestamp'))) 
dw_orders = dw_orders.withColumn('shipping_days', datediff_cond_udf(col('order_delivered_customer_date'),col('order_delivered_carrier_date'))) 
dw_orders = dw_orders.withColumn('arrival_status',datediff_cond_udf(col('order_estimated_delivery_date'),col('order_delivered_customer_date'))) 

dw_orders = dw_orders.withColumn('arrival_status',get_arrival_status_udf(col('arrival_status'))) 

dw_orders = dw_orders.withColumn('estimated_delivery_rate', get_duration_status_udf(col('estimated_days')))
dw_orders = dw_orders.withColumn('arrival_delivery_rate', get_duration_status_udf(col('arrival_days'))) 
dw_orders = dw_orders.withColumn('shipping_delivery_rate', get_duration_status_udf(col('shipping_days')))


dw_orders.limit(20).toPandas()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,estimated_days,arrival_days,shipping_days,arrival_status,estimated_delivery_rate,arrival_delivery_rate,shipping_delivery_rate
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,15,8.0,6.0,Em tempo,Rápido,Rápido,Muito Rápido
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,19,13.0,12.0,Em tempo,Duração OK,Rápido,Rápido
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00,26,9.0,9.0,Em tempo,Devagar,Rápido,Rápido
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00,26,13.0,9.0,Em tempo,Devagar,Rápido,Rápido
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00,12,2.0,1.0,Em tempo,Rápido,Muito Rápido,Muito Rápido
5,a4591c265e18cb1dcee52889e2d8acc3,503740e9ca751ccdda7ba28e9ab8f608,delivered,2017-07-09 21:57:05,2017-07-09 22:10:13,2017-07-11 14:58:04,2017-07-26 10:57:55,2017-08-01 00:00:00,22,16.0,14.0,Em tempo,Duração OK,Duração OK,Rápido
6,136cce7faa42fdb2cefd53fdc79a6098,ed0271e0b7da060a393796590e7b737a,invoiced,2017-04-11 12:22:08,2017-04-13 13:25:17,,,2017-05-09 00:00:00,27,,,,Devagar,,
7,6514b8ad8028c9f2cc2374ded245783f,9bdf08b4b3b52b5526ff42d37d47f222,delivered,2017-05-16 13:10:30,2017-05-16 13:22:11,2017-05-22 10:07:46,2017-05-26 12:55:51,2017-06-07 00:00:00,21,9.0,4.0,Em tempo,Duração OK,Rápido,Muito Rápido
8,76c6e866289321a7c93b82b54852dc33,f54a9f0e6b351c431402b8461ea51999,delivered,2017-01-23 18:29:09,2017-01-25 02:50:47,2017-01-26 14:16:31,2017-02-02 14:08:10,2017-03-06 00:00:00,41,9.0,6.0,Em tempo,Devagar,Rápido,Muito Rápido
9,e69bfb5eb88e0ed6a785585b27e16dbf,31ad1d1b63eb9962463f764d4e6e0c9d,delivered,2017-07-29 11:55:02,2017-07-29 12:05:32,2017-08-10 19:45:24,2017-08-16 17:14:30,2017-08-23 00:00:00,24,18.0,5.0,Em tempo,Duração OK,Duração OK,Muito Rápido


In [0]:
dw_orders = dw_orders.withColumn('order_purchase_timestamp', to_timestamp(date_format(col("order_purchase_timestamp"), "yyyy-MM-dd HH") ))
dw_orders = dw_orders.withColumn('order_approved_at', to_timestamp(date_format(col('order_approved_at'), "yyyy-MM-dd HH") ))
dw_orders = dw_orders.withColumn('order_delivered_carrier_date', to_timestamp(date_format(col('order_delivered_carrier_date'), "yyyy-MM-dd HH") ))
dw_orders = dw_orders.withColumn('order_delivered_customer_date', to_timestamp(date_format(col('order_delivered_customer_date'), "yyyy-MM-dd HH") ))
dw_orders = dw_orders.withColumn('order_estimated_delivery_date', to_timestamp(date_format(col('order_estimated_delivery_date'), "yyyy-MM-dd HH") ))


dw_orders = dw_orders.join(dw_time, dw_orders.order_purchase_timestamp == dw_time.full_date , how='left')
dw_orders = dw_orders.select(['order_id', 'customer_id', 'order_status',  'IDSK', 'order_approved_at', 
                              'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date',
                              'estimated_days', 'arrival_days', 'shipping_days', 'arrival_status', 'estimated_delivery_rate',
                              'arrival_delivery_rate', 'shipping_delivery_rate' ])
dw_orders = dw_orders.withColumnRenamed("IDSK", "order_purchase_timestamp_id")

dw_orders = dw_orders.join(dw_time, dw_orders.order_approved_at == dw_time.full_date , how='left')
dw_orders = dw_orders.select(['order_id', 'customer_id', 'order_status',  'order_purchase_timestamp_id', 'IDSK', 
                              'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date',
                              'estimated_days', 'arrival_days', 'shipping_days', 'arrival_status', 'estimated_delivery_rate',
                              'arrival_delivery_rate', 'shipping_delivery_rate' ])
dw_orders = dw_orders.withColumnRenamed("IDSK", "order_approved_at_id")

dw_orders = dw_orders.join(dw_time, dw_orders.order_delivered_carrier_date == dw_time.full_date , how='left')
dw_orders = dw_orders.select(['order_id', 'customer_id', 'order_status',  'order_purchase_timestamp_id', 'order_approved_at_id', 
                              'IDSK', 'order_delivered_customer_date', 'order_estimated_delivery_date',
                              'estimated_days', 'arrival_days', 'shipping_days', 'arrival_status', 'estimated_delivery_rate',
                              'arrival_delivery_rate', 'shipping_delivery_rate', 'order_delivered_carrier_date' ])
dw_orders = dw_orders.withColumnRenamed("IDSK", "order_delivered_carrier_date_id")

dw_orders = dw_orders.join(dw_time, dw_orders.order_delivered_customer_date == dw_time.full_date , how='left')
dw_orders = dw_orders.select(['order_id', 'customer_id', 'order_status',  'order_purchase_timestamp_id', 'order_approved_at_id', 
                              'order_delivered_carrier_date_id', 'IDSK', 'order_estimated_delivery_date',
                              'estimated_days', 'arrival_days', 'shipping_days', 'arrival_status', 'estimated_delivery_rate',
                              'arrival_delivery_rate', 'shipping_delivery_rate', 'order_delivered_carrier_date' ])
dw_orders = dw_orders.withColumnRenamed("IDSK", "order_delivered_customer_date_id")

dw_orders = dw_orders.join(dw_time, dw_orders.order_estimated_delivery_date == dw_time.full_date , how='left')
dw_orders = dw_orders.select(['order_id', 'customer_id', 'order_status',  'order_purchase_timestamp_id', 'order_approved_at_id', 
                              'order_delivered_carrier_date_id', 'order_delivered_customer_date_id', 'IDSK',
                              'estimated_days', 'arrival_days', 'shipping_days', 'arrival_status', 'estimated_delivery_rate',
                              'arrival_delivery_rate', 'shipping_delivery_rate', 'order_delivered_carrier_date' ])
dw_orders = dw_orders.withColumnRenamed("IDSK", "order_estimated_delivery_date_id")

if dw_orders_old.count() > 0:
    cols = dw_orders.columns
    dw_orders_new = dw_orders.select(dw_orders.columns)
    dw_orders = dw_orders.select(col('order_id'),
                                 col('customer_id'), 
                                 col('order_status'),  
                                 col('order_purchase_timestamp_id'), 
                                 col('order_approved_at_id'), 
                                 col('order_delivered_carrier_date_id'), 
                                 col('order_delivered_customer_date_id'), 
                                 col('order_estimated_delivery_date_id'),
                                 col('estimated_days'), 
                                 col('arrival_days'), 
                                 col('shipping_days'),
                                 col('order_delivered_carrier_date'),
                                 col('arrival_status').alias('arrival_status_new'),
                                 col('estimated_delivery_rate').alias('estimated_delivery_rate_new'),
                                 col('arrival_delivery_rate').alias('arrival_delivery_rate_new'),
                                 col('shipping_delivery_rate').alias('shipping_delivery_rate_new'))
    
    dw_orders = dw_orders.join(dw_orders_old, ['order_id'], how = 'right')

    dw_orders = dw_orders.withColumn("IDSK", col("IDSK").cast(IntegerType()) )
    
    dw_orders = dw_orders.withColumn("arrival_status", verify_new_value_str_udf(col("IDSK"), col("arrival_status"), col("arrival_status_new")) )
    dw_orders = dw_orders.withColumn("estimated_delivery_rate", verify_new_value_str_udf(col("IDSK"), col("estimated_delivery_rate"), col("estimated_delivery_rate_new")) )
    dw_orders = dw_orders.withColumn("arrival_delivery_rate", verify_new_value_str_udf(col("IDSK"), col("arrival_delivery_rate"), col("arrival_delivery_rate_new")) )
    dw_orders = dw_orders.withColumn("shipping_delivery_rate", verify_new_value_str_udf(col("IDSK"), col("shipping_delivery_rate"), col("shipping_delivery_rate_new")) )

    dw_orders.write.mode("overwrite").saveAsTable("dw_orders")
    dw_orders_new.createOrReplaceTempView("dw_orders_new")

    dw_orders = spark.sql("""insert into dw_orders (order_id, customer_id, order_status,  order_purchase_timestamp_id,
                        order_approved_at_id, order_delivered_carrier_date_id, order_delivered_customer_date_id,
                        order_estimated_delivery_date_id, estimated_days, arrival_days, shipping_days, arrival_status, 
                        estimated_delivery_rate, arrival_delivery_rate, shipping_delivery_rate, order_delivered_carrier_date , IDSK)
                        select odn.order_id, odn.customer_id, odn.order_status,  odn.order_purchase_timestamp_id,
                        odn.order_approved_at_id, odn.order_delivered_carrier_date_id, odn.order_delivered_customer_date_id,
                        odn.order_estimated_delivery_date_id, odn.estimated_days, odn.arrival_days, odn.shipping_days, odn.arrival_status, 
                        odn.estimated_delivery_rate, odn.arrival_delivery_rate, odn.shipping_delivery_rate, 
                        odn.order_delivered_carrier_date ,
                        ( (select max((cast(IDSK as INT))) from dw_orders) + ROW_NUMBER() OVER (
                            ORDER BY odn.order_id
                        ) ) as IDSK
                        from dw_orders_new odn
                        where odn.order_id  not in (select order_id from dw_orders)                  
                        """)
     
    dw_orders = sqlContext.table("dw_orders")
else:
    windowSpec  = Window.orderBy("order_id")
    dw_orders = dw_orders.withColumn("IDSK", row_number().over(windowSpec))

cols = dw_orders.columns
dw_orders = dw_orders.select(cols)


dw_orders.limit(20).toPandas()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp_id,order_approved_at_id,order_delivered_carrier_date_id,order_delivered_customer_date_id,order_estimated_delivery_date_id,estimated_days,arrival_days,...,order_delivered_carrier_date,arrival_status_new,estimated_delivery_rate_new,arrival_delivery_rate_new,shipping_delivery_rate_new,IDSK,arrival_status,estimated_delivery_rate,arrival_delivery_rate,shipping_delivery_rate
0,014405982914c2cde2796ddcf0b8703d,2de342d6e5905a5a8bb3a991c855f3e2,delivered,13746,13746,13772,13864,14257,21,4,...,2017-07-27 19:00:00,Em tempo,Duração OK,Muito Rápido,Muito Rápido,493,Em tempo,Duração OK,Muito Rápido,Muito Rápido
1,019886de8f385a39b75bedbb726fd4ef,8cf88d7ba142365ef2ca619ef06f9a0f,delivered,18517,18518,18616,18819,19273,31,12,...,2018-02-14 15:00:00,Em tempo,Devagar,Rápido,Rápido,606,Em tempo,Devagar,Rápido,Rápido
2,01a6ad782455876aa89081449d49c452,71accffbcbdf8e02f67a469f65cdbf73,delivered,17963,17963,18071,18310,18745,32,14,...,2018-01-22 22:00:00,Em tempo,Devagar,Rápido,Rápido,632,Em tempo,Devagar,Rápido,Rápido
3,01d907b3e209269e120a365fc2b97524,d02cc92f5e33eb58d9ff4d5cce6ae901,delivered,14081,14099,14132,14255,14545,19,7,...,2017-08-11 19:00:00,Em tempo,Duração OK,Muito Rápido,Muito Rápido,715,Em tempo,Duração OK,Muito Rápido,Muito Rápido
4,028dc52e12ddda803ec1e35eb0b7b0d9,8c89a09d8fb33b6e5dc8a769d6b2bd63,delivered,17242,17243,17273,17295,17713,19,2,...,2017-12-20 16:00:00,Em tempo,Duração OK,Muito Rápido,,980,Em tempo,Duração OK,Muito Rápido,
5,036dd381dfb3ec75e0a63e14828cc871,00f5116a953fdf1b86dd0deb055c0e12,delivered,14711,14711,14733,14925,15241,22,8,...,2017-09-05 20:00:00,Em tempo,Duração OK,Rápido,Muito Rápido,1334,Em tempo,Duração OK,Rápido,Muito Rápido
6,03ebfa9712b7dbc7031291856263b314,4024b83c510f004326fbcf0671738663,delivered,19500,19507,19582,19600,19801,12,4,...,2018-03-26 21:00:00,Em tempo,Rápido,Muito Rápido,,1526,Em tempo,Rápido,Muito Rápido,
7,0420da8d50a3784011290a782f25a8a8,0949b5cf9adad08c1421aa3f1778e4a3,delivered,22841,22842,22933,23012,23113,11,7,...,2018-08-13 12:00:00,Em tempo,Rápido,Muito Rápido,Muito Rápido,1624,Em tempo,Rápido,Muito Rápido,Muito Rápido
8,05afef1c185862cab9062b322ff25cc5,296de103322e463a1b76de3c81fdf02b,delivered,13715,13716,13864,14374,14689,40,27,...,2017-07-31 15:00:00,Em tempo,Devagar,Devagar,Duração OK,2240,Em tempo,Devagar,Devagar,Duração OK
9,05bef443b850685058070c9e781988e8,4ff3f3945300d8e040f2906998401e92,delivered,21137,21179,21182,21788,21985,35,27,...,2018-06-01 13:00:00,Em tempo,Devagar,Devagar,Devagar,2260,Em tempo,Devagar,Devagar,Devagar


In [0]:
print((dw_orders.count(), len(dw_orders.columns)))

(99443, 21)


#### Cria tabela Fato de Pagamentos

In [0]:
dw_payments = orders_payments_df.select(orders_payments_df.columns)
dw_payments = dw_payments.select(['payment_sequential', 'payment_type', 'payment_installments', 'payment_value', "order_id" ])
dw_payments = dw_payments.join(dw_orders, ["order_id"], how='left')
cols = ['payment_sequential', 'payment_type', 'payment_installments', 'payment_value', 'customer_id', 'order_status', 
        'order_purchase_timestamp_id', 'order_approved_at_id', 'order_delivered_carrier_date_id', 
        'order_delivered_customer_date_id', 'order_estimated_delivery_date_id', 'IDSK', 'estimated_days', 'arrival_days', 
        'shipping_days']
dw_payments = dw_payments.select(cols)
dw_payments = dw_payments.withColumnRenamed("IDSK", "order_id")

cols = dw_payments.columns
cols.remove("customer_id")
dw_payments = dw_payments.join(dw_customers, ["customer_id"], how='left')
cols = [*cols, *["IDSK"]]
dw_payments = dw_payments.select(cols)
dw_payments = dw_payments.withColumnRenamed("IDSK", "customer_id")

dw_payments = dw_payments.select(['customer_id','order_id', 'payment_sequential', 'payment_type', 'payment_installments', 
                                  'payment_value', 'order_purchase_timestamp_id', 'order_approved_at_id', 
                                  'order_delivered_carrier_date_id', 'order_delivered_customer_date_id',
                                  'order_estimated_delivery_date_id', 'estimated_days', 'arrival_days', 
                                  'shipping_days'])

if dw_reviews_old.count() > 0:
    dw_payments_old.write.mode("overwrite").saveAsTable("dw_payments")
    dw_payments.createOrReplaceTempView("dw_payments_new")

    dw_payments = spark.sql("""insert into  dw_payments (customer_id,order_id, payment_sequential, payment_type, payment_installments, 
                                  payment_value, order_purchase_timestamp_id, order_approved_at_id, order_delivered_carrier_date_id, 
                                  order_delivered_customer_date_id, order_estimated_delivery_date_id, estimated_days, arrival_days, 
                                  shipping_days)
                        select customer_id,order_id, payment_sequential, payment_type, payment_installments, payment_value, 
                                  order_purchase_timestamp_id, order_approved_at_id, order_delivered_carrier_date_id, 
                                  order_delivered_customer_date_id, order_estimated_delivery_date_id, estimated_days, arrival_days, 
                                  shipping_days
                        from dw_payments_new 
                        where (order_id, payment_sequential) not in (select order_id, payment_sequential from dw_payments);                  
                        """)
     
    dw_payments = sqlContext.table("dw_payments")
    
dw_payments.limit(20).toPandas()

Unnamed: 0,customer_id,order_id,payment_sequential,payment_type,payment_installments,payment_value,order_purchase_timestamp_id,order_approved_at_id,order_delivered_carrier_date_id,order_delivered_customer_date_id,order_estimated_delivery_date_id,estimated_days,arrival_days,shipping_days
0,53963,72675,1,credit_card,1,37.15,22841,22842,22858,,23161,13,,
1,32808,51800,1,credit_card,1,72.75,15119,15119,15138,15236.0,15625,21,4.0,4.0
2,8241,54409,3,voucher,1,15.0,19391,19415,19434,19504.0,19801,17,4.0,2.0
3,41004,32641,1,credit_card,2,61.19,12420,12425,12442,13385.0,13057,26,40.0,39.0
4,98810,36281,1,credit_card,2,136.26,10243,10267,10332,10572.0,10921,28,13.0,9.0
5,1228,25314,1,boleto,1,89.27,17054,17069,17230,17274.0,17449,16,9.0,1.0
6,9636,74075,1,credit_card,2,239.5,18252,18252,18400,18786.0,19249,41,22.0,16.0
7,93214,20116,1,credit_card,3,248.51,18973,18974,19071,19849.0,19441,19,36.0,32.0
8,92651,24663,1,boleto,1,91.05,17830,17908,17948,18067.0,18433,25,9.0,4.0
9,55527,88057,1,credit_card,3,102.03,18979,18979,19009,19412.0,19729,31,18.0,16.0


In [0]:
print((dw_payments.count(), len(dw_payments.columns)))

(103887, 14)


#### Cria tabela Fato de Avaliações

In [0]:
dw_reviews = orders_reviews_df.select(orders_reviews_df.columns)
dw_reviews = dw_reviews.select(["review_id","order_id","review_score","review_comment_title","review_comment_message",
                                "review_creation_date","review_answer_timestamp"])
dw_reviews = dw_reviews.join(dw_orders, ["order_id"], how= 'left')
cols = dw_reviews.columns
cols.remove("order_id")
cols.remove("order_status")
dw_reviews = dw_reviews.select(cols)
dw_reviews = dw_reviews.withColumnRenamed("IDSK", "order_id")

cols = dw_reviews.columns
dw_reviews = dw_reviews.join(dw_customers, ["customer_id"], how='left')
cols.remove("customer_id")
cols = [*cols, *["IDSK"]]
dw_reviews = dw_reviews.select(cols)
dw_reviews = dw_reviews.withColumnRenamed("IDSK", "customer_id")

dw_reviews = dw_reviews.withColumn('review_creation_date', to_timestamp(date_format(col('review_creation_date'), "yyyy-MM-dd HH") ))
dw_reviews = dw_reviews.withColumn('review_answer_timestamp', to_timestamp(date_format(col('review_answer_timestamp'), "yyyy-MM-dd HH") ))

dw_reviews = dw_reviews.join(dw_time, dw_reviews.review_creation_date == dw_time.full_date , how='left')
dw_reviews = dw_reviews.select(['customer_id','order_id', "review_id", "review_score","review_comment_title","review_comment_message",
                                'order_purchase_timestamp_id', 'order_approved_at_id', 'order_delivered_carrier_date_id', 
                                'order_delivered_customer_date_id', 'order_estimated_delivery_date_id', 'IDSK', 'review_answer_timestamp', 
                                'estimated_days', 'arrival_days', 'shipping_days'])
dw_reviews = dw_reviews.withColumnRenamed("IDSK", "review_creation_date_id")

dw_reviews = dw_reviews.join(dw_time, dw_reviews.review_answer_timestamp == dw_time.full_date , how='left')
dw_reviews = dw_reviews.select([ 'customer_id','order_id', "review_id", "review_score","review_comment_title","review_comment_message",
                                'order_purchase_timestamp_id', 'order_approved_at_id', 'order_delivered_carrier_date_id', 
                                'order_delivered_customer_date_id', 'order_estimated_delivery_date_id', 'review_creation_date_id',
                                'IDSK', 'estimated_days', 'arrival_days', 'shipping_days'])
dw_reviews = dw_reviews.withColumnRenamed("IDSK", "review_answer_timestamp_id")

if dw_reviews_old.count() > 0:
    dw_reviews_old.write.option("multiLine", "true").mode("overwrite").saveAsTable("dw_reviews")
    dw_reviews.createOrReplaceTempView("dw_reviews_new")

    dw_reviews = spark.sql("""insert into  dw_reviews ( customer_id,order_id, review_id, review_score,review_comment_title, 
                                review_comment_message, order_purchase_timestamp_id, order_approved_at_id, 
                                order_delivered_carrier_date_id, order_delivered_customer_date_id, order_estimated_delivery_date_id, 
                                review_creation_date_id, review_answer_timestamp_id, estimated_days, arrival_days, shipping_days)
                        select  customer_id, order_id, review_id, review_score,review_comment_title,review_comment_message,
                                order_purchase_timestamp_id, order_approved_at_id, order_delivered_carrier_date_id, 
                                order_delivered_customer_date_id, order_estimated_delivery_date_id, review_creation_date_id,
                                review_answer_timestamp_id, estimated_days, arrival_days, shipping_days
                        from dw_reviews_new 
                        where (order_id, review_id) not in (select order_id, review_id from dw_reviews);                  
                        """)
     
    dw_reviews = sqlContext.table("dw_reviews")



dw_reviews.limit(20).toPandas()

Unnamed: 0,customer_id,order_id,review_id,review_score,review_comment_title,review_comment_message,order_purchase_timestamp_id,order_approved_at_id,order_delivered_carrier_date_id,order_delivered_customer_date_id,order_estimated_delivery_date_id,review_creation_date_id,review_answer_timestamp_id,estimated_days,arrival_days,shipping_days
0,11339,34979,4f38ceccf5f69828f45e871aad65266f,4,,,23201,23201,23293,23317,23425,23329,23347,9,4,0
1,3839,50376,500dab66d2c2cc1b14d64d0e1aa346db,4,,Produto chegou dentro do prazo. Atende pelo cu...,20063,20064,20083,20180,20737,20185,20223,28,4,4
2,70413,77531,2afe3bd43d00fe8743d030db08906ac8,5,,,18490,18491,18503,19430,19249,19297,19309,31,39,38
3,30351,28461,7f2c8f10a5eec4375a11cbcb3573310d,5,,entrega dentro do prazo isso é importante.,19912,19912,20089,20112,20137,20113,20198,9,8,0
4,86770,9297,e9c0f57b4407df02dc7248e0c2acc520,5,,,18423,18424,18452,18475,18793,18481,18622,15,2,0
5,37062,24320,82c65eeeb07cb7979bb2ee04f10a554b,5,,,17735,17736,17780,17999,18481,18001,18026,31,10,9
6,49245,27475,81d10002d089ca9b6715acc0db0f8dcc,5,,,16894,16905,16919,17087,17569,17617,17672,28,8,6
7,96763,45316,9fbaad8f7ba7652d9b453408a67df110,5,,Produto entregue no prazo nota 10,17904,17904,17951,18183,18601,18193,18216,29,11,9
8,56343,75848,9a10c75418c7973955299f3755a1fe0c,4,,,10909,10923,10939,11001,11377,11017,11102,19,3,2
9,69304,93022,1226ec6e120aebd5ae29bb59859aac9b,1,,Prazo esgotado. Produto ainda não entregue,19746,19747,19777,20898,20497,20545,20587,31,47,46


In [0]:
print((dw_reviews.count(), len(dw_reviews.columns)))

(99224, 16)


#### Cria tabela Fato de Itens

In [0]:
dw_itens = orders_items_df.select(orders_items_df.columns)
dw_itens = dw_itens.select(["order_item_id","order_id","order_item_id","product_id","seller_id","shipping_limit_date","price","freight_value"])
dw_itens = dw_itens.join(dw_orders, ["order_id"], how='left')
cols = dw_itens.columns
cols.remove("order_id")
dw_itens = dw_itens.select(cols)
dw_itens = dw_itens.withColumnRenamed("IDSK", "order_id")

cols = dw_itens.columns
dw_itens = dw_itens.join(dw_customers, ["customer_id"], how='left')
cols.remove("customer_id")
cols = [*cols, *["IDSK"]]
dw_itens = dw_itens.select(cols)
dw_itens = dw_itens.withColumnRenamed("IDSK", "customer_id")

dw_itens = dw_itens.withColumn('shipping_limit_date', to_timestamp(date_format(col('shipping_limit_date'), "yyyy-MM-dd HH") ))

dw_itens = dw_itens.withColumn('seller_to_carrier_status', datediff_cond_udf(col('shipping_limit_date'),col('order_delivered_carrier_date'))) 
dw_itens = dw_itens.withColumn('seller_to_carrier_status', get_arrival_status_udf(col('seller_to_carrier_status'))) 

dw_itens = dw_itens.join(dw_time, dw_itens.shipping_limit_date == dw_time.full_date , how='left')
dw_itens = dw_itens.select(["order_item_id", 'customer_id','order_id', "seller_id" , "product_id" , "price", "freight_value",
                            'order_purchase_timestamp_id', 'order_approved_at_id', 'order_delivered_carrier_date_id', 
                            'order_delivered_customer_date_id', 'order_estimated_delivery_date_id', 'IDSK', 'estimated_days', 
                            'arrival_days', 'shipping_days', 'seller_to_carrier_status'])
dw_itens = dw_itens.withColumnRenamed("IDSK", "shipping_limit_date_id")

if dw_itens_old.count() > 0:
    cols = dw_itens_old.columns
    dw_itens_new = dw_itens.select(dw_itens.columns)

    dw_itens_new  = dw_itens_new .withColumn("order_item_id", col("order_item_id").cast(IntegerType()) )

    dw_itens_old = dw_itens_old.select(["order_item_id", 'order_id', 'customer_id',"seller_id" , "product_id" , "price", "freight_value", 
                                        'shipping_limit_date_id', 'seller_to_carrier_status'])
    dw_orders_temp = dw_orders.select(['IDSK', 'order_purchase_timestamp_id', 'order_approved_at_id', 'order_delivered_carrier_date_id', 
                                        'order_delivered_customer_date_id', 'order_estimated_delivery_date_id', 'estimated_days', 
                                        'arrival_days', 'shipping_days'])
    dw_itens_old = dw_itens_old.join(dw_orders_temp, dw_itens_old.order_id == dw_orders.IDSK, how='left')
    dw_itens_old = dw_itens_old.select(["order_item_id", 'customer_id','order_id', "seller_id" , "product_id" , "price", "freight_value",
                                        'order_purchase_timestamp_id', 'order_approved_at_id', 'order_delivered_carrier_date_id', 
                                        'order_delivered_customer_date_id', 'order_estimated_delivery_date_id', 'shipping_limit_date_id', 
                                        'estimated_days', 'arrival_days', 'shipping_days', 'seller_to_carrier_status'])

    dw_itens_old.write.mode("overwrite").saveAsTable("dw_itens")
    dw_itens_new.createOrReplaceTempView("dw_itens_new")
    dw_products.createOrReplaceTempView("dw_products")
    dw_sellers.createOrReplaceTempView("dw_sellers")

    dw_itens = spark.sql("""insert into  dw_itens (order_item_id, order_id, customer_id, product_id, seller_id, 
                         order_purchase_timestamp_id, order_approved_at_id, order_delivered_carrier_date_id, 
                         order_delivered_customer_date_id, order_estimated_delivery_date_id, shipping_limit_date_id, price, freight_value, 
                         estimated_days, arrival_days, shipping_days, seller_to_carrier_status)
                        select itn.order_item_id, itn.order_id, itn.customer_id, p.IDSK, s.IDSK, itn.order_purchase_timestamp_id, 
                        itn.order_approved_at_id, itn.order_delivered_carrier_date_id, itn.order_delivered_customer_date_id, 
                        itn.order_estimated_delivery_date_id, itn.shipping_limit_date_id, itn.price, itn.freight_value, 
                        itn.estimated_days, itn.arrival_days, itn.shipping_days, itn.seller_to_carrier_status
                        from dw_itens_new itn
                        inner join dw_products p on p.product_id = itn.product_id and p.FIM is null
                        inner join dw_sellers s on s.seller_id = itn.seller_id and s.FIM is null
                        where (itn.order_id, itn.order_item_id) not in ( select order_id, order_item_id from dw_itens);                  
                        """)
     
    dw_itens = sqlContext.table("dw_itens")
    dw_itens = dw_itens.select(cols)

else:
    cols = dw_itens.columns
    dw_itens = dw_itens.join(dw_sellers, (dw_sellers.seller_id == dw_itens.seller_id), how='left')
    cols.remove("seller_id")
    cols = [*cols, *["IDSK"]]
    dw_itens = dw_itens.select(cols)
    dw_itens = dw_itens.withColumnRenamed("IDSK", "seller_id")
    
    cols = dw_itens.columns
    dw_itens = dw_itens.join(dw_products, (dw_products.product_id == dw_itens.product_id) , how='left')
    cols.remove("product_id")
    cols = [*cols, *["IDSK"]]
    dw_itens = dw_itens.select(cols)
    dw_itens = dw_itens.withColumnRenamed("IDSK", "product_id")


dw_itens.limit(20).toPandas()

Unnamed: 0,order_item_id,customer_id,order_id,price,freight_value,order_purchase_timestamp_id,order_approved_at_id,order_delivered_carrier_date_id,order_delivered_customer_date_id,order_estimated_delivery_date_id,shipping_limit_date_id,estimated_days,arrival_days,shipping_days,seller_to_carrier_status,seller_id,product_id
0,1,52599,27203,54.9,18.48,21860,21860,21925,22091,22513,21980,27,9,6,Em tempo,27203,27203
1,1,14443,27213,18.9,15.1,14685,14685,14754,14898,15217,14805,22,8,5,Em tempo,27213,27213
2,1,67068,27365,47.99,21.18,18422,18423,18496,18545,19057,18567,26,5,2,Em tempo,27365,27365
3,1,13335,27807,23.99,7.78,16728,16755,16820,16907,17113,16971,16,7,3,Em tempo,27807,27807
4,1,69993,28082,7.3,15.1,16341,16341,16477,16727,16873,16485,22,16,10,Atrasado,28082,28082
5,1,32038,28123,53.44,22.3,23054,23055,23103,23210,23641,23151,24,6,4,Em tempo,28123,28123
6,1,21697,28760,309.99,65.54,22194,22194,22289,22434,23257,22482,44,10,6,Em tempo,28760,28760
7,1,47123,28767,145.9,58.88,21254,21254,21254,21480,22345,21446,45,9,9,Em tempo,28767,28767
8,1,84115,28932,55.49,15.14,15901,15929,15958,16264,16441,16072,22,15,12,Em tempo,28932,28932
9,1,89320,29162,550.99,40.55,20827,20833,20846,21323,21577,21073,31,20,19,Em tempo,29162,29162


In [0]:
print((dw_itens.count(), len(dw_itens.columns)))

(112652, 17)


In [0]:
dw_orders = dw_orders.select(["IDSK", "order_id", 'arrival_status', 'estimated_delivery_rate', 'arrival_delivery_rate',
                              'shipping_delivery_rate'])

In [0]:
#dbutils.fs.rm("/FileStore/shared_uploads/luannrs@hotmail.com/dw_data/dw_time.csv", True)
#dbutils.fs.rm("/FileStore/shared_uploads/luannrs@hotmail.com/dw_data/dw_products.csv", True)
#dbutils.fs.rm("/FileStore/shared_uploads/luannrs@hotmail.com/dw_data/dw_sellers.csv", True)
#dbutils.fs.rm("/FileStore/shared_uploads/luannrs@hotmail.com/dw_data/dw_customers.csv", True)
#dbutils.fs.rm("/FileStore/shared_uploads/luannrs@hotmail.com/dw_data/dw_orders.csv", True)
#dbutils.fs.rm("/FileStore/shared_uploads/luannrs@hotmail.com/dw_data/dw_payments.csv", True)
#dbutils.fs.rm("/FileStore/shared_uploads/luannrs@hotmail.com/dw_data/dw_reviews.csv", True)
#dbutils.fs.rm("/FileStore/shared_uploads/luannrs@hotmail.com/dw_data/dw_itens.csv", True)

In [0]:
#dbutils.fs.rm("/FileStore/shared_uploads/luannrs@hotmail.com/olist_products_dataset.csv", True)
#dbutils.fs.rm("/FileStore/shared_uploads/luannrs@hotmail.com/olist_sellers_dataset.csv", True)
#dbutils.fs.rm("/FileStore/shared_uploads/luannrs@hotmail.com/olist_customers_dataset.csv", True)
#dbutils.fs.rm("/FileStore/shared_uploads/luannrs@hotmail.com/olist_orders_dataset.csv", True)
#dbutils.fs.rm("/FileStore/shared_uploads/luannrs@hotmail.com/olist_order_payments_dataset.csv", True)
#dbutils.fs.rm("/FileStore/shared_uploads/luannrs@hotmail.com/olist_order_reviews_dataset.csv", True)
#dbutils.fs.rm("/FileStore/shared_uploads/luannrs@hotmail.com/olist_order_items_dataset.csv", True)
#dbutils.fs.rm("/FileStore/shared_uploads/luannrs@hotmail.com/product_category_name_translation.csv", True)

#### Salva os dados do DataWarehouse criado ou atualizado

In [0]:

path = "/FileStore/shared_uploads/luannrs@hotmail.com/dw_data/"
fileList = [x.name.replace('/','') for x in dbutils.fs.ls(path)]

if ("dw_time.csv" in fileList):
    dw_time.write.mode("Overwrite").csv(path + "dw_time.csv", header=True)
else:
    dw_time.write.csv(path + "dw_time.csv", header=True)

if ("dw_products.csv" in fileList):
    dw_products.write.mode("Overwrite").csv(path + "dw_products.csv", header=True)
else:
    dw_products.write.csv(path + "dw_products.csv", header=True)

if ("dw_sellers.csv" in fileList):
    dw_sellers.write.mode("Overwrite").csv(path + "dw_sellers.csv", header=True)
else:
    dw_sellers.write.csv(path + "dw_sellers.csv", header=True)

if ("dw_customers.csv" in fileList):
    dw_customers.write.mode("Overwrite").csv(path + "dw_customers.csv", header=True)
else:
    dw_customers.write.csv(path + "dw_customers.csv", header=True)

if ("dw_orders.csv" in fileList):
    dw_orders.write.mode("Overwrite").csv(path + "dw_orders.csv", header=True)
else:
    dw_orders.write.csv(path + "dw_orders.csv", header=True)

if ("dw_payments.csv" in fileList):
    dw_payments.write.mode("Overwrite").csv(path + "dw_payments.csv", header=True)
else:
    dw_payments.write.csv(path + "dw_payments.csv", header=True)

if ("dw_reviews.csv" in fileList):
    dw_reviews.write.option("multiLine",True).mode("Overwrite").csv(path + "dw_reviews.csv", header=True)
else:
    dw_reviews.write.option("multiLine",True).csv(path + "dw_reviews.csv", header=True)

if ("dw_itens.csv" in fileList):
    dw_itens.write.mode("Overwrite").csv(path + "dw_itens.csv", header=True)
else:
    dw_itens.write.csv(path + "dw_itens.csv", header=True)

In [0]:
%sql
drop table if exists dw_products;
drop table if exists dw_sellers;
drop table if exists dw_customers;
drop table if exists dw_payments;
drop table if exists dw_reviews;
drop table if exists dw_orders;
drop table if exists dw_itens;