# Pré-processamento dos dados

## Imports

In [174]:
import numpy as np
import pandas as pd

## Análise superficial dos dados

In [175]:
# Load the raw dataset from the project's raw-data folder.
df = pd.read_csv('../datasets/raw/data-test-analytics.csv')
# Preview five random rows; the fixed seed keeps the displayed sample
# identical on every Restart & Run All.
df.sample(5, random_state=42)

Unnamed: 0,id,created_at,updated_at,deleted_at,name_hash,email_hash,address_hash,birth_date,status,version,city,state,neighborhood,last_date_purchase,average_ticket,items_quantity,all_revenue,all_orders,recency,marketing_source
2817,bfbe2716-b041-4e3a-99cd-223c520c7d6a,05/19/19 05:49 PM,01/13/21 11:23 AM,,85c59c42c550591dfce25694d48a5084,a108dacb1da041ec99b4dfde8a4e4e9c,7b0cc3e8eba707ef798978556d444d6f,12/30/89 12:00 AM,paused,5.28.11,da Mota Paulista,MT,Custodinha,01/13/21 11:23 AM,224.072926,7,1792.583406,8,36,organic_search
3076,3bae5c13-7205-4a26-b91f-a67cf8c898cd,07/25/17 07:19 PM,01/17/21 11:23 AM,,f751e097bd997d0486526a436c3c31ab,e59b58348ffe62d14094d4793194998c,bd8075dd288230da947124a48e6b057c,03/02/60 12:00 AM,active,4.41.11,Aragão do Norte,MS,Nossa Senhora Do Rosário,01/17/21 11:23 AM,212.575042,2,1275.450253,6,32,telegram_whatsapp
3195,dcbb7f4d-2c62-4605-aec3-6e33e17864da,11/25/17 07:36 AM,01/12/21 11:23 AM,,dc365e619a4486d28b7452cebe98f21f,00136780ba5b47b3a31843ebfae1e70f,6b77680ca20c8de19d1ecc9cac62fe11,08/30/80 12:00 AM,paused,4.27.6,da Cruz,MA,Vila São Gabriel Jacui,01/12/21 11:23 AM,221.845328,12,443.690656,2,37,organic_search
5901,063acf07-160c-403f-81ea-7c316254834d,11/18/20 08:15 AM,01/18/21 11:23 AM,,65b938adc26e49f057f68aa9011049c1,0131afaf559c21fac1413058ccbb2fe7,1c153a59caa3efd137bd7ef2e9ae68a1,10/26/41 12:00 AM,paused,3.5.8,Vieira de Araújo,AP,Vila Antena,01/18/21 11:23 AM,258.875901,8,517.751803,2,31,organic_search
6584,6ffd49de-6251-42d2-89cd-e5ce4dbbe1de,10/16/17 02:32 AM,01/12/21 11:23 AM,,fd05b73c69e3bbc0c35f5084184bfecd,4f6c055cbfbf359601d1ed29848b3deb,bc35c776cdce4b17979765177ac0ba34,01/16/42 12:00 AM,active,2.23.11,da Rocha,MS,Minas Brasil,01/12/21 11:23 AM,217.478198,8,869.912791,4,37,paid_search


Podemos otimizar o uso de memória do dataframe, convertendo as colunas.

In [176]:
# Total footprint in bytes before any dtype conversion; deep=True also
# accounts for the Python objects stored in object-dtype columns.
memory_before = df.memory_usage(deep=True).sum()

# Per-column dtypes and non-null counts, with the deep memory estimate.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  10000 non-null  object 
 1   created_at          10000 non-null  object 
 2   updated_at          10000 non-null  object 
 3   deleted_at          505 non-null    object 
 4   name_hash           10000 non-null  object 
 5   email_hash          10000 non-null  object 
 6   address_hash        10000 non-null  object 
 7   birth_date          10000 non-null  object 
 8   status              10000 non-null  object 
 9   version             10000 non-null  object 
 10  city                10000 non-null  object 
 11  state               10000 non-null  object 
 12  neighborhood        10000 non-null  object 
 13  last_date_purchase  10000 non-null  object 
 14  average_ticket      10000 non-null  float64
 15  items_quantity      10000 non-null  int64  
 16  all_r

A partir dos metadados e olhando para os dados, podemos perceber que:
- As colunas `id`, `name_hash`, `email_hash` e `address_hash` estão relacionadas à identificação do usuário e não são relevantes para a análise;

In [177]:
# Identifier/hash columns are not used by the analysis. Reassign the result
# instead of using inplace=True: it is the recommended pandas idiom and
# avoids silently mutating a frame that an earlier cell already displayed.
identifier_columns = ['id', 'name_hash', 'email_hash', 'address_hash']
df = df.drop(columns=identifier_columns)

Separamos as colunas relacionadas a data, convertendo para o tipo `datetime`.

In [178]:
date_columns = ['created_at', 'updated_at', 'deleted_at', 'birth_date', 'last_date_purchase']

# Parse every date column with the explicit 12-hour format used by the CSV.
for item in date_columns:
    df[item] = pd.to_datetime(df[item], format='%m/%d/%y %I:%M %p')

# `%y` maps two-digit years 00-68 to 2000-2068, so a birth year written as
# "60" is parsed as 2060. A birth date later than the record's `created_at`
# cannot be right, so those rows are moved back to the previous century.
born_after_creation = df['birth_date'] > df['created_at']
df['birth_date'] = df['birth_date'].mask(
    born_after_creation, df['birth_date'] - pd.DateOffset(years=100)
)

Inicialmente foi verificado que os valores `int64` poderiam ser convertidos em `int16` e `int8`.

In [179]:
# Downcast each int64 column to the smallest signed integer dtype that holds
# all of its values. pd.to_numeric checks the actual value range, so a column
# is never wrapped into a dtype that is too narrow (the previous equality
# check accepted int8 when *every* value overflowed, and fell back to int16
# without verifying that it fit). No temporary is left behind, so the cell is
# also safe to re-run when no int64 columns remain.
for item in df.select_dtypes(include=['int64']).columns:
    df[item] = pd.to_numeric(df[item], downcast='integer')

De forma análoga, os valores `float64` podem ser convertidos em `float32` quando a diferença fica dentro da tolerância de `np.allclose`, ou seja, sem perda relevante de precisão.

In [180]:
# Downcast float64 columns to float32 only when the narrower values stay
# within np.allclose's default tolerance of the originals.
for item in df.select_dtypes(include=['float64']).columns:
    # Cast once and reuse the result for both the check and the assignment.
    as_float32 = df[item].astype('float32')
    # equal_nan=True: missing values survive the cast unchanged, so they must
    # not block the conversion of an otherwise compatible column.
    if np.allclose(as_float32, df[item], equal_nan=True):
        df[item] = as_float32

Também foi analisada a possibilidade de converter as colunas `object` para `category`, mas nem todas se beneficiam dessa conversão: colunas com muitos valores distintos em relação ao tamanho do dataframe quase não economizam memória.

In [181]:
# Number of distinct values per object column, lowest cardinality first.
object_cardinality = df.select_dtypes(include=['object']).nunique()
object_cardinality.sort_values(ascending=True)

status                 3
marketing_source       6
state                 27
neighborhood         482
city                2406
version             2905
dtype: int64

In [182]:
# Select the three object columns with the fewest distinct values and store
# them as categoricals.
object_cardinality = df.select_dtypes(include=['object']).nunique()
to_cat = object_cardinality.sort_values(ascending=True).head(3).index

df = df.astype({column: 'category' for column in to_cat})

In [183]:
# Re-inspect dtypes and the deep memory estimate after the conversions.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   created_at          10000 non-null  datetime64[ns]
 1   updated_at          10000 non-null  datetime64[ns]
 2   deleted_at          505 non-null    datetime64[ns]
 3   birth_date          10000 non-null  datetime64[ns]
 4   status              10000 non-null  category      
 5   version             10000 non-null  object        
 6   city                10000 non-null  object        
 7   state               10000 non-null  category      
 8   neighborhood        10000 non-null  object        
 9   last_date_purchase  10000 non-null  datetime64[ns]
 10  average_ticket      10000 non-null  float32       
 11  items_quantity      10000 non-null  int8          
 12  all_revenue         10000 non-null  float32       
 13  all_orders          10000 non-null  int8       

In [184]:
# Total footprint in bytes after the dtype conversions, measured the same way
# as memory_before (deep=True).
memory_after = df.memory_usage(deep=True).sum()

In [185]:
def bytes_to_megabytes(num):
    """Convert a byte count to megabytes (1024**2 bytes), rounded to 2 decimals."""
    megabytes = num / 1024**2
    return round(megabytes, 2)

# Rebind both measurements from bytes to megabytes.
memory_before = bytes_to_megabytes(memory_before)
memory_after = bytes_to_megabytes(memory_after)
# The helper is only needed for the two conversions above.
del bytes_to_megabytes

Por fim, temos uma grande redução no uso de memória.

In [186]:
# Compute the saving once and print it with two decimals: subtracting the
# already-rounded floats can otherwise display artifacts such as
# 8.170000000000002.
memory_saved = memory_before - memory_after

print(f'Antes do tratamento: {memory_before} mb')
print(f'Depois do tratamento: {memory_after} mb')
print(f'Diferença bruta: {memory_saved:.2f} mb')
print(f'Diferença percentual: {memory_saved / memory_before * 100:.2f}%')

Antes do tratamento: 10.81 mb
Depois do tratamento: 2.64 mb
Diferença bruta: 8.17 mb
Diferença percentual: 75.58%


E por fim, salvamos o dataframe em um arquivo `pickle` para otimizar o carregamento dos dados.

In [187]:
# Persist the processed frame as a pickle, which keeps the converted dtypes
# (datetime, category, downcast numerics) that a CSV round-trip would lose.
df.to_pickle('../datasets/processed/data-test-analytics.pkl')