In [1]:
from pathlib import Path

import pandas as pd

# Load the raw data from the bronze layer.
# InvoiceNo and StockCode are identifiers, not quantities: later cells compare
# them against text (StockCode == "35965", InvoiceNo starting with 'C'), so they
# are read explicitly as strings instead of relying on dtype inference.
df = pd.read_csv(
    'data/bronze/dados_brutos.csv',
    dtype={'InvoiceNo': str, 'StockCode': str},
)
print(f"Dados originais {df.shape}")

# Work on a copy so the untouched frame stays available for before/after comparisons.
df_clean = df.copy()

Dados originais (541909, 10)


# Transformações

## 1. Valores Faltantes

### CustomerID




*   CustomerID	| 135.080 Rows NaN

---


Checando a possibilidade de preencher os CustomerID NaN a partir de outras linhas com o mesmo InvoiceNo (nº da fatura) que tenham o CustomerID preenchido

In [2]:
# Flag, per InvoiceNo, whether it has at least one row without / with a CustomerID.
# The null mask is computed once and grouped directly, which avoids calling a
# Python lambda once per invoice.
customer_missing = df['CustomerID'].isnull()
invoice_null = customer_missing.groupby(df['InvoiceNo']).any()
invoice_notnull = (~customer_missing).groupby(df['InvoiceNo']).any()

# Invoices that mix rows with and without a CustomerID
mixed_invoice = (invoice_null & invoice_notnull)

print("Total de InvoiceNo mistos:", mixed_invoice.sum())
print("Total de InvoiceNo com CustomerID nulo:", invoice_null.sum())
print("Total de InvoiceNo com CustomerID não nulo:", invoice_notnull.sum())
print(f"Total de linhas com CustomerID nulo: {customer_missing.sum()}")

Total de InvoiceNo mistos: 0
Total de InvoiceNo com CustomerID nulo: 3710
Total de InvoiceNo com CustomerID não nulo: 22190
Total de linhas com CustomerID nulo: 135080


 - Total de InvoiceNo mistos: 0


Ou seja, nenhuma fatura possui ao mesmo tempo linhas com e sem CustomerID. Isso significa que não é possível recuperar o CustomerID pelo InvoiceNo

### Description

*   Description | 1454 Rows NaN

 Muitos Description possuem erros de preenchimento também

---



Cada produto possui o seu StockCode. Muitos Description (nome do produto) estão NaN ou preenchidos de forma errada, porém compartilham o StockCode com outras linhas cuja Description está preenchida, o que permite recuperá-los pelo StockCode

In [3]:
# Flag, per StockCode, whether it has at least one row without / with a Description.
# The null mask is computed once and grouped directly, which avoids calling a
# Python lambda once per StockCode.
description_missing = df['Description'].isnull()
StockCode_null = description_missing.groupby(df['StockCode']).any()
StockCode_notnull = (~description_missing).groupby(df['StockCode']).any()

# StockCodes that have both missing and filled descriptions
mixed_StockCode = (StockCode_null & StockCode_notnull)

print("Total de StockCode mistos:", mixed_StockCode.sum())
print("Total de StockCode com Description nulo:", StockCode_null.sum())
print("Total de StockCode com Description não nulo:", StockCode_notnull.sum())
print(f"Total de linhas com Description nulo: {description_missing.sum()}")
print("==========================================")

# Example: rows of a single StockCode, filtered once and reused for both outputs
exemplo_stockcode = df[df["StockCode"] == "35965"]
display(exemplo_stockcode)
print("\n")
print(f"Quantidade de Description NaN do StockCode 35965: {exemplo_stockcode['Description'].isnull().sum()}")

Total de StockCode mistos: 848
Total de StockCode com Description nulo: 960
Total de StockCode com Description não nulo: 3958
Total de linhas com Description nulo: 1454


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,data_ingestao,fonte_arquivos
2889,536592,35965,FOLKART HEART NAPKIN RINGS,4,12/1/2010 17:06,3.36,,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
6017,536876,35965,FOLKART HEART NAPKIN RINGS,1,12/3/2010 11:36,3.36,,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
7205,537013,35965,,-25,12/3/2010 15:40,0.00,,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
8071,537126,35965,FOLKART HEART NAPKIN RINGS,1,12/5/2010 12:13,2.95,18118.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
10678,537237,35965,FOLKART HEART NAPKIN RINGS,3,12/6/2010 9:58,3.36,,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
...,...,...,...,...,...,...,...,...,...,...
347758,567337,35965,,5,9/19/2011 14:56,0.00,,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
349563,567507,35965,FOLKART HEART NAPKIN RINGS,12,9/20/2011 14:46,0.97,,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
454169,575513,35965,,7,11/10/2011 10:39,0.00,,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
464522,576110,35965,,5,11/14/2011 10:33,0.00,,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data




Quantidade de Description NaN do StockCode 35965: 10


In [4]:
# Build a StockCode -> Description lookup from the rows that have a description,
# keeping the first description seen for each StockCode.
mapa_descricoes = (
    df_clean
    .dropna(subset=['Description'])
    .groupby('StockCode')['Description']
    .first()
    .to_dict()
)

# Replace every Description with the lookup value for its StockCode.
# Series.map yields NaN for StockCodes missing from the lookup, so fillna keeps
# the original value on those rows. This is the same result as a row-by-row
# dict.get(StockCode, Description), without a Python call per row.
df_clean['Description'] = (
    df_clean['StockCode']
    .map(mapa_descricoes)
    .fillna(df_clean['Description'])
)

# Compare against the untouched frame to report what changed
nulos_antes = df['Description'].isnull().sum()
nulos_depois = df_clean['Description'].isnull().sum()

print(f"Descriptions recuperados: {nulos_antes - nulos_depois}")
print(f"Descriptions não recuperados: {nulos_depois}")
print(f"Quantidade de Descriptions erradas corrigidas: {df['Description'].nunique() - df_clean['Description'].nunique()}")

Descriptions recuperados: 1342
Descriptions não recuperados: 112
Quantidade de Descriptions erradas corrigidas: 406


##  2. Duplicatas

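 Duplicatas são aceitáveis no modelo de negócio deste DataFrame. Cada linha representa um item em uma fatura, portanto, o mesmo `InvoiceNo` (número da fatura) pode aparecer várias vezes se uma fatura contiver múltiplos produtos. A venda em si é representada unicamente pelo `InvoiceNo`. Observação: a checagem abaixo usa `duplicated()`, que conta apenas linhas idênticas em todas as colunas, e não simples repetições de `InvoiceNo`.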

In [5]:
# Flag rows that repeat an earlier row in every column, then report and show them
duplicated_rows = df_clean.duplicated()
print(f"Quantidade de linhas duplicadas: {duplicated_rows.sum()}")
display(df_clean[duplicated_rows])

Quantidade de linhas duplicadas: 5270


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,data_ingestao,fonte_arquivos
517,536409,21866,UNION JACK FLAG LUGGAGE TAG,1,12/1/2010 11:45,1.25,17908.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
527,536409,22866,HAND WARMER SCOTTY DOG DESIGN,1,12/1/2010 11:45,2.10,17908.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
537,536409,22900,SET 2 TEA TOWELS I LOVE LONDON,1,12/1/2010 11:45,2.95,17908.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
539,536409,22111,SCOTTIE DOG HOT WATER BOTTLE,1,12/1/2010 11:45,4.95,17908.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
555,536412,22327,ROUND SNACK BOXES SET OF 4 SKULLS,1,12/1/2010 11:49,2.95,17920.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
...,...,...,...,...,...,...,...,...,...,...
541675,581538,22068,BLACK PIRATE TREASURE CHEST,1,12/9/2011 11:34,0.39,14446.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
541689,581538,23318,BOX OF 6 MINI VINTAGE CRACKERS,1,12/9/2011 11:34,2.49,14446.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
541692,581538,22992,REVOLVER WOODEN RULER,1,12/9/2011 11:34,1.95,14446.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
541699,581538,22694,WICKER STAR,1,12/9/2011 11:34,2.10,14446.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data


## 3. Inconsistências Gerais

### Valores negativos, Cancelamentos e tarifas

In [6]:
# StockCode values that identify fee rows
stockcode_fees = ['C2', 'DOT', 'POST','AMAZONFEE']

# Build one boolean mask per criterion, then keep the rows matching any of them:
#   - InvoiceNo starting with 'C' or 'A'
#   - Quantity <= 0
#   - UnitPrice <= 0
#   - StockCode listed in stockcode_fees
invoice_text = df_clean['InvoiceNo'].astype(str)
starts_with_c = invoice_text.str.startswith('C')
starts_with_a = invoice_text.str.startswith('A')
non_positive_quantity = df_clean['Quantity'] <= 0
non_positive_price = df_clean['UnitPrice'] <= 0
is_fee = df_clean['StockCode'].isin(stockcode_fees)

df_inconsistencias = df_clean[
    starts_with_c
    | starts_with_a
    | non_positive_quantity
    | non_positive_price
    | is_fee
]

display(df_inconsistencias)
print("Dados serão divididos na camada gold")

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,data_ingestao,fonte_arquivos
45,536370,POST,POSTAGE,3,12/1/2010 8:45,18.00,12583.0,France,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
141,C536379,D,Discount,-1,12/1/2010 9:41,27.50,14527.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
154,C536383,35004C,SET OF 3 COLOURED FLYING DUCKS,-1,12/1/2010 9:49,4.65,15311.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
235,C536391,22556,PLASTERS IN TIN CIRCUS PARADE,-12,12/1/2010 10:24,1.65,17548.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
236,C536391,21984,PACK OF 12 PINK PAISLEY TISSUES,-24,12/1/2010 10:24,0.29,17548.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
...,...,...,...,...,...,...,...,...,...,...
541716,C581569,84978,HANGING HEART JAR T-LIGHT HOLDER,-1,12/9/2011 11:58,1.25,17315.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
541717,C581569,20979,36 PENCILS TUBE RED RETROSPOT,-5,12/9/2011 11:58,1.25,17315.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
541730,581570,POST,POSTAGE,1,12/9/2011 11:59,18.00,12662.0,Germany,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
541767,581574,POST,POSTAGE,2,12/9/2011 12:09,18.00,12526.0,Germany,2025-10-28 15:45:39.604635,carrie1/ecommerce-data


Dados serão divididos na camada gold


### Outliers

In [7]:
# Summary statistics of the numeric columns, used here to spot extreme values
df_clean.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,541909.0,541909.0,406829.0
mean,9.55225,4.611114,15287.69057
std,218.081158,96.759853,1713.600303
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,13953.0
50%,3.0,2.08,15152.0
75%,10.0,4.13,16791.0
max,80995.0,38970.0,18287.0


In [8]:
# Identify outliers: rows whose Quantity exceeds the threshold
quantity_threshold = 5000
large_quantity_rows = df_clean.loc[df_clean['Quantity'] > quantity_threshold]
display(large_quantity_rows)

# Blank lines followed by a three-line banner introducing the per-customer view
banner_rule = "-------------------------------------------------------"
for banner_line in ("\n", banner_rule, "Verificando padrão de compra do maior Outlier/Cliente", banner_rule):
    print(banner_line)

# Change the ID to inspect the rows of another customer
customer_id = 16446
customer_rows = df_clean.loc[df_clean['CustomerID'] == customer_id]
display(customer_rows)

print("Foram feitos cancelamenos das compras Outliers")

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,data_ingestao,fonte_arquivos
61619,541431,23166,MEDIUM CERAMIC TOP STORAGE JAR,74215,1/18/2011 10:01,1.04,12346.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
74614,542504,37413,ICON MUG REVOLUTIONARY,5568,1/28/2011 12:03,0.0,,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
502122,578841,84826,ASSTD DESIGN 3D PAPER STICKERS,12540,11/25/2011 15:57,0.0,13256.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
540421,581483,23843,"PAPER CRAFT , LITTLE BIRDIE",80995,12/9/2011 9:15,2.08,16446.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data




-------------------------------------------------------
Verificando padrão de compra do maior Outlier/Cliente
-------------------------------------------------------


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,data_ingestao,fonte_arquivos
194354,553573,22980,PANTRY SCRUBBING BRUSH,1,5/18/2011 9:52,1.65,16446.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
194355,553573,22982,PANTRY PASTRY BRUSH,1,5/18/2011 9:52,1.25,16446.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
540421,581483,23843,"PAPER CRAFT , LITTLE BIRDIE",80995,12/9/2011 9:15,2.08,16446.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data
540422,C581484,23843,"PAPER CRAFT , LITTLE BIRDIE",-80995,12/9/2011 9:27,2.08,16446.0,United Kingdom,2025-10-28 15:45:39.604635,carrie1/ecommerce-data


Foram feitos cancelamenos das compras Outliers


## Carregando dados na camada Silver


In [9]:
# Persist the cleaned data to the silver layer.
# The target folder is created when missing so this cell also works on a fresh
# checkout; to_csv does not create missing parent directories by itself.
silver_path = Path("data/silver/dados_limpos.csv")
silver_path.parent.mkdir(parents=True, exist_ok=True)
df_clean.to_csv(silver_path, index=False)
print("Dados salvos na camada Silver")

Dados salvos na camada Silver
