In [69]:
import pandas as pd

### Import raw data

In [70]:
orders_raw = pd.read_csv('raw_data/orders.csv')
orderlines_raw = pd.read_csv('raw_data/orderlines.csv')
products_raw = pd.read_csv('raw_data/products.csv')
brands_raw = pd.read_csv('raw_data/brands.csv')

In [71]:
orders_df = orders_raw.copy()
orderlines_df = orderlines_raw.copy()
products_df = products_raw.copy()
brands_df = brands_raw.copy()

### Spot missing values

In [72]:
print("Orders missing values:")
print(orders_df.isna().sum())

print("\nOrderlines missing values:")
print(orderlines_df.isna().sum())

print("\nProducts missing values:")
print(products_df.isna().sum())

print("\nBrands missing values:")
print(brands_df.isna().sum())

Orders missing values:
order_id        0
created_date    0
total_paid      5
state           0
dtype: int64

Orderlines missing values:
id                  0
id_order            0
product_id          0
product_quantity    0
sku                 0
unit_price          0
date                0
dtype: int64

Products missing values:
sku             0
name            0
desc            7
price          46
promo_price     0
in_stock        0
type           50
dtype: int64

Brands missing values:
short    0
long     0
dtype: int64


### Drop duplicates

In [73]:
products_df.drop_duplicates(inplace=True)
orders_df.drop_duplicates(inplace=True)
orderlines_df.drop_duplicates(inplace=True)
brands_df.drop_duplicates(inplace=True)

### Drop missing values

In [74]:
products_df.dropna(axis=0, subset=['desc', 'price'], inplace=True)
orders_df.dropna(axis=0, subset='total_paid', inplace=True)

### Convert dates

In [75]:
orders_df['created_date'] = pd.to_datetime(orders_df['created_date'])
orderlines_df['date'] = pd.to_datetime(orderlines_df['date'])

### Double dot problem

In [76]:
# Checks if there is more than 1 .
double_dot_price_mask = products_df['price'].str.count('\.') > 1
# Get the proportion of trues and falses to see what percent is corrupted
double_dot_price_mask.value_counts(normalize=True)
products_df = products_df[~double_dot_price_mask].reset_index(drop=True)


  double_dot_price_mask = products_df['price'].str.count('\.') > 1


In [77]:
# Checks if there is more than 1 .
double_dot_promo_mask = products_df['promo_price'].str.count('\.') > 1

double_dot_promo_mask.value_counts(normalize=True)

  double_dot_promo_mask = products_df['promo_price'].str.count('\.') > 1


promo_price
False    0.574525
True     0.425475
Name: proportion, dtype: float64

### More than 2 numbers after the decimal

In [78]:
# Checks if there is more than 1 .
price_double_dot_mask = products_df['price'].str.count('\.') > 1
# Get the proportion of trues and falses to see what percent is corrupted
price_double_dot_mask.value_counts(normalize=True)

  price_double_dot_mask = products_df['price'].str.count('\.') > 1


price
False    1.0
Name: proportion, dtype: float64

In [79]:
# Checks if there is more than 1 .
promo_double_dot_mask = products_df['promo_price'].str.count('\.') > 1
# Get the proportion of trues and falses to see what percent is corrupted
promo_double_dot_mask.value_counts(normalize=True)

  promo_double_dot_mask = products_df['promo_price'].str.count('\.') > 1


promo_price
False    0.574525
True     0.425475
Name: proportion, dtype: float64

### Validate relationships between tables

In [80]:
missing_orders = ~orderlines_df['id_order'].isin(orders_df['order_id'])
missing_products = ~orderlines_df['sku'].isin(products_df['sku'])
orderlines_df = orderlines_df[~missing_orders & ~missing_products]
orders_df = orders_df[~orders_df['order_id'].isin(orderlines_df['id_order'])]
products_df = products_df[~products_df['sku'].isin(orderlines_df['sku'])]

### Conver prices to numeric

In [83]:
products_df['price'] = pd.to_numeric(products_df['price'], errors='coerce')
products_df['promo_price'] = pd.to_numeric(products_df['promo_price'], errors='coerce')

### Export the clean data

In [85]:
orderlines_df.to_csv('./clean_data/orderlines_cl.csv')
orders_df.to_csv('./clean_data/orders_cl.csv')
products_df.to_csv('./clean_data/products_cl.csv')
brands_df.to_csv('./clean_data/brands_cl.csv')