# requirements

In [1]:
# !pip install sqlalchemy
# !pip install py7zr
# !pip install pandas
# !pip install pyarrow
# !pip install matplotlib
# !pip install py7zr sqlalchemy pandas pyarrow matplotlib

# Imports

In [2]:
import os
import logging
import uuid
from datetime import (
    datetime,
    timezone
)
import re
import unicodedata

In [3]:
# import sqlalchemy as sa
import numpy as np
import pandas as pd
import py7zr
import matplotlib.pyplot as plt

# Utils & functions

### logger

In [4]:
# Logger
class __UUIDFilter(logging.Filter):
    """
    Internal helper: logging filter that stamps every record with a fresh UUID4.

    The value is stored on ``record.uuid4`` so a formatter can reference it
    through ``%(uuid4)s``.
    """
    def filter(self, record):
        record.uuid4 = uuid.uuid4()
        # This filter only annotates the record; it never drops it.
        return True


def _utc_time_converter(timestamp=None):
    """
    Converts a record timestamp (seconds since the epoch) into a UTC struct_time.

    :param timestamp: Seconds since the epoch; the current time is used when omitted.
    :return: time.struct_time expressed in UTC.
    """
    if timestamp is None:
        return datetime.now(timezone.utc).timetuple()
    return datetime.fromtimestamp(timestamp, timezone.utc).timetuple()


def create_logger(log_file=None):
    """
    Creates (or reuses) a logger that writes messages to the console and,
    optionally, to a file.

    The function is safe to call repeatedly (e.g. when a notebook cell is
    re-run): handlers and the UUID filter are only attached once, so messages
    are never emitted more than once per destination.

    :param log_file: Name of the log file. When omitted only the console is used.
    :return: Configured logger.
    """
    # logging.getLogger returns the same object for the same name, so any
    # configuration applied by a previous call is still attached to it.
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    # Custom log format with UTC timestamp and UUID4
    log_format = logging.Formatter(
        '%(asctime)s - %(levelname)s - [%(funcName)s] - [%(uuid4)s] - %(message)s',
        datefmt="%Y-%m-%dT%H:%M:%S%z"
    )

    # Use UTC for this formatter only. Setting the converter on the instance
    # avoids changing the timestamps of every other logger in the process,
    # and the converter uses the time the record was created.
    log_format.converter = _utc_time_converter

    # Compare by class name so a filter created before the cell was re-run
    # (and the class object re-created) is still recognised.
    if not any(type(f).__name__ == __UUIDFilter.__name__ for f in logger.filters):
        logger.addFilter(__UUIDFilter())

    # File writer handler (one per distinct file)
    if log_file:
        target_file = os.path.abspath(log_file)
        file_attached = any(
            isinstance(handler, logging.FileHandler)
            and handler.baseFilename == target_file
            for handler in logger.handlers
        )
        if not file_attached:
            file_handler = logging.FileHandler(log_file)
            file_handler.setLevel(logging.DEBUG)
            file_handler.setFormatter(log_format)
            logger.addHandler(file_handler)

    # Console handler. FileHandler subclasses StreamHandler, hence the exact
    # type check instead of isinstance.
    console_attached = any(
        type(handler) is logging.StreamHandler for handler in logger.handlers
    )
    if not console_attached:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.DEBUG)
        console_handler.setFormatter(log_format)
        logger.addHandler(console_handler)

    return logger

### extract_7z

In [5]:
def extract_7z(bg_logger, file_path):
    """
    Extracts a 7z file to the same directory as the compressed file.

    :param bg_logger: initialized logger
    :param file_path: Path to the 7z file to extract. May be relative; it is
        resolved against the current working directory.
    """
    # Record start time
    start_time = datetime.now()

    # Resolve to an absolute path before taking the directory: for a bare file
    # name such as "data.7z" os.path.dirname alone returns an empty string,
    # which would be logged as the (meaningless) extraction directory.
    extract_dir = os.path.dirname(os.path.abspath(file_path))

    # Extract the 7z file; the context manager closes the archive afterwards
    with py7zr.SevenZipFile(file_path, mode='r') as archive:
        archive.extractall(path=extract_dir)

    # Calculate extraction duration
    duration = datetime.now() - start_time

    # Log success message with extraction details
    bg_logger.info(
        "Data successfully extracted to directory %s. Duration: %s",
        extract_dir, duration
    )


### sanitize_column_data

In [6]:
def sanitize_column_data(bg_logger, df, column, c_dtype=str):
    """
    Cleans a DataFrame column and converts it to the requested type.

    The column is updated in place on ``df`` and also returned.

    Args:
        bg_logger: Logger instance for logging.
        df: DataFrame containing the data (modified in place).
        column: Column to be transformed.
        c_dtype: Target data type (default is str).
            - ``str``: surrounding whitespace is stripped from every value.
            - numeric types (e.g. ``int``, ``float``): string values are
              stripped and the column is converted with ``pd.to_numeric``;
              values that cannot be parsed become NaN.
            - anything else: string values are stripped and the column is
              cast with ``Series.astype(c_dtype)``.
    Returns:
        The transformed column.
    """
    start_time = datetime.now()

    if c_dtype is str:
        # Text target: strip surrounding whitespace through the .str accessor.
        cleaned = df[column].str.strip()
    else:
        # Strip only genuine strings so values that already carry another
        # type are not discarded before the conversion.
        cleaned = df[column].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )
        if pd.api.types.is_numeric_dtype(c_dtype):
            # errors='coerce' turns unparseable values into NaN.
            cleaned = pd.to_numeric(cleaned, errors='coerce')
        else:
            cleaned = cleaned.astype(c_dtype)

    df[column] = cleaned

    bg_logger.info(
        "Specializing column data '%s' to '%s'. It took %s",
        column, str(c_dtype), str(datetime.now() - start_time)
    )
    return df[column]

### sanitize_text

In [7]:
def sanitize_text(text):
    """
    Produces a plain ASCII, single-spaced version of a string.

    Steps, in order:
    1. Accents are dropped: the text is decomposed (NFD) and every combining
       mark is discarded, leaving the base letter.
    2. Every character that is not an ASCII letter, a digit or whitespace is
       removed. Characters without a simple decomposition are therefore
       dropped rather than transliterated.
    3. Runs of whitespace collapse to one space and the ends are trimmed.

    Args:
        text (str): The input string to normalize.

    Returns:
        str: The normalized text. Non-string input is returned untouched.
    """
    if isinstance(text, str):
        decomposed = unicodedata.normalize('NFD', text)
        # 'Mn' is the Unicode category of non-spacing (combining) marks
        base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
        alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', ''.join(base_chars))
        single_spaced = re.sub(r'\s+', ' ', alnum_only)
        return single_spaced.strip()

    # Anything that is not a string (NaN, None, numbers) passes through
    return text

### data maps

In [8]:
# Lookup passed to Series.replace on the 'Country' column to expand
# abbreviations into full country names. It is applied after sanitize_text,
# so keys are written in their sanitized form (e.g. "U.K." arrives as "UK").
# NOTE(review): "Unspecified" maps to "co0z0", which looks like a placeholder
# code rather than a country name - TODO confirm the intended value.
NORMATIZE_LOCATION_MAP = {
    "USA": "United States",
    "US": "United States",
    "UK": "United Kingdom",
    "EIRE": "Ireland",
    "RSA": "South Africa",
    "Unspecified": "co0z0",
}

# Base Pipeline

### Logger init

In [9]:
# initialize logger
bg_logger = create_logger()

### base path reference

In [10]:
# Get the current working directory
cwd_path = os.getcwd()

## Base data process

### Importing base DF

In [11]:
base_df_path = os.path.join(
    cwd_path,
    'ingestion',
    'Invoices_Year_2009-2010.7z'
)

In [12]:
extract_7z(bg_logger, base_df_path)

2024-11-24T20:34:43 - INFO - [extract_7z] - [d6f5ddbc-007c-47b6-b864-7e64ec7ef11c] - Data successfully extracted to directory /workspaces/betsson_group/ingestion. Duration: 0:00:00.405594


In [13]:
# os path join to avoid os-specific path issues

# get csv base df
base_retails_df = pd.read_csv(
    os.path.join(
        cwd_path,
        'ingestion',
        'Invoices_Year_2009-2010.csv'
    ),
    sep=',',
    encoding='latin1',
    low_memory=False
)

### 0 - Briefly check the data

##### A - dataset base info

In [14]:
base_retails_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525461 entries, 0 to 525460
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Invoice      525461 non-null  object 
 1   StockCode    525461 non-null  object 
 2   Description  522533 non-null  object 
 3   Quantity     525461 non-null  int64  
 4   InvoiceDate  525461 non-null  object 
 5   Price        525439 non-null  float64
 6   Customer ID  417541 non-null  object 
 7   Country      525430 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 32.1+ MB


##### B - understanding data variations

In [15]:
base_retails_df.describe()

Unnamed: 0,Quantity,Price
count,525461.0,525439.0
mean,10.337667,4.688669
std,107.42411,146.130044
min,-9600.0,-53594.36
25%,1.0,1.25
50%,3.0,2.1
75%,10.0,4.21
max,19152.0,25111.09


In [16]:
base_retails_df.isnull().sum()

Invoice             0
StockCode           0
Description      2928
Quantity            0
InvoiceDate         0
Price              22
Customer ID    107920
Country            31
dtype: int64

##### C - Understanding base memory usage

In [17]:
base_memory_usage = base_retails_df.reset_index(drop=True).memory_usage(deep=True)
format_base_memory_usage = base_memory_usage.sum() / (1024 * 1024)
bg_logger.info(f'{format_base_memory_usage:.2f} MB')
base_memory_usage

2024-11-24T20:34:45 - INFO - [<module>] - [989d5d8c-2284-45a2-b033-0b1e4cfb1c92] - 188.98 MB


Index               132
Invoice        28910564
StockCode      28447549
Description    39656495
Quantity        4203688
InvoiceDate    33927016
Price           4203688
Customer ID    26000647
Country        32814455
dtype: int64

##### D - Overview Top and Bottom data

In [18]:
base_retails_df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,12/01/2009 07:45,6.95,13085,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,12/01/2009 07:45,6.75,13085,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,12/01/2009 07:45,6.75,13085,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,12/01/2009 07:45,2.1,13085,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,12/01/2009 07:45,1.25,13085,United Kingdom


In [19]:
base_retails_df.tail()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
525456,538171,22271,FELTCRAFT DOLL ROSIE,2,12/09/2010 20:01,2.95,17530,United Kingdom
525457,538171,22750,FELTCRAFT PRINCESS LOLA DOLL,1,12/09/2010 20:01,3.75,17530,United Kingdom
525458,538171,22751,FELTCRAFT PRINCESS OLIVIA DOLL,1,12/09/2010 20:01,3.75,17530,United Kingdom
525459,538171,20970,PINK FLORAL FELTCRAFT SHOULDER BAG,2,12/09/2010 20:01,3.75,17530,United Kingdom
525460,538171,21931,JUMBO STORAGE BAG SUKI,2,12/09/2010 20:01,1.95,17530,United Kingdom


---
Based on the top and bottom data, it's likely a general retail store. I'll maintain the previously defined generic 'retail' nomenclature.

##### E - Reviewing data uniqueness

In [20]:
base_retails_df.nunique()

Invoice        28816
StockCode       4632
Description     4681
Quantity         825
InvoiceDate    25296
Price           1607
Customer ID     4384
Country           41
dtype: int64

### 1 - Going deeper into base errors and data types

##### A - Specialized Data Corrections

In [21]:
retails_df_stage_I = base_retails_df.copy()

---
Stage 0 - Data specific and column corrections

> Country column - understanding overall logic

In [22]:
list(retails_df_stage_I['Country'].unique())

['United Kingdom',
 'France',
 'USA',
 'Belgium',
 'Australia',
 'EIRE',
 'Germany',
 'Portugal',
 'Japan',
 'Denmark',
 'Nigeria',
 'Netherlands',
 'Poland',
 'Spain',
 'Channel Islands',
 'Italy',
 'Cyprus',
 'Greece',
 'Norway',
 'Austria',
 'Sweden',
 'United Arab Emirates',
 'Finland',
 'Switzerland',
 'Unspecified',
 'Malta',
 'Bahrain',
 'RSA',
 'Bermuda',
 'Hong Kong',
 'Singapore',
 'Thailand',
 'Israel',
 'Lithuania',
 nan,
 'West Indies',
 'Lebanon',
 'Korea',
 'Brazil',
 'Canada',
 'Iceland',
 'U.K.']

> Country column - Cleaning and mapping correct values

In [23]:
# sanitize data
retails_df_stage_I['Country'] = retails_df_stage_I['Country'].apply(sanitize_text)

In [24]:
# correct acronyms and normalizing location names
retails_df_stage_I['Country'] = retails_df_stage_I['Country'].replace(NORMATIZE_LOCATION_MAP)

In [25]:
# overall data consistency
retails_df_stage_I['Country'].value_counts(normalize=True) * 100

Country
United Kingdom          92.463316
Ireland                  1.838685
Germany                  1.547114
France                   1.098529
Netherlands              0.526997
Spain                    0.243229
Switzerland              0.225910
Portugal                 0.209543
Belgium                  0.200598
Channel Islands          0.172430
Sweden                   0.171669
Italy                    0.139124
Australia                0.124469
Cyprus                   0.105437
Austria                  0.102202
Greece                   0.098396
United Arab Emirates     0.082218
Denmark                  0.081457
Norway                   0.070228
Finland                  0.067373
co0z0                    0.058999
United States            0.046438
Japan                    0.042632
Poland                   0.036922
Malta                    0.032735
Lithuania                0.029309
Singapore                0.022267
South Africa             0.021126
Bahrain                  0.020364
Canada

> Customer ID column - understanding overall logic

In [26]:
# validating customer id format, to understand if all values are alphanumeric
pattern = r'[a-z]'
retails_df_stage_I[retails_df_stage_I['Customer ID'].str.contains(pattern, na=False, regex=True, case=False)]

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
176263,506141,21670,BLUE SPOT CERAMIC DRAWER KNOB,1,4/27/2010 16:16,0.0,TEST,United Kingdom
176264,506141,79160,HEART SHAPE WIRELESS DOORBELL,1,4/27/2010 16:16,0.0,TEST,United Kingdom
176265,506141,90112,PINK DOLLY HAIR CLIPS,1,4/27/2010 16:16,0.0,TEST,United Kingdom
176266,506141,90100,NECKLACE+BRACELET SET PINK DAISY,1,4/27/2010 16:16,0.0,TEST,United Kingdom
176267,506141,85226A,WHITE/BLUE PULL BACK RACING CAR,1,4/27/2010 16:16,0.0,TEST,United Kingdom
176268,506141,21890,S/6 WOODEN SKITTLES IN COTTON BAG,1,4/27/2010 16:16,0.0,TEST,United Kingdom
176269,506141,21826,EIGHT PIECE DINOSAUR SET,1,4/27/2010 16:16,-100.0,TEST,United Kingdom


\+  Search for test in all columns

> Price column - understanding overall logic

In [27]:
retails_df_stage_I['Price'].describe()

count    525439.000000
mean          4.688669
std         146.130044
min      -53594.360000
25%           1.250000
50%           2.100000
75%           4.210000
max       25111.090000
Name: Price, dtype: float64

In [28]:
# checking values that are less than or equal to 0
non_and_zero_prices_df = retails_df_stage_I.query("Price <= 0")

In [29]:
non_and_zero_prices_df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
263,489464,21733,85123a mixed,-96,12/01/2009 10:52,0.0,,United Kingdom
283,489463,71477,short,-240,12/01/2009 10:52,0.0,,United Kingdom
284,489467,85123A,21733 mixed,-192,12/01/2009 10:53,0.0,,United Kingdom
470,489521,21646,,-50,12/01/2009 11:44,0.0,,United Kingdom
3114,489655,20683,,-44,12/01/2009 17:26,0.0,,United Kingdom


In [30]:
non_and_zero_prices_df.nunique()

Invoice        3350
StockCode      2197
Description     381
Quantity        514
InvoiceDate    2295
Price             5
Customer ID      25
Country           5
dtype: int64

In [31]:
non_and_zero_prices_df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
263,489464,21733,85123a mixed,-96,12/01/2009 10:52,0.0,,United Kingdom
283,489463,71477,short,-240,12/01/2009 10:52,0.0,,United Kingdom
284,489467,85123A,21733 mixed,-192,12/01/2009 10:53,0.0,,United Kingdom
470,489521,21646,,-50,12/01/2009 11:44,0.0,,United Kingdom
3114,489655,20683,,-44,12/01/2009 17:26,0.0,,United Kingdom


- What does "mixed" mean? Is it a `StockCode` that I should merge to understand more deeply?
- What does "short" mean?
- Could "Dotcom sales" refer to online sales for a specific product? If so, are we dealing with a hybrid store model?
    - If yes, then it’s not just related to a product description, but rather the entire sale description.

\+  Search for test in all columns  
\+  Search for Dotcom in all columns  

In [32]:
# checking values that are greater than 0
possile_effective_sales = retails_df_stage_I.query("Price > 0")

In [33]:
# checking possible obvious returns: lines priced at or below zero that share
# an invoice and stock code with a positively priced line
possible_product_returns = non_and_zero_prices_df.merge(
    possile_effective_sales,
    how='inner',
    on=['Invoice', 'StockCode'],
    suffixes=('_return', '_sale')
)

# Release the positive-price frame. It is created under the (misspelled) name
# `possile_effective_sales`, so that exact name has to be deleted; deleting
# `possible_effective_sales` would leave the real frame in memory.
del possile_effective_sales

In [34]:
# possible returns
possible_product_returns.head(1)

Unnamed: 0,Invoice,StockCode,Description_return,Quantity_return,InvoiceDate_return,Price_return,Customer ID_return,Country_return,Description_sale,Quantity_sale,InvoiceDate_sale,Price_sale,Customer ID_sale,Country_sale
0,492079,85042,ANTIQUE LILY FAIRY LIGHTS,8,12/15/2009 13:49,0.0,15070,United Kingdom,ANTIQUE LILY FAIRY LIGHTS,8,12/15/2009 13:49,4.95,15070,United Kingdom


In [35]:
possible_product_returns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Invoice             21 non-null     object 
 1   StockCode           21 non-null     object 
 2   Description_return  21 non-null     object 
 3   Quantity_return     21 non-null     int64  
 4   InvoiceDate_return  21 non-null     object 
 5   Price_return        21 non-null     float64
 6   Customer ID_return  4 non-null      object 
 7   Country_return      21 non-null     object 
 8   Description_sale    21 non-null     object 
 9   Quantity_sale       21 non-null     int64  
 10  InvoiceDate_sale    21 non-null     object 
 11  Price_sale          21 non-null     float64
 12  Customer ID_sale    4 non-null      object 
 13  Country_sale        21 non-null     object 
dtypes: float64(2), int64(2), object(10)
memory usage: 2.4+ KB


In [36]:
# flag possible returns: 1 when the row's invoice appears among the possible
# returns, 0 otherwise. Series.isin does the membership test in one
# vectorised pass instead of scanning the returns array once per row.
retails_df_stage_I['possible_product_return'] = (
    retails_df_stage_I['Invoice']
    .isin(possible_product_returns['Invoice'])
    .astype(int)
)

# the merge result is no longer needed
del possible_product_returns

In [37]:
non_and_zero_prices_df.sort_values(by='Price', ascending=True).head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
179403,A506401,B,Adjust bad debt,1,4/29/2010 13:36,-53594.36,,United Kingdom
276274,A516228,B,Adjust bad debt,1,7/19/2010 11:24,-44031.79,,United Kingdom
403472,A528059,B,Adjust bad debt,1,10/20/2010 12:04,-38925.87,,United Kingdom
176269,506141,21826,EIGHT PIECE DINOSAUR SET,1,4/27/2010 16:16,-100.0,TEST,United Kingdom
525234,538161,46000S,Dotcom sales,-100,12/09/2010 17:25,0.0,,United Kingdom


\+  Search for test in all columns  
\+  Search for Dotcom in all columns   
\+  Search for general 'adjust' in all columns    

In [38]:
# understanding the adjustment pattern related to price debts/fees/credits, etc.
pattern = r'adjust | fee |credit|debit'
retails_df_stage_I[retails_df_stage_I['Description'].str.contains(pattern, na=False, regex=True, case=False)]

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,possible_product_return
179403,A506401,B,Adjust bad debt,1,4/29/2010 13:36,-53594.36,,United Kingdom,0
253702,513826,47566B,stock credited from royal yacht inc,-144,6/29/2010 10:50,0.0,,United Kingdom,0
276274,A516228,B,Adjust bad debt,1,7/19/2010 11:24,-44031.79,,United Kingdom,0
363492,524570,84925D,incorrect credit,-372,9/29/2010 15:33,0.0,,United Kingdom,0
363495,524572,84924D,eurobargain invc/credit,372,9/29/2010 15:34,0.0,,United Kingdom,0
403472,A528059,B,Adjust bad debt,1,10/20/2010 12:04,-38925.87,,United Kingdom,0


\+  Search for test in all columns  
\+  Search for Dotcom in all columns   
\+  Search for general 'adjust' in all columns    
\+  Search for general 'credit' in all columns    
\+  Search for general 'debit' in all columns    
\+  Search for general 'fee' in all columns    
\+  Search for general 'eurobargain' in all columns    

In [39]:
# flag financial details on df: 1 when the description mentions an
# adjustment, fee, credit or debit (case-insensitive), 0 otherwise
pattern = r'adjust | fee |credit|debit'

# Vectorised equivalent of a per-row re.search; na=False makes missing
# descriptions count as "no match".
retails_df_stage_I['financial_details'] = (
    retails_df_stage_I['Description']
    .str.contains(pattern, case=False, regex=True, na=False)
    .astype(int)
)

In [40]:
retails_df_stage_I.nunique()

Invoice                    28816
StockCode                   4632
Description                 4681
Quantity                     825
InvoiceDate                25296
Price                       1607
Customer ID                 4384
Country                       40
possible_product_return        2
financial_details              2
dtype: int64

\+  Search for 'test' in all columns  
\+  Search for 'Dotcom' in all columns   
\+  Search for general 'adjust' in all columns    
\+  Search for general 'credit' in all columns    
\+  Search for general 'debit' in all columns    
\+  Search for general 'fee' in all columns    
\+  Search for general 'eurobargain' in all columns    
\+  Search for general non-alphanumeric in description    

In [None]:
# tag possible financial details: 1 when the row's invoice contains at least
# one line priced at or below zero, 0 otherwise. Series.isin replaces the
# per-row scan of the whole invoice array.
retails_df_stage_I['possible_financial_details'] = (
    retails_df_stage_I['Invoice']
    .isin(non_and_zero_prices_df['Invoice'])
    .astype(int)
)

# the filtered frame is no longer needed
del non_and_zero_prices_df

> Understanding Quantity

In [None]:
retails_df_stage_I['Quantity'].value_counts()

> Understanding stock code

In [None]:
pattern = r'[a-z]'
filtered_stock_code = retails_df_stage_I[retails_df_stage_I['StockCode'].str.contains(pattern, na=False, regex=True, case=False)]

In [None]:
# understanding patterns on non-numeric stock codes
filtered_stock_code = filtered_stock_code.copy()
filtered_stock_code['code_len'] = filtered_stock_code['StockCode'].apply(len)

filtered_stock_code['code_len'].value_counts()

In [None]:
filtered_stock_code = filtered_stock_code.sort_values(by='code_len', ascending=False)

In [None]:
filtered_stock_code.query("code_len == 1").head()

- AMAZON FEE
- 

In [None]:
filtered_stock_code.query("code_len == 7")['StockCode'].value_counts()

In [None]:
# understanding overall test data

pattern = 'test'

retails_df_stage_I[
    retails_df_stage_I.astype(str).apply(lambda col: col.str.contains(pattern, case=False, regex=True)).any(axis=1)
]

ADJUST2  
test mapped

In [None]:
# fact: adjustment_sales
pattern = 'adjust'
adjustment_sales = retails_df_stage_I[
    retails_df_stage_I.astype(str).apply(lambda col: col.str.contains(pattern, case=False, regex=True)).any(axis=1)
]

In [None]:
# NOTE(review): `outliers` is not assigned in any cell visible above, so this
# raises NameError on a fresh "Restart & Run All" - TODO define it or drop the cell.
outliers.describe()

In [None]:
# NOTE(review): `non_outliers` is not assigned in any cell visible above, so this
# raises NameError on a fresh "Restart & Run All" - TODO define it or drop the cell.
non_outliers

> Search for debt/credit in all columns

In [None]:
pattern = 'debt|credit'

retails_df_stage_I[
    retails_df_stage_I.astype(str)
    .apply(lambda col: col.str.contains(pattern, case=False, regex=True))
    .any(axis=1)
]

In [None]:
pattern = 'test'

retails_df_stage_I[
    retails_df_stage_I.astype(str)
    .apply(lambda col: col.str.contains(pattern, case=False, regex=True))
    .any(axis=1)
]

In [None]:
# Rows without a customer id, ordered by description. The stage I frame still
# carries the original column labels ('Customer ID', 'Description'); the
# snake_case names only exist on the stage II frame.
retails_df_stage_I[retails_df_stage_I['Customer ID'].isnull()].sort_values(by='Description')

---

lineage save: stage 1

In [None]:
retails_lineage_I_path = os.path.join(
    cwd_path,
    'retails_lineage_I.parquet'
)

In [None]:
retails_df_stage_I.to_parquet(
    retails_lineage_I_path,
    index=False,
    compression='snappy'
)

In [None]:
# NOTE(review): `retails_df_stage` is not assigned in any cell visible above;
# these two lines bind the name to None and delete it again, so no DataFrame
# is released. Presumably `retails_df_stage_I` was intended - confirm before
# changing, because later cells still read `retails_df_stage_I`.
retails_df_stage = None
del retails_df_stage

##### N - Overall data types to consistent formats

In [None]:
retails_df_stage_II = pd.read_parquet(retails_lineage_I_path)
# base_retails_df = None
# del base_retails_df

In [None]:
# column name normalization
# An explicit mapping is used instead of assigning a list to `.columns`: the
# stage 1 frame also carries the flag columns added earlier
# (possible_product_return, financial_details, possible_financial_details),
# so a positional list of 8 names no longer matches the number of columns.
# Columns not listed here keep their names.
retails_df_stage_II = retails_df_stage_II.rename(
    columns={
        'Invoice': 'invoice',
        'StockCode': 'stock_code',
        'Description': 'description',
        'Quantity': 'quantity',
        'InvoiceDate': 'invoice_date',
        'Price': 'price',
        'Customer ID': 'customer_id',
        # the country column is named "location" from this stage on
        'Country': 'location',
    }
)

In [None]:
# base dtypes

# formatting dtypes on data: strip surrounding whitespace from every text column.
# The country column was renamed to 'location' in the previous step, so that
# is the label that must be used here.
for text_column in ('invoice', 'stock_code', 'description', 'customer_id', 'location'):
    retails_df_stage_II[text_column] = sanitize_column_data(
        bg_logger, retails_df_stage_II, text_column
    )

In [None]:
# checking stage corrections (the corrections above were applied to the
# stage II frame, so that is the frame to inspect)
retails_df_stage_II.info()

In [None]:
# specialized DTYPES
# Read from the stage II frame: the lower-case column names only exist there
# (the stage I frame still uses 'Quantity' / 'Price').
# errors='coerce' turns values that cannot be parsed into NaN.
retails_df_stage_II['quantity'] = pd.to_numeric(retails_df_stage_II['quantity'], errors='coerce')
retails_df_stage_II['price'] = pd.to_numeric(retails_df_stage_II['price'], errors='coerce')

In [None]:
# checking stage corrections
retails_df_stage_II.info()

In [None]:
# treating different date formats and converting to ISO 8601
# Both steps operate on the stage II frame: 'invoice_date' does not exist on
# the stage I frame, and the parsed datetimes must be the ones formatted.
# Values that cannot be parsed become NaT (errors='coerce').
retails_df_stage_II['invoice_date'] = (
    pd.to_datetime(retails_df_stage_II['invoice_date'], errors='coerce')
    .dt.strftime('%Y-%m-%dT%H:%M:%S')
)

In [None]:
retails_df_stage_II.info()

In [None]:
# Per-column memory (in MB) of the transformed stage II frame, compared with
# the footprint recorded for the raw frame at the start of the notebook.
# Measuring the stage I frame here would not reflect the dtype changes.
memory_usage = retails_df_stage_II.reset_index(drop=True).memory_usage(deep=True) / (1024 * 1024)
bg_logger.info('Old memory usage: %.2f MB x New memory usage: %.2f MB', format_base_memory_usage.sum(), memory_usage.sum())

---
Stage I - Dtypes base format

In [None]:
# removing object memory references
memory_usage = None
base_memory_usage = None
format_base_memory_usage = None

# removing from scope
del memory_usage, base_memory_usage, format_base_memory_usage

> Overall data consistency

In [None]:
retails_df_stage_II.describe()

In [None]:
retails_df_stage_II.info()

In [None]:
retails_df_stage_II.describe()

---

lineage save: stage 2

In [None]:
# Destination of the stage 2 lineage snapshot. It needs its own file name:
# reusing 'retails_lineage_I.parquet' would overwrite the stage 1 snapshot.
retails_df_stage_II_path = os.path.join(
    cwd_path,
    'retails_lineage_II.parquet'
)

In [None]:
retails_df_stage_II.to_parquet(
    retails_df_stage_II_path,
    index=False,
    compression='snappy'
)

In [None]:
retails_df_stage_II_path = None
del retails_df_stage_II

0   Invoice      525461 non-null  object   
1   StockCode    525461 non-null  object   
2   Description  522533 non-null  object   
3   Quantity     525461 non-null  int64    
4   InvoiceDate  525461 non-null  object   
5   Price        525439 non-null  float64  
6   Customer ID  417541 non-null  object   
7   Country      525430 non-null  object   

# Warehouse definitions