# requirements

In [1]:
# !pip install sqlalchemy
# !pip install py7zr
# !pip install pandas
# !pip install pyarrow
# !pip install py7zr sqlalchemy pandas pyarrow

# Imports

In [None]:
import os
import logging
import uuid
from datetime import (
    datetime,
    timezone
)
import re
import unicodedata

In [None]:
# import sqlalchemy as sa
import pandas as pd
import py7zr

# Utils & functions

### logger

In [None]:
# Logger
class __UUIDFilter(logging.Filter):
    """
    Internal utility filter that stamps every log record with a fresh UUID4.

    The value is stored on ``record.uuid4`` so a formatter can reference it
    through ``%(uuid4)s``.
    """
    def filter(self, record):
        record.uuid4 = uuid.uuid4()
        # Never drop the record; this filter only enriches it.
        return True

def create_logger(log_file=None):
    """
    Creates (or reuses) a logger that writes to the console and, optionally, to a file.

    The function is safe to call more than once (e.g. when a notebook cell is
    re-run): handlers and the UUID filter are attached only when an equivalent
    one is not already present, so messages are not emitted multiple times.

    Timestamps are rendered in UTC and labelled with an explicit ``+0000``
    offset.

    :param log_file: Name of the log file. When falsy, only the console handler is used.
    :return: Configured logger.
    """
    # Logger creation (logging.getLogger returns the same object for the same name)
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    # Custom log format with UTC timestamp and UUID4.
    # The offset is written literally: strftime's %z would report the machine's
    # local offset even though the clock time below is UTC.
    log_format = logging.Formatter(
        '%(asctime)s - %(levelname)s - [%(funcName)s] - [%(uuid4)s] - %(message)s',
        datefmt="%Y-%m-%dT%H:%M:%S+0000"
    )

    # Convert the record's own creation time to UTC. Set on this formatter
    # instance only, so other formatters in the process are not affected.
    log_format.converter = lambda timestamp: datetime.fromtimestamp(
        timestamp, timezone.utc
    ).timetuple()

    # Attach the UUID filter once
    if not any(isinstance(f, __UUIDFilter) for f in logger.filters):
        logger.addFilter(__UUIDFilter())

    # File writer handler (one per distinct target file)
    if log_file:
        target_file = os.path.abspath(log_file)
        has_file_handler = any(
            isinstance(h, logging.FileHandler) and h.baseFilename == target_file
            for h in logger.handlers
        )
        if not has_file_handler:
            file_handler = logging.FileHandler(log_file)
            file_handler.setLevel(logging.DEBUG)
            file_handler.setFormatter(log_format)
            logger.addHandler(file_handler)

    # Console handler (exact type check: FileHandler subclasses StreamHandler)
    has_console_handler = any(
        type(h) is logging.StreamHandler for h in logger.handlers
    )
    if not has_console_handler:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.DEBUG)
        console_handler.setFormatter(log_format)
        logger.addHandler(console_handler)

    return logger

### extract_7z

In [None]:
def extract_7z(bg_logger, file_path):
    """
    Extracts a 7z file to the same directory as the compressed file.

    :param bg_logger: initialized logger, used to report the target directory
        and how long the extraction took.
    :param file_path: Path to the 7z file to extract.
    """
    # Record start time
    start_time = datetime.now()

    # Get extraction directory. A bare file name has an empty dirname, so fall
    # back to the current directory to keep the path and the log message meaningful.
    extract_dir = os.path.dirname(file_path) or os.curdir

    # Extract the 7z file
    with py7zr.SevenZipFile(file_path, mode='r') as z:
        z.extractall(path=extract_dir)

    # Calculate extraction duration
    duration = datetime.now() - start_time

    # Log success message with extraction details
    bg_logger.info(
        "Data successfully extracted to directory %s. Duration: %s",
        extract_dir, duration
    )


### sanitize_column_data

In [None]:
def sanitize_column_data(bg_logger, df, column, c_dtype=str, fillna_v='0z0'):
    """
    Fills missing values in a column, trims surrounding whitespace from string
    values and casts the column to the requested type.

    The column is updated in place on ``df`` and also returned.

    Args:
        bg_logger: Logger instance for logging.
        df: DataFrame containing the data.
        column: Column to be transformed.
        c_dtype: Target data type (default is str).
        fillna_v: Value to fill NaN values (default is '0z0').
    Returns:
        The transformed column.
    """
    start_time = datetime.now()

    filled = df[column].fillna(fillna_v)
    if c_dtype is str:
        # Cast before stripping: the .str accessor would turn non-string
        # values into NaN and is unavailable on non-object columns.
        cleaned = filled.astype(str).str.strip()
    else:
        # Trim string values first so that padded text can be cast.
        cleaned = filled.map(
            lambda value: value.strip() if isinstance(value, str) else value
        ).astype(c_dtype)
    df[column] = cleaned

    bg_logger.info(
        "Specializing column data '%s' to '%s'. It took %s",
        column, str(c_dtype), str(datetime.now() - start_time)
    )
    return df[column]

### sanitize_text

In [None]:
def sanitize_text(text):
    """
    Produces a plain ASCII alphanumeric version of a string.

    Steps, in order:
    - Accents are stripped by decomposing the text (NFD) and discarding the
      combining marks, so only characters that decompose into a base letter
      plus a mark are transliterated. Letters with no such decomposition are
      removed by the next step; broader language support would need a custom
      mapping or a dedicated library.
    - Every character that is not an ASCII letter, a digit or whitespace is removed.
    - Runs of whitespace collapse to a single space and the ends are trimmed.

    Args:
        text (str): The input string to normalize.

    Returns:
        str: The normalized text. Non-string inputs are returned unchanged.
    """
    if isinstance(text, str):
        # 1. Decompose, then drop the combining marks (category 'Mn')
        decomposed = unicodedata.normalize('NFD', text)
        without_marks = ''.join(
            filter(lambda ch: unicodedata.category(ch) != 'Mn', decomposed)
        )

        # 2. Keep only ASCII letters, digits and whitespace
        alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', without_marks)

        # 3. Squeeze whitespace runs and trim
        return re.sub(r'\s+', ' ', alnum_only).strip()

    # Not a string: hand it back untouched
    return text

### data maps

In [None]:
# Lookup of country acronyms/aliases to the single name used downstream.
# Keys are matched as whole values (the notebook applies it with Series.replace).
NORMATIZE_LOCATION_MAP = {
    "USA": "United States",
    "US": "United States",
    "UK": "United Kingdom",
    "EIRE": "Ireland",
    "RSA": "South Africa",
    # NOTE(review): 'co0z0' looks like a placeholder code for an unknown country — confirm.
    "Unspecified": "co0z0",
}

# Pipeline

### Logger init

In [None]:
# initialize logger
bg_logger = create_logger()

### base path reference

In [34]:
# Get the current working directory
cwd_path = os.getcwd()

## Base data process

### Importing base DF

In [35]:
base_df_path = os.path.join(
    cwd_path,
    '_ingestion',
    'Invoices_Year_2009-2010.7z'
)

In [36]:
extract_7z(bg_logger, base_df_path)

2024-11-23T19:32:54-0300 - INFO - [extract_7z] - [bdc1adf9-bdde-42ea-81a8-6a6d27c43708] - Data successfully extracted to directory c:\Users\mathu\Desktop\projetos\A\betsson_group\_ingestion. Duration: 0:00:00.516834
2024-11-23T19:32:54-0300 - INFO - [extract_7z] - [bdc1adf9-bdde-42ea-81a8-6a6d27c43708] - Data successfully extracted to directory c:\Users\mathu\Desktop\projetos\A\betsson_group\_ingestion. Duration: 0:00:00.516834


In [37]:
# os path join to avoid os-specific path issues

# get csv base df
base_retails_df = pd.read_csv(
    os.path.join(
        cwd_path,
        '_ingestion',
        'Invoices_Year_2009-2010.csv'
    ),
    sep=',',
    encoding='latin1',
    low_memory=False
)

### 0 - Briefly check the data

##### A - dataset base info

In [38]:
base_retails_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525461 entries, 0 to 525460
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Invoice      525461 non-null  object 
 1   StockCode    525461 non-null  object 
 2   Description  522533 non-null  object 
 3   Quantity     525461 non-null  int64  
 4   InvoiceDate  525461 non-null  object 
 5   Price        525439 non-null  float64
 6   Customer ID  417541 non-null  object 
 7   Country      525430 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 32.1+ MB


##### B - understanding data variations

In [39]:
base_retails_df.describe()

Unnamed: 0,Quantity,Price
count,525461.0,525439.0
mean,10.337667,4.688669
std,107.42411,146.130044
min,-9600.0,-53594.36
25%,1.0,1.25
50%,3.0,2.1
75%,10.0,4.21
max,19152.0,25111.09


In [40]:
base_retails_df.isnull().sum()

Invoice             0
StockCode           0
Description      2928
Quantity            0
InvoiceDate         0
Price              22
Customer ID    107920
Country            31
dtype: int64

##### C - Understanding base memory usage

In [41]:
base_memory_usage = base_retails_df.reset_index(drop=True).memory_usage(deep=True)
format_base_memory_usage = base_memory_usage.sum() / (1024 * 1024)
bg_logger.info(f'{format_base_memory_usage:.2f} MB')
base_memory_usage

2024-11-23T19:33:43-0300 - INFO - [<module>] - [20de9223-6f64-4fac-8092-34f6faee4353] - 212.19 MB
2024-11-23T19:33:43-0300 - INFO - [<module>] - [20de9223-6f64-4fac-8092-34f6faee4353] - 212.19 MB


Index               128
Invoice        33114252
StockCode      32651237
Description    43837135
Quantity        4203688
InvoiceDate    38130704
Price           4203688
Customer ID    29340975
Country        37017895
dtype: int64

##### D - Overview Top and Bottom data

In [42]:
base_retails_df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,12/01/2009 07:45,6.95,13085,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,12/01/2009 07:45,6.75,13085,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,12/01/2009 07:45,6.75,13085,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,12/01/2009 07:45,2.1,13085,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,12/01/2009 07:45,1.25,13085,United Kingdom


In [43]:
base_retails_df.tail()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
525456,538171,22271,FELTCRAFT DOLL ROSIE,2,12/09/2010 20:01,2.95,17530,United Kingdom
525457,538171,22750,FELTCRAFT PRINCESS LOLA DOLL,1,12/09/2010 20:01,3.75,17530,United Kingdom
525458,538171,22751,FELTCRAFT PRINCESS OLIVIA DOLL,1,12/09/2010 20:01,3.75,17530,United Kingdom
525459,538171,20970,PINK FLORAL FELTCRAFT SHOULDER BAG,2,12/09/2010 20:01,3.75,17530,United Kingdom
525460,538171,21931,JUMBO STORAGE BAG SUKI,2,12/09/2010 20:01,1.95,17530,United Kingdom


---
Based on the top and bottom data, it's likely a general retail store. I'll maintain the previously defined generic 'retail' nomenclature.

##### E - Reviewing data uniqueness

In [44]:
base_retails_df.nunique()

Invoice        28816
StockCode       4632
Description     4681
Quantity         825
InvoiceDate    25296
Price           1607
Customer ID     4384
Country           41
dtype: int64

In [45]:
list(base_retails_df['Country'].unique())

['United Kingdom',
 'France',
 'USA',
 'Belgium',
 'Australia',
 'EIRE',
 'Germany',
 'Portugal',
 'Japan',
 'Denmark',
 'Nigeria',
 'Netherlands',
 'Poland',
 'Spain',
 'Channel Islands',
 'Italy',
 'Cyprus',
 'Greece',
 'Norway',
 'Austria',
 'Sweden',
 'United Arab Emirates',
 'Finland',
 'Switzerland',
 'Unspecified',
 'Malta',
 'Bahrain',
 'RSA',
 'Bermuda',
 'Hong Kong',
 'Singapore',
 'Thailand',
 'Israel',
 'Lithuania',
 nan,
 'West Indies',
 'Lebanon',
 'Korea',
 'Brazil',
 'Canada',
 'Iceland',
 'U.K.']

### 1 - Going deeper into base errors and data types

##### A - Correcting data types to consistent formats

In [46]:
retails_df_stage = base_retails_df.copy()

# base_retails_df = None
# del base_retails_df

---
Stage 0 - Dtypes base format

In [None]:
# base dtypes

# Text/identifier columns: missing values are replaced with the literal
# 'Unspecified' and surrounding whitespace is stripped.
for text_column in ('Invoice', 'StockCode', 'Description', 'Customer ID', 'Country'):
    retails_df_stage[text_column] = sanitize_column_data(
        bg_logger, retails_df_stage, text_column, fillna_v='Unspecified'
    )

2024-11-23T19:33:48-0300 - INFO - [sanitize_column_data] - [19cdf634-dfc3-4943-9df6-fc941e32b38b] - Specializing column data 'Invoice' to '<class 'str'>'. It took 0:00:00.152145
2024-11-23T19:33:48-0300 - INFO - [sanitize_column_data] - [19cdf634-dfc3-4943-9df6-fc941e32b38b] - Specializing column data 'Invoice' to '<class 'str'>'. It took 0:00:00.152145
2024-11-23T19:33:48-0300 - INFO - [sanitize_column_data] - [e00ce3c2-90fe-491d-ba64-7f4a204e588e] - Specializing column data 'StockCode' to '<class 'str'>'. It took 0:00:00.157396
2024-11-23T19:33:48-0300 - INFO - [sanitize_column_data] - [e00ce3c2-90fe-491d-ba64-7f4a204e588e] - Specializing column data 'StockCode' to '<class 'str'>'. It took 0:00:00.157396
2024-11-23T19:33:48-0300 - INFO - [sanitize_column_data] - [150a24ff-015f-48e6-a094-70df933d2bac] - Specializing column data 'Description' to '<class 'str'>'. It took 0:00:00.179007
2024-11-23T19:33:48-0300 - INFO - [sanitize_column_data] - [150a24ff-015f-48e6-a094-70df933d2bac] - Sp

In [48]:
# checking stage corrections
retails_df_stage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525461 entries, 0 to 525460
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Invoice      525461 non-null  object 
 1   StockCode    525461 non-null  object 
 2   Description  525461 non-null  object 
 3   Quantity     525461 non-null  int64  
 4   InvoiceDate  525461 non-null  object 
 5   Price        525439 non-null  float64
 6   Customer ID  525461 non-null  object 
 7   Country      525461 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 32.1+ MB


In [49]:
# specialized DTYPES
retails_df_stage['Quantity'] = pd.to_numeric(retails_df_stage['Quantity'], errors='coerce')
retails_df_stage['Price'] = pd.to_numeric(retails_df_stage['Price'], errors='coerce')

In [50]:
# checking stage corrections
retails_df_stage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525461 entries, 0 to 525460
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Invoice      525461 non-null  object 
 1   StockCode    525461 non-null  object 
 2   Description  525461 non-null  object 
 3   Quantity     525461 non-null  int64  
 4   InvoiceDate  525461 non-null  object 
 5   Price        525439 non-null  float64
 6   Customer ID  525461 non-null  object 
 7   Country      525461 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 32.1+ MB


In [51]:
# treating different date formats and converting to ISO 8601
# errors='coerce' turns any value pandas cannot parse into NaT instead of raising.
# NOTE(review): no explicit format/dayfirst is passed, so ambiguous values such as
# '12/01/2009' rely on pandas' own inference — confirm it matches the source system.
retails_df_stage['InvoiceDate'] = pd.to_datetime(retails_df_stage['InvoiceDate'], errors='coerce')
# Re-serialise as 'YYYY-MM-DDTHH:MM:SS' text; after this line the column holds
# strings again, not datetime64 values.
retails_df_stage['InvoiceDate'] = retails_df_stage['InvoiceDate'].dt.strftime('%Y-%m-%dT%H:%M:%S')

In [52]:
retails_df_stage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525461 entries, 0 to 525460
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Invoice      525461 non-null  object 
 1   StockCode    525461 non-null  object 
 2   Description  525461 non-null  object 
 3   Quantity     525461 non-null  int64  
 4   InvoiceDate  525461 non-null  object 
 5   Price        525439 non-null  float64
 6   Customer ID  525461 non-null  object 
 7   Country      525461 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 32.1+ MB


In [53]:
memory_usage = retails_df_stage.reset_index(drop=True).memory_usage(deep=True) / (1024 * 1024)
bg_logger.info('Old memory usage: %.2f MB x New memory usage: %.2f MB', format_base_memory_usage.sum(), memory_usage.sum())

2024-11-23T19:34:29-0300 - INFO - [<module>] - [fa331cf5-e5dc-4816-97bd-05bc179c00a9] - Old memory usage: 212.19 MB x New memory usage: 217.62 MB
2024-11-23T19:34:29-0300 - INFO - [<module>] - [fa331cf5-e5dc-4816-97bd-05bc179c00a9] - Old memory usage: 212.19 MB x New memory usage: 217.62 MB


In [54]:
# Remove the memory-usage helpers from the notebook scope. `del` unbinds the
# names directly, so rebinding them to None first is unnecessary.
del memory_usage, base_memory_usage, format_base_memory_usage

##### B - Overall data consistency

In [55]:
retails_df_stage.describe()

Unnamed: 0,Quantity,Price
count,525461.0,525439.0
mean,10.337667,4.688669
std,107.42411,146.130044
min,-9600.0,-53594.36
25%,1.0,1.25
50%,3.0,2.1
75%,10.0,4.21
max,19152.0,25111.09


In [56]:
retails_df_stage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525461 entries, 0 to 525460
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Invoice      525461 non-null  object 
 1   StockCode    525461 non-null  object 
 2   Description  525461 non-null  object 
 3   Quantity     525461 non-null  int64  
 4   InvoiceDate  525461 non-null  object 
 5   Price        525439 non-null  float64
 6   Customer ID  525461 non-null  object 
 7   Country      525461 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 32.1+ MB


In [57]:
retails_df_stage.describe()

Unnamed: 0,Quantity,Price
count,525461.0,525439.0
mean,10.337667,4.688669
std,107.42411,146.130044
min,-9600.0,-53594.36
25%,1.0,1.25
50%,3.0,2.1
75%,10.0,4.21
max,19152.0,25111.09


---
checkpoint lineage: 0

In [58]:
retails_lineage_0_path = os.path.join(
    cwd_path,
    'dump',
    'retails_df_stage.parquet'
)

In [59]:
# Make sure the checkpoint directory exists; to_parquet does not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs(os.path.dirname(retails_lineage_0_path), exist_ok=True)

retails_df_stage.to_parquet(
    retails_lineage_0_path,
    index=False,
    compression='snappy'
)

In [60]:
# Release the staged frame; `del` alone unbinds the name.
del retails_df_stage

##### C - Specialized Data Corrections

In [61]:
retails_df_stage_I = pd.read_parquet(retails_lineage_0_path)

---
Stage 1 - Data-specific and column corrections

In [71]:
retails_df_stage_I.columns = [sanitize_text(col).lower().replace(' ', '_') for col in retails_df_stage_I.columns]
retails_df_stage_I.columns

Index(['invoice', 'stockcode', 'description', 'quantity', 'invoicedate',
       'price', 'customerid', 'country'],
      dtype='object')

> Country column - understanding overall logic

In [72]:
list(retails_df_stage_I['country'].unique())

['United Kingdom',
 'France',
 'United States',
 'Belgium',
 'Australia',
 'Ireland',
 'Germany',
 'Portugal',
 'Japan',
 'Denmark',
 'Nigeria',
 'Netherlands',
 'Poland',
 'Spain',
 'Channel Islands',
 'Italy',
 'Cyprus',
 'Greece',
 'Norway',
 'Austria',
 'Sweden',
 'United Arab Emirates',
 'Finland',
 'Switzerland',
 'co0z0',
 'Malta',
 'Bahrain',
 'South Africa',
 'Bermuda',
 'Hong Kong',
 'Singapore',
 'Thailand',
 'Israel',
 'Lithuania',
 'West Indies',
 'Lebanon',
 'Korea',
 'Brazil',
 'Canada',
 'Iceland']

> Country column - Cleaning and mapping correct values

In [None]:
# sanitize data
# The columns were renamed to lower case earlier in this stage, so the column
# is 'country'; reading 'Country' raises KeyError on a fresh run.
retails_df_stage_I['country'] = retails_df_stage_I['country'].apply(sanitize_text)

In [None]:
# correct acronyms and normalizing location names
# Read from the lower-cased 'country' column: 'Country' no longer exists after
# the column rename, and 'country' holds the values cleaned so far.
retails_df_stage_I['country'] = retails_df_stage_I['country'].replace(NORMATIZE_LOCATION_MAP)

In [None]:
# overall data consistency
retails_df_stage_I['country'].value_counts(normalize=True) * 100

Country
United Kingdom          92.457861
Ireland                  1.838576
Germany                  1.547023
France                   1.098464
Netherlands              0.526966
Spain                    0.243215
Switzerland              0.225897
Portugal                 0.209530
Belgium                  0.200586
Channel Islands          0.172420
Sweden                   0.171659
Italy                    0.139116
Australia                0.124462
Cyprus                   0.105431
Austria                  0.102196
Greece                   0.098390
United Arab Emirates     0.082214
Denmark                  0.081452
Norway                   0.070224
Finland                  0.067369
co0z0                    0.064895
United States            0.046435
Japan                    0.042629
Poland                   0.036920
Malta                    0.032733
Lithuania                0.029308
Singapore                0.022266
South Africa             0.021124
Bahrain                  0.020363
Canada

> Customer ID column - understanding overall logic

In [67]:
# 'Customer ID' becomes 'customer_id' once the column names are sanitized,
# lower-cased and their spaces replaced with underscores.
retails_df_stage_I['customer_id'].value_counts()

Customer ID
Unspecified    107920
14911            5710
17841            5114
14606            3927
14156            2710
                ...  
18246               1
13096               1
14914               1
16786               1
13222               1
Name: count, Length: 4385, dtype: int64

0   Invoice      525461 non-null  object 
1   StockCode    525461 non-null  object 
2   Description  522533 non-null  object 
3   Quantity     525461 non-null  int64  
4   InvoiceDate  525461 non-null  object 
5   Price        525439 non-null  float64
6   Customer ID  417541 non-null  object 
7   Country      525430 non-null  object 

# Warehouse definitions