# requirements

In [1]:
# !pip install sqlalchemy
# !pip install py7zr
# !pip install pandas
# !pip install pyarrow
# !pip install matplotlib
# !pip install py7zr sqlalchemy pandas pyarrow matplotlib

# Imports

In [2]:
import os
import logging
import uuid
from datetime import (
    datetime,
    timezone
)
import re
import unicodedata

In [3]:
# import sqlalchemy as sa
import pandas as pd
import py7zr

# Utils & functions

### logger

In [4]:
# Logger
class __UUIDFilter(logging.Filter):
    """
    Internal utils
    Filter that adds a UUID4 to the log record.
    """
    def filter(self, record):
        record.uuid4 = uuid.uuid4()
        return True

def create_logger(log_file=None):
    """
    Creates a logger that writes messages to a file and writes them to the console.

    :param log_file: Name of the log file.
    :return: Configured logger.
    """
    # Logger creation
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    # Custom log format with UTC timestamp and UUID4
    log_format = logging.Formatter(
        '%(asctime)s - %(levelname)s - [%(funcName)s] - [%(uuid4)s] - %(message)s',
        datefmt="%Y-%m-%dT%H:%M:%S%z"
    )

    # Force the UTC time to appear in all handlers
    logging.Formatter.converter = lambda *args: datetime.now(timezone.utc).timetuple()

    uuid_filter = __UUIDFilter()
    logger.addFilter(uuid_filter)

    # File writer handler
    if log_file:
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(log_format)
        logger.addHandler(file_handler)

    # Console handler
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG)
    console_handler.setFormatter(log_format)
    logger.addHandler(console_handler)

    return logger

### extract_7z

In [5]:
def extract_7z(bg_logger, file_path):
    """
    Extracts a 7z file to the same directory as the compressed file.

    :param bg_logger: initialized logger
    :param file_path: Path to the 7z file to extract.
    """
    # Record start time
    start_time = datetime.now()

    # Get extraction directory
    extract_dir = os.path.dirname(file_path)

    # Extract the 7z file
    with py7zr.SevenZipFile(file_path, mode='r') as z:
        z.extractall(path=extract_dir)

    # Calculate extraction duration
    duration = datetime.now() - start_time

    # Log success message with extraction details
    bg_logger.info(
        "Data successfully extracted to directory %s. Duration: %s",
        extract_dir, duration
    )
    extract_dir = None
    del extract_dir


### sanitize_column_data

In [6]:
def sanitize_column_data(bg_logger, df, column, c_dtype=str):
    """
    Corrects and specializes the data column format, replacing invalid values with NaN.
    Args:
        bg_logger: Logger instance for logging.
        df: DataFrame containing the data.
        column: Column to be transformed.
        c_dtype: Target data type (default is str).
    Returns:
        The transformed column.
    """
    start_time = datetime.now()
    df[column] = df[column].str.strip()

    bg_logger.info(
        "Specializing column data '%s' to '%s'. It took %s",
        column, str(c_dtype), str(datetime.now() - start_time)
    )
    return df[column]

### sanitize_text

In [7]:
def sanitize_text(text):
    """
    Normalizes text by:
    - Removing special characters
    - Replacing accented characters with their unaccented counterparts
        Note: This is a symbol-based scenario, meaning it only considers simple transformations.
        For words requiring special handling (e.g., unique characters in specific languages),
        we would need to either create a custom mapping or use a specialized
        library for broader support.
    - Removing extra spaces

    Args:
        text (str): The input string to normalize.

    Returns:
        str: The normalized text.
    """
    if not isinstance(text, str):
        return text  # Return as-is if not a string

    # 1. Remove accents
    text = ''.join(
        c for c in unicodedata.normalize('NFD', text) 
        if unicodedata.category(c) != 'Mn'
    )

    # 2. Remove special characters*
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # 3. Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

### data maps

In [8]:
NORMATIZE_LOCATION_MAP = {
    "USA": "United States",
    "US": "United States",
    "UK": "United Kingdom",
    "EIRE": "Ireland",
    "RSA": "South Africa",
}

In [9]:
CLOUD_LOST_PRODUCTS_WORDS = [
    'damage', 'wet', 'MIA', 'smashed', 'missing', 'missed',
    'lost', 'crushed', 'broken', 'bad quality',
    'discoloured', 'rotting', 'damp and rusty',
    'unsellable', 'dirty', 'display', 'cant find',
    'debt', 'wrong', '?????', 'donated', 'rusty' 'damges',
    'found', 'gone', 'temp', 'phil said so', 'error',
    'eurobargain', 'broken', 'poor quality', '?sold individually?',
]

# Base Pipeline

### Logger init

In [10]:
# initialize logger
bg_logger = create_logger()

### base path reference

In [11]:
# Get the current working directory
cwd_path = os.getcwd()

## Base data process

### Importing base DF

In [12]:
base_df_path = os.path.join(
    cwd_path,
    'ingestion',
    'Invoices_Year_2009-2010.7z'
)

In [13]:
extract_7z(bg_logger, base_df_path)

2024-11-26T02:30:29 - INFO - [extract_7z] - [13067901-6470-466d-9faa-83f1682b2640] - Data successfully extracted to directory /workspaces/betsson_group/ingestion. Duration: 0:00:00.362010


In [14]:
# os path join to avoid os-specific path issues

# get csv base df
base_retails_df = pd.read_csv(
    os.path.join(
        cwd_path,
        'ingestion',
        'Invoices_Year_2009-2010.csv'
    ),
    sep=',',
    encoding='latin1',
    low_memory=False
)

### 0 - Briefly check the data

##### A - dataset base info

In [15]:
base_retails_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525461 entries, 0 to 525460
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Invoice      525461 non-null  object 
 1   StockCode    525461 non-null  object 
 2   Description  522533 non-null  object 
 3   Quantity     525461 non-null  int64  
 4   InvoiceDate  525461 non-null  object 
 5   Price        525439 non-null  float64
 6   Customer ID  417541 non-null  object 
 7   Country      525430 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 32.1+ MB


##### B - understanding data variations

In [16]:
base_retails_df.describe()

Unnamed: 0,Quantity,Price
count,525461.0,525439.0
mean,10.337667,4.688669
std,107.42411,146.130044
min,-9600.0,-53594.36
25%,1.0,1.25
50%,3.0,2.1
75%,10.0,4.21
max,19152.0,25111.09


In [17]:
base_retails_df.isnull().sum()

Invoice             0
StockCode           0
Description      2928
Quantity            0
InvoiceDate         0
Price              22
Customer ID    107920
Country            31
dtype: int64

##### C - Understanding base memory usage

In [18]:
base_memory_usage = base_retails_df.reset_index(drop=True).memory_usage(deep=True)
format_base_memory_usage = base_memory_usage.sum() / (1024 * 1024)
bg_logger.info(f'{format_base_memory_usage:.2f} MB')
base_memory_usage

2024-11-26T02:30:31 - INFO - [<module>] - [805650e0-f9f0-480f-9490-db4e917f1b89] - 188.98 MB


Index               132
Invoice        28910564
StockCode      28447549
Description    39656495
Quantity        4203688
InvoiceDate    33927016
Price           4203688
Customer ID    26000647
Country        32814455
dtype: int64

##### D - Overview Top and Bottom data

In [19]:
base_retails_df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,12/01/2009 07:45,6.95,13085,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,12/01/2009 07:45,6.75,13085,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,12/01/2009 07:45,6.75,13085,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,12/01/2009 07:45,2.1,13085,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,12/01/2009 07:45,1.25,13085,United Kingdom


In [20]:
base_retails_df.tail()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
525456,538171,22271,FELTCRAFT DOLL ROSIE,2,12/09/2010 20:01,2.95,17530,United Kingdom
525457,538171,22750,FELTCRAFT PRINCESS LOLA DOLL,1,12/09/2010 20:01,3.75,17530,United Kingdom
525458,538171,22751,FELTCRAFT PRINCESS OLIVIA DOLL,1,12/09/2010 20:01,3.75,17530,United Kingdom
525459,538171,20970,PINK FLORAL FELTCRAFT SHOULDER BAG,2,12/09/2010 20:01,3.75,17530,United Kingdom
525460,538171,21931,JUMBO STORAGE BAG SUKI,2,12/09/2010 20:01,1.95,17530,United Kingdom


---
Based on the top and bottom data, it's likely a general retail store. I'll maintain the previously defined generic 'retail' nomenclature.

##### E - Reviewing data uniqueness

In [21]:
base_retails_df.nunique()

Invoice        28816
StockCode       4632
Description     4681
Quantity         825
InvoiceDate    25296
Price           1607
Customer ID     4384
Country           41
dtype: int64

### 1 - Going deeper into base errors and data types

##### A - Specialized base Data Corrections

In [22]:
retails_df_stage_I = base_retails_df.copy()

---
Stage 1 - Data specific and column corrections

###### Country column - understanding overall logic

In [23]:
list(retails_df_stage_I['Country'].unique())

['United Kingdom',
 'France',
 'USA',
 'Belgium',
 'Australia',
 'EIRE',
 'Germany',
 'Portugal',
 'Japan',
 'Denmark',
 'Nigeria',
 'Netherlands',
 'Poland',
 'Spain',
 'Channel Islands',
 'Italy',
 'Cyprus',
 'Greece',
 'Norway',
 'Austria',
 'Sweden',
 'United Arab Emirates',
 'Finland',
 'Switzerland',
 'Unspecified',
 'Malta',
 'Bahrain',
 'RSA',
 'Bermuda',
 'Hong Kong',
 'Singapore',
 'Thailand',
 'Israel',
 'Lithuania',
 nan,
 'West Indies',
 'Lebanon',
 'Korea',
 'Brazil',
 'Canada',
 'Iceland',
 'U.K.']

> Country column - Cleaning and mapping correct values

In [24]:
# sanitize data
retails_df_stage_I['Country'] = retails_df_stage_I['Country'].apply(sanitize_text)

In [25]:
# correct acronyms and normalizing location names
retails_df_stage_I['Country'] = retails_df_stage_I['Country'].replace(NORMATIZE_LOCATION_MAP)

In [26]:
# overall data consistency
retails_df_stage_I['Country'].value_counts(normalize=True) * 100

Country
United Kingdom          92.463316
Ireland                  1.838685
Germany                  1.547114
France                   1.098529
Netherlands              0.526997
Spain                    0.243229
Switzerland              0.225910
Portugal                 0.209543
Belgium                  0.200598
Channel Islands          0.172430
Sweden                   0.171669
Italy                    0.139124
Australia                0.124469
Cyprus                   0.105437
Austria                  0.102202
Greece                   0.098396
United Arab Emirates     0.082218
Denmark                  0.081457
Norway                   0.070228
Finland                  0.067373
Unspecified              0.058999
United States            0.046438
Japan                    0.042632
Poland                   0.036922
Malta                    0.032735
Lithuania                0.029309
Singapore                0.022267
South Africa             0.021126
Bahrain                  0.020364
Canada

###### Customer ID column - understanding overall logic

In [27]:
# validating customer id format, to understand if all values are alphanumeric
pattern = r'^(?=.*[a-zA-Z])[a-zA-Z0-9]+$'
retails_df_stage_I[retails_df_stage_I['Customer ID'].str.contains(pattern, na=False, regex=True, case=False)]

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
176263,506141,21670,BLUE SPOT CERAMIC DRAWER KNOB,1,4/27/2010 16:16,0.0,TEST,United Kingdom
176264,506141,79160,HEART SHAPE WIRELESS DOORBELL,1,4/27/2010 16:16,0.0,TEST,United Kingdom
176265,506141,90112,PINK DOLLY HAIR CLIPS,1,4/27/2010 16:16,0.0,TEST,United Kingdom
176266,506141,90100,NECKLACE+BRACELET SET PINK DAISY,1,4/27/2010 16:16,0.0,TEST,United Kingdom
176267,506141,85226A,WHITE/BLUE PULL BACK RACING CAR,1,4/27/2010 16:16,0.0,TEST,United Kingdom
176268,506141,21890,S/6 WOODEN SKITTLES IN COTTON BAG,1,4/27/2010 16:16,0.0,TEST,United Kingdom
176269,506141,21826,EIGHT PIECE DINOSAUR SET,1,4/27/2010 16:16,-100.0,TEST,United Kingdom


In [28]:
# looking base cases, like 0, negative values or extreme cases like 999999
retails_df_stage_I[
    retails_df_stage_I['Customer ID'].astype(str).str.fullmatch(r'^(9+|0+)$')
].sort_values(by='Customer ID', ascending=False).head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country


In [29]:
null_clients = retails_df_stage_I[retails_df_stage_I['Customer ID'].isnull()]

In [30]:
null_clients.info()

<class 'pandas.core.frame.DataFrame'>
Index: 107920 entries, 263 to 525235
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Invoice      107920 non-null  object 
 1   StockCode    107920 non-null  object 
 2   Description  104992 non-null  object 
 3   Quantity     107920 non-null  int64  
 4   InvoiceDate  107920 non-null  object 
 5   Price        107920 non-null  float64
 6   Customer ID  0 non-null       object 
 7   Country      107898 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 7.4+ MB


In [31]:
# null clients with null descriptions
null_clients[null_clients['Description'].isnull()].nunique()

Invoice        2928
StockCode      1920
Description       0
Quantity        424
InvoiceDate    1943
Price             1
Customer ID       0
Country           1
dtype: int64

In [32]:
# flagging errors
retails_df_stage_I['entry_errors'] = (
    retails_df_stage_I['Description'].isnull() &
    retails_df_stage_I['Customer ID'].isnull() &
    (retails_df_stage_I['Price'] <= 0)
).astype(int)

null_clients = None
del null_clients

\+  Search for test in all columns

###### Price column - understanding overall logic

In [33]:
retails_df_stage_I['Price'].describe()

count    525439.000000
mean          4.688669
std         146.130044
min      -53594.360000
25%           1.250000
50%           2.100000
75%           4.210000
max       25111.090000
Name: Price, dtype: float64

In [34]:
# checking values that are less than or equal to 0
non_and_zero_prices_df = retails_df_stage_I.query("Price <= 0")

In [35]:
non_and_zero_prices_df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,entry_errors
263,489464,21733,85123a mixed,-96,12/01/2009 10:52,0.0,,United Kingdom,0
283,489463,71477,short,-240,12/01/2009 10:52,0.0,,United Kingdom,0
284,489467,85123A,21733 mixed,-192,12/01/2009 10:53,0.0,,United Kingdom,0
470,489521,21646,,-50,12/01/2009 11:44,0.0,,United Kingdom,1
3114,489655,20683,,-44,12/01/2009 17:26,0.0,,United Kingdom,1


In [36]:
non_and_zero_prices_df.nunique()

Invoice         3350
StockCode       2197
Description      381
Quantity         514
InvoiceDate     2295
Price              5
Customer ID       25
Country            5
entry_errors       2
dtype: int64

- What does "mixed" mean? Is it a `StockCode` that I should merge to understand more deeply?
- What does "short" mean?
- Could "Dotcom sales" refer to online sales for a specific product? If so, are we dealing with a hybrid store model?
    - If yes, then it’s not just related to a product description, but rather the entire sale description.

\+  Search for test in all columns  
\+  Search for Dotcom in all columns  

In [37]:
# checking values that are greater than 0
possile_effective_sales = retails_df_stage_I.query("Price > 0")

In [38]:
# checking possible obvious returns
possible_product_returns = non_and_zero_prices_df.merge(
    possile_effective_sales,
    on=['Invoice', 'StockCode'],
    suffixes=('_return', '_sale')
)

possible_effective_sales = None
del possible_effective_sales

In [39]:
possible_product_returns.head()

Unnamed: 0,Invoice,StockCode,Description_return,Quantity_return,InvoiceDate_return,Price_return,Customer ID_return,Country_return,entry_errors_return,Description_sale,Quantity_sale,InvoiceDate_sale,Price_sale,Customer ID_sale,Country_sale,entry_errors_sale
0,492079,85042,ANTIQUE LILY FAIRY LIGHTS,8,12/15/2009 13:49,0.0,15070.0,United Kingdom,0,ANTIQUE LILY FAIRY LIGHTS,8,12/15/2009 13:49,4.95,15070.0,United Kingdom,0
1,492760,21143,ANTIQUE GLASS HEART DECORATION,12,12/18/2009 14:22,0.0,18071.0,United Kingdom,0,ANTIQUE GLASS HEART DECORATION,12,12/18/2009 14:22,1.95,18071.0,United Kingdom,0
2,500870,84033,FLAG OF ST GEORGE,99,03/10/2010 13:24,0.0,,United Kingdom,0,FLAG OF ST GEORGE,1,03/10/2010 13:24,85.1,,United Kingdom,0
3,502469,84016,FLAG OF ST GEORGE CAR FLAG,899,3/24/2010 15:39,0.0,,United Kingdom,0,FLAG OF ST GEORGE CAR FLAG,1,3/24/2010 15:39,93.61,,United Kingdom,0
4,506393,84016,FLAG OF ST GEORGE CAR FLAG,299,4/29/2010 13:10,0.0,,United Kingdom,0,FLAG OF ST GEORGE CAR FLAG,1,4/29/2010 13:10,25.53,,United Kingdom,0


In [40]:
# valid product returns
possible_product_returns = possible_product_returns.query("Quantity_return < Quantity_sale")

In [41]:
# flag possible returns
retails_df_stage_I['product_return'] = retails_df_stage_I.apply(
    lambda x: 1 if x['Invoice'] in possible_product_returns['Invoice'].values else 0,
    axis=1
)

possible_product_returns = None
del possible_product_returns

In [42]:
list(non_and_zero_prices_df['Description'].unique())

['85123a mixed',
 'short',
 '21733 mixed',
 nan,
 'lost',
 'damages',
 'invcd as 84879?',
 '6 RIBBONS EMPIRE  ',
 'DOTCOM POSTAGE',
 'sold as gold',
 'DOOR MAT FAIRY CAKE',
 '21494',
 'lost?',
 'Manual',
 'damaged',
 'wet',
 'CHRISTMAS PUDDING TRINKET POT ',
 'CHRISTMAS CRAFT WHITE FAIRY ',
 'MIA',
 'smashed',
 'ANTIQUE LILY FAIRY LIGHTS',
 'bad quality',
 'ANTIQUE GLASS HEART DECORATION ',
 'discoloured',
 'missing',
 ' FLAMINGO LIGHTS',
 'missing (wrongly coded?)',
 'CHARLOTTE BAG , SUKI DESIGN',
 'RETRO SPOT LARGE MILK JUG',
 '?',
 'Damages',
 'Dotcom multiples',
 'WOODLAND  STICKERS',
 'damaged?',
 'This is a test product.',
 'damages?',
 'No Stock',
 'my error - connor',
 'Dotcom',
 'POLYESTER FILLER PAD 40x40cm',
 'CUBIC MUG TEATIME CAKES',
 'wedding co returns?',
 'damages, lost bits etc',
 'VINTAGE GLASS COFFEE CADDY',
 'WOODEN REGATTA BUNTING',
 'WOODEN UNION JACK BUNTING',
 'PINK  SPOTTY PLATE ',
 'ASSORTED COLOUR BIRD ORNAMENT',
 'found',
 'lost in space',
 'LUNCH BAG SPACEB

In [43]:
escaped_keywords = [re.escape(word) for word in CLOUD_LOST_PRODUCTS_WORDS if word]

# Create the regex pattern to match only at the start of the string
pattern = r'^(?:' + '|'.join(escaped_keywords) + r')\b'

# Flag cloud lost products based on the pattern
retails_df_stage_I['lost_sales'] = retails_df_stage_I['Description'].str.contains(
    pattern,
    case=False,
    na=False
).astype(int)

In [44]:
retails_df_stage_I['lost_sales'].value_counts()

lost_sales
0    525374
1        87
Name: count, dtype: int64

In [45]:
retails_df_stage_I.query("lost_sales == 1").head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,entry_errors,product_return,lost_sales
3162,489660,35956,lost,-1043,12/01/2009 17:43,0.0,,United Kingdom,0,0,1
9308,490130,21493,lost?,-600,12/03/2009 18:28,0.0,,United Kingdom,0,0,1
17428,490766,51008,wet,-200,12/08/2009 11:06,0.0,,United Kingdom,0,0,1
20970,491065,84855,MIA,-108,12/09/2009 11:59,0.0,,United Kingdom,0,0,1
28111,491732,79030A,smashed,-350,12/14/2009 10:00,0.0,,United Kingdom,0,0,1


In [46]:
non_and_zero_prices_df

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,entry_errors
263,489464,21733,85123a mixed,-96,12/01/2009 10:52,0.0,,United Kingdom,0
283,489463,71477,short,-240,12/01/2009 10:52,0.0,,United Kingdom,0
284,489467,85123A,21733 mixed,-192,12/01/2009 10:53,0.0,,United Kingdom,0
470,489521,21646,,-50,12/01/2009 11:44,0.0,,United Kingdom,1
3114,489655,20683,,-44,12/01/2009 17:26,0.0,,United Kingdom,1
...,...,...,...,...,...,...,...,...,...
525231,538159,21324,,-18,12/09/2010 17:17,0.0,,United Kingdom,1
525232,538158,20892,,-32,12/09/2010 17:17,0.0,,United Kingdom,1
525233,538160,20956,,288,12/09/2010 17:18,0.0,,United Kingdom,1
525234,538161,46000S,Dotcom sales,-100,12/09/2010 17:25,0.0,,United Kingdom,0


\+  Search for 'test' in all columns  
\+  Search for 'Dotcom' in all columns   
\+  Search for general 'adjust' in all columns    
\+  Search for general 'credit' in all columns    
\+  Search for general 'debit' in all columns    
\+  Search for general 'fee' in all columns    
\+  Search for general 'eurobargain' in all columns    
\+  Search for general non-alphanumeric in description    

###### Understanding Quantity

In [47]:
retails_df_stage_I['Quantity'].describe()

count    525461.000000
mean         10.337667
std         107.424110
min       -9600.000000
25%           1.000000
50%           3.000000
75%          10.000000
max       19152.000000
Name: Quantity, dtype: float64

In [48]:
# understanding negative quantities, which could represent returns
# there is no zero quantity in the dataset
negative_quantities = retails_df_stage_I.query("Quantity < 0")

In [49]:
negative_quantities.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,entry_errors,product_return,lost_sales
178,C489449,22087,PAPER BUNTING WHITE LACE,-12,12/01/2009 10:33,2.95,16321,Australia,0,0,0
179,C489449,85206A,CREAM FELT EASTER EGG BASKET,-6,12/01/2009 10:33,1.65,16321,Australia,0,0,0
180,C489449,21895,POTTING SHED SOW 'N' GROW SET,-4,12/01/2009 10:33,4.25,16321,Australia,0,0,0
181,C489449,21896,POTTING SHED TWINE,-6,12/01/2009 10:33,2.1,16321,Australia,0,0,0
182,C489449,22083,PAPER CHAIN KIT RETRO SPOT,-12,12/01/2009 10:33,2.95,16321,Australia,0,0,0


\+  Search for 'test' in all columns  
\+  Search for 'Dotcom' in all columns   
\+  Search for general 'adjust' in all columns    
\+  Search for general 'credit' in all columns    
\+  Search for general 'debit' in all columns    
\+  Search for general 'fee' in all columns    
\+  Search for general 'eurobargain' in all columns    
\+  Search for non-alphanumeric in description    
\+  Search for 'c*' in invoice    

In [50]:
possible_non_returned_sales = retails_df_stage_I.query("Quantity > 0")

In [51]:
# match possible returns
possible_returns = negative_quantities.merge(
    possible_non_returned_sales,
    on=['Invoice', 'StockCode'],
    suffixes=('_return', '_sale')
)

In [52]:
possible_returns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Invoice                0 non-null      object 
 1   StockCode              0 non-null      object 
 2   Description_return     0 non-null      object 
 3   Quantity_return        0 non-null      int64  
 4   InvoiceDate_return     0 non-null      object 
 5   Price_return           0 non-null      float64
 6   Customer ID_return     0 non-null      object 
 7   Country_return         0 non-null      object 
 8   entry_errors_return    0 non-null      int64  
 9   product_return_return  0 non-null      int64  
 10  lost_sales_return      0 non-null      int64  
 11  Description_sale       0 non-null      object 
 12  Quantity_sale          0 non-null      int64  
 13  InvoiceDate_sale       0 non-null      object 
 14  Price_sale             0 non-null      float64
 15  Customer ID_sale  

In [53]:
possible_non_returned_sales = None
possible_returns = None
del possible_non_returned_sales, possible_returns

In [54]:
negative_quantities.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,entry_errors,product_return,lost_sales
178,C489449,22087,PAPER BUNTING WHITE LACE,-12,12/01/2009 10:33,2.95,16321,Australia,0,0,0
179,C489449,85206A,CREAM FELT EASTER EGG BASKET,-6,12/01/2009 10:33,1.65,16321,Australia,0,0,0
180,C489449,21895,POTTING SHED SOW 'N' GROW SET,-4,12/01/2009 10:33,4.25,16321,Australia,0,0,0
181,C489449,21896,POTTING SHED TWINE,-6,12/01/2009 10:33,2.1,16321,Australia,0,0,0
182,C489449,22083,PAPER CHAIN KIT RETRO SPOT,-12,12/01/2009 10:33,2.95,16321,Australia,0,0,0


In [55]:
# find possible returns removing the initial C
retails_df_stage_I.query("Invoice == 489449")

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,entry_errors,product_return,lost_sales


In [56]:
# searching non-numeric in quantity
pattern = r'[a-z]'
retails_df_stage_I[retails_df_stage_I['Quantity'].astype(str).str.contains(pattern, na=False, regex=True, case=False)]

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,entry_errors,product_return,lost_sales


\+  Search for 'test' in all columns  
\+  Search for 'Dotcom' in all columns   
\+  Search for general 'adjust' in all columns    
\+  Search for general 'credit' in all columns    
\+  Search for general 'debit' in all columns    
\+  Search for general 'fee' in all columns    
\+  Search for general 'eurobargain' in all columns    
\+  Search for non-alphanumeric in description    
\+ For rows without a Description or Price, I am assuming the last recorded Description and Price based on the StockCode. This assumes that StockCode descriptions are static, and the last recorded value is correct and applicable

###### Understanding Description

In [57]:
# searching full numeric values in description
pattern = r'^\d+$'
retails_df_stage_I[retails_df_stage_I['Description'].str.fullmatch(pattern, na=False, case=False)]

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,entry_errors,product_return,lost_sales
6911,490007,84347,21494,-720,12/03/2009 12:09,0.0,,United Kingdom,0,0,0
274052,516016,22467,22719,2,7/16/2010 10:11,0.0,,United Kingdom,0,0,0
274053,516017,22719,22467,-2,7/16/2010 10:11,0.0,,United Kingdom,0,0,0


In [58]:
# searching for debt, credit or fee in description
pattern = 'debt|credit| fee'

financial_desc_df = retails_df_stage_I[retails_df_stage_I['Description'].str.contains(pattern, na=False, case=False)]
financial_desc_df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,entry_errors,product_return,lost_sales
179403,A506401,B,Adjust bad debt,1,4/29/2010 13:36,-53594.36,,United Kingdom,0,0,0
253702,513826,47566B,stock credited from royal yacht inc,-144,6/29/2010 10:50,0.0,,United Kingdom,0,0,0
276274,A516228,B,Adjust bad debt,1,7/19/2010 11:24,-44031.79,,United Kingdom,0,0,0
363492,524570,84925D,incorrect credit,-372,9/29/2010 15:33,0.0,,United Kingdom,0,0,0
363495,524572,84924D,eurobargain invc/credit,372,9/29/2010 15:34,0.0,,United Kingdom,0,0,1


In [59]:
# flagging financial transactions
retails_df_stage_I.loc[
    (retails_df_stage_I['Invoice'].isin(financial_desc_df['Invoice'])) & 
    (retails_df_stage_I['StockCode'].isin(financial_desc_df['StockCode'])),
    'financial_details'
] = 1

financial_desc_df = None
del financial_desc_df

In [60]:
# ajudstment
pattern = 'adjust|update'

retails_df_stage_I[retails_df_stage_I['Description'].str.contains(pattern, na=False, case=False)]

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,entry_errors,product_return,lost_sales,financial_details
70975,495732,ADJUST,Adjustment by john on 26/01/2010 16,1,1/26/2010 16:20,96.46,,Ireland,0,0,0,
70976,495733,ADJUST,Adjustment by john on 26/01/2010 16,1,1/26/2010 16:21,68.34,14911,Ireland,0,0,0,
70977,495735,ADJUST,Adjustment by john on 26/01/2010 16,1,1/26/2010 16:22,201.56,12745,Ireland,0,0,0,
70978,495734,ADJUST,Adjustment by john on 26/01/2010 16,1,1/26/2010 16:22,205.82,14911,Ireland,0,0,0,
70979,C495737,ADJUST,Adjustment by john on 26/01/2010 16,-1,1/26/2010 16:23,10.50,16154,United Kingdom,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...
253701,513825,47566B,correct previous adjustment,144,6/29/2010 10:49,0.00,,United Kingdom,0,0,0,
276274,A516228,B,Adjust bad debt,1,7/19/2010 11:24,-44031.79,,United Kingdom,0,0,0,1.0
403472,A528059,B,Adjust bad debt,1,10/20/2010 12:04,-38925.87,,United Kingdom,0,0,0,1.0
484477,534747,22687,amazon adjustment,2,11/24/2010 9:50,0.00,,United Kingdom,0,0,0,


In [61]:
# flagging adjustments
retails_df_stage_I.loc[
    retails_df_stage_I['Description'].str.contains(pattern, na=False, case=False),
    'maintenance_adjustment'
] = 1

###### Understanding StockCode

In [62]:
# searching non numeric in stock code

pattern = r'[a-z]'
filtered_stock_code = retails_df_stage_I[retails_df_stage_I['StockCode'].str.contains(pattern, na=False, regex=True, case=False)]
filtered_stock_code

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,entry_errors,product_return,lost_sales,financial_details,maintenance_adjustment
1,489434,79323P,PINK CHERRY LIGHTS,12,12/01/2009 07:45,6.75,13085,United Kingdom,0,0,0,,
2,489434,79323W,WHITE CHERRY LIGHTS,12,12/01/2009 07:45,6.75,13085,United Kingdom,0,0,0,,
12,489436,48173C,DOOR MAT BLACK FLOCK,10,12/01/2009 09:06,5.95,13078,United Kingdom,0,0,0,,
23,489436,35004B,SET OF 3 BLACK FLYING DUCKS,12,12/01/2009 09:06,4.65,13078,United Kingdom,0,0,0,,
28,489436,84596F,SMALL MARSHMALLOWS PINK BOWL,8,12/01/2009 09:06,1.25,13078,United Kingdom,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
525387,538170,84029E,RED WOOLLY HOTTIE WHITE HEART.,2,12/09/2010 19:32,3.75,13969,United Kingdom,0,0,0,,
525388,538170,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,2,12/09/2010 19:32,3.75,13969,United Kingdom,0,0,0,,
525389,538170,85232B,SET OF 3 BABUSHKA STACKING TINS,2,12/09/2010 19:32,4.95,13969,United Kingdom,0,0,0,,
525435,538171,47591D,PINK FAIRY CAKE CHILDRENS APRON,1,12/09/2010 20:01,1.95,17530,United Kingdom,0,0,0,,


In [63]:
# understanding patterns on non-numeric stock codes
filtered_stock_code = filtered_stock_code.copy()
filtered_stock_code['code_len'] = filtered_stock_code['StockCode'].apply(len)

filtered_stock_code['code_len'].value_counts()

code_len
6     76102
7      1003
1       998
4       881
3       736
2       139
12      131
8        96
9        26
Name: count, dtype: int64

In [64]:
# filter gift and charges df
gift_df = retails_df_stage_I[retails_df_stage_I['StockCode'].astype(str).str.contains('gift', na=False, regex=True, case=False)]
charges_df = retails_df_stage_I[retails_df_stage_I['StockCode'].astype(str).str.contains('charges', na=False, regex=True, case=False)]

\+  Search for 'test' in all columns  
\+  Search for 'Dotcom' in all columns   
\+  Search for general 'adjust' in all columns    
\+  Search for general 'credit' in all columns    
\+  Search for general 'debit' in all columns    
\+  Search for general 'fee' in all columns    
\+  Search for general 'eurobargain' in all columns    
\+  Search for non-alphanumeric in description    
\+ For rows without a Description or Price, I am assuming the last recorded Description and Price based on the StockCode. This assumes that StockCode descriptions are static, and the last recorded value is correct and applicable
\+ search for 'update' in all columns

In [65]:
# flag possible returns
returns_df = retails_df_stage_I[
    retails_df_stage_I['Quantity'] < 0
]

retails_df_stage_I.loc[
    retails_df_stage_I['Invoice'].isin(returns_df['Invoice']),
    'product_return'
] = 1

returns_df = None
del returns_df

In [66]:
# flagging lost sales for gift products
retails_df_stage_I.loc[
    (retails_df_stage_I['Invoice'].isin(gift_df['Invoice'])) & 
    (retails_df_stage_I['StockCode'].isin(gift_df['StockCode'])),
    'lost_sales'
] = 1

In [67]:
# flagging bank charges as financial details
retails_df_stage_I.loc[
    (retails_df_stage_I['Invoice'].isin(charges_df['Invoice'])) & 
    (retails_df_stage_I['StockCode'].isin(charges_df['StockCode'])),
    'financial_details'
] = 1

---

lineage save: stage 1

In [68]:
retails_lineage_I_path = os.path.join(
    cwd_path,
    'retails_lineage_I.parquet'
) 

In [69]:
retails_df_stage_I.to_parquet(
    retails_lineage_I_path,
    index=False,
    compression='snappy'
)

##### B - Overal data types to consistent formats

\+  Search for 'test' in all columns  
\+  Search for general 'adjust' in all columns    
\+  Search for general 'credit' in all columns    
\+  Search for general 'debit' in all columns    
\+  Search for general 'fee' in all columns    
\+  Search for general 'eurobargain' in all columns    
\+ For rows without a Description or Price, I am assuming the last recorded Description and Price based on the StockCode. This assumes that StockCode descriptions are static, and the last recorded value is correct and applicable  
\+ search for 'update' in all columns  
\+ map mode description for stockcode then understand when it doesn't match

In [70]:
retails_df_stage_II = pd.read_parquet(retails_lineage_I_path)

---

lineage: stage II

In [71]:
pattern = 'debt|credit'

retails_df_stage_II[
    retails_df_stage_II.astype(str)
    .apply(lambda col: col.str.contains(pattern, case=False, regex=True))
    .any(axis=1)
]

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,entry_errors,product_return,lost_sales,financial_details,maintenance_adjustment
179403,A506401,B,Adjust bad debt,1,4/29/2010 13:36,-53594.36,,United Kingdom,0,0,0,1.0,1.0
253702,513826,47566B,stock credited from royal yacht inc,-144,6/29/2010 10:50,0.0,,United Kingdom,0,1,0,1.0,
276274,A516228,B,Adjust bad debt,1,7/19/2010 11:24,-44031.79,,United Kingdom,0,0,0,1.0,1.0
363492,524570,84925D,incorrect credit,-372,9/29/2010 15:33,0.0,,United Kingdom,0,1,0,1.0,
363495,524572,84924D,eurobargain invc/credit,372,9/29/2010 15:34,0.0,,United Kingdom,0,0,1,1.0,
403472,A528059,B,Adjust bad debt,1,10/20/2010 12:04,-38925.87,,United Kingdom,0,0,0,1.0,1.0


In [72]:
pattern = 'test|tste|tst'

test_data = retails_df_stage_II[
    retails_df_stage_II.astype(str)
    .apply(lambda col: col.str.contains(pattern, case=False, regex=True))
    .any(axis=1)
]

In [73]:
pattern = ' fee'

retails_df_stage_II[
    retails_df_stage_II.astype(str)
    .apply(lambda col: col.str.contains(pattern, case=False, regex=True))
    .any(axis=1)
]

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,entry_errors,product_return,lost_sales,financial_details,maintenance_adjustment
440688,C531400,AMAZONFEE,AMAZON FEE,-1,11/08/2010 10:08,6706.71,,United Kingdom,0,1,0,1.0,
440698,531411,AMAZONFEE,AMAZON FEE,1,11/08/2010 10:11,6706.71,,United Kingdom,0,0,0,1.0,
517452,C537600,AMAZONFEE,AMAZON FEE,-1,12/07/2010 12:41,1.0,,United Kingdom,0,1,0,1.0,
517953,C537630,AMAZONFEE,AMAZON FEE,-1,12/07/2010 15:04,13541.33,,United Kingdom,0,1,0,1.0,
517955,537632,AMAZONFEE,AMAZON FEE,1,12/07/2010 15:08,13541.33,,United Kingdom,0,0,0,1.0,
519170,C537644,AMAZONFEE,AMAZON FEE,-1,12/07/2010 15:34,13474.79,,United Kingdom,0,1,0,1.0,
519251,C537647,AMAZONFEE,AMAZON FEE,-1,12/07/2010 15:41,5519.25,,United Kingdom,0,1,0,1.0,
519294,C537651,AMAZONFEE,AMAZON FEE,-1,12/07/2010 15:49,13541.33,,United Kingdom,0,1,0,1.0,
519295,C537652,AMAZONFEE,AMAZON FEE,-1,12/07/2010 15:51,6706.71,,United Kingdom,0,1,0,1.0,


In [74]:
pattern = 'eurobargain'

retails_df_stage_II[
    retails_df_stage_II.astype(str)
    .apply(lambda col: col.str.contains(pattern, case=False, regex=True))
    .any(axis=1)
]

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,entry_errors,product_return,lost_sales,financial_details,maintenance_adjustment
363495,524572,84924D,eurobargain invc/credit,372,9/29/2010 15:34,0.0,,United Kingdom,0,0,1,1.0,


In [75]:
pattern = 'update'

retails_df_stage_II[
    retails_df_stage_II.astype(str)
    .apply(lambda col: col.str.contains(pattern, case=False, regex=True))
    .any(axis=1)
]

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,entry_errors,product_return,lost_sales,financial_details,maintenance_adjustment
241371,512737,DCGSSBOY,update,100,6/17/2010 14:10,0.0,,United Kingdom,0,0,0,,1.0
241372,512738,DCGSSGIRL,update,100,6/17/2010 14:11,0.0,,United Kingdom,0,0,0,,1.0


In [76]:
pattern = 'delete'

retails_df_stage_II[
    retails_df_stage_II.astype(str)
    .apply(lambda col: col.str.contains(pattern, case=False, regex=True))
    .any(axis=1)
]

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,entry_errors,product_return,lost_sales,financial_details,maintenance_adjustment


In [77]:
# flagging adjustments

# 1 - entry errors
retails_df_stage_II.loc[
    (retails_df_stage_II['Invoice'].isin(test_data['Invoice'])) & 
    (retails_df_stage_II['StockCode'].isin(test_data['StockCode'])),
    'entry_errors'
] = 1

test_data = None
del test_data

---

lineage save: stage II

In [78]:
retails_df_stage_II_path = os.path.join(
    cwd_path,
    'retails_lineage_II.parquet'
)

In [79]:
retails_df_stage_II.to_parquet(
    retails_df_stage_II_path,
    index=False,
    compression='snappy'
)

In [82]:
retails_df_stage_II = None
del retails_df_stage_II

#### C - Fine adjustments and basic data quality

\+ For rows without a Description or Price, I am assuming the last recorded Description and Price based on the StockCode. This assumes that StockCode descriptions are static, and the last recorded value is correct and applicable  

---

lineage save: stage III

In [80]:
retails_df_stage_III = pd.read_parquet(retails_df_stage_II_path)

In [89]:
retails_df_stage_III.head(2)

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,entry_errors,product_return,lost_sales,financial_details,maintenance_adjustment
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,12/01/2009 07:45,6.95,13085,United Kingdom,0,0,0,,
1,489434,79323P,PINK CHERRY LIGHTS,12,12/01/2009 07:45,6.75,13085,United Kingdom,0,0,0,,


In [90]:
# column name normalization
retails_df_stage_III.columns = [
    'invoice', 'stock_code', 'description',
    'quantity', 'invoice_date', 'price',
    'customer_id', 'country', 'entry_errors',
    'product_return', 'lost_sales', 'financial_details',
    'maintenance_adjustment']

# rename column
retails_df_stage_III.rename(columns={'country': 'location'}, inplace=True)

In [94]:
# base dtypes

# formating dtypes on data
retails_df_stage_III['invoice'] = sanitize_column_data(bg_logger, retails_df_stage_III, 'invoice')
retails_df_stage_III['stock_code'] = sanitize_column_data(bg_logger, retails_df_stage_III, 'stock_code')
retails_df_stage_III['description'] = sanitize_column_data(bg_logger, retails_df_stage_III, 'description')
retails_df_stage_III['customer_id'] = sanitize_column_data(bg_logger, retails_df_stage_III, 'customer_id')
retails_df_stage_III['country'] = sanitize_column_data(bg_logger, retails_df_stage_III, 'location')

2024-11-26T02:38:30 - INFO - [sanitize_column_data] - [2e767643-45a1-49e0-9d20-c3ce52818e1e] - Specializing column data 'invoice' to '<class 'str'>'. It took 0:00:00.064480
2024-11-26T02:38:31 - INFO - [sanitize_column_data] - [c0bed4e9-ce88-44e1-b34d-c0c6d0d4f264] - Specializing column data 'stock_code' to '<class 'str'>'. It took 0:00:00.061434
2024-11-26T02:38:31 - INFO - [sanitize_column_data] - [09555383-c22b-4ca2-94bc-841914944db2] - Specializing column data 'description' to '<class 'str'>'. It took 0:00:00.064282
2024-11-26T02:38:31 - INFO - [sanitize_column_data] - [4638d26e-0556-43c1-ac8b-eea5c1f44bec] - Specializing column data 'customer_id' to '<class 'str'>'. It took 0:00:00.054349
2024-11-26T02:38:31 - INFO - [sanitize_column_data] - [59f08f36-415c-44de-af4b-164d55758fcd] - Specializing column data 'location' to '<class 'str'>'. It took 0:00:00.062929


In [95]:
# checking stage corrections
retails_df_stage_III.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525461 entries, 0 to 525460
Data columns (total 14 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   invoice                 525461 non-null  object 
 1   stock_code              525461 non-null  object 
 2   description             522533 non-null  object 
 3   quantity                525461 non-null  int64  
 4   invoice_date            525461 non-null  object 
 5   price                   525439 non-null  float64
 6   customer_id             417541 non-null  object 
 7   location                525430 non-null  object 
 8   entry_errors            525461 non-null  int64  
 9   product_return          525461 non-null  int64  
 10  lost_sales              525461 non-null  int64  
 11  financial_details       80 non-null      float64
 12  maintenance_adjustment  78 non-null      float64
 13  country                 525430 non-null  object 
dtypes: float64(3), int64

In [96]:
# specialized DTYPES
retails_df_stage_III['quantity'] = pd.to_numeric(retails_df_stage_III['quantity'], errors='coerce')
retails_df_stage_III['price'] = pd.to_numeric(retails_df_stage_III['price'], errors='coerce')

In [97]:
# checking stage corrections
retails_df_stage_III.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525461 entries, 0 to 525460
Data columns (total 14 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   invoice                 525461 non-null  object 
 1   stock_code              525461 non-null  object 
 2   description             522533 non-null  object 
 3   quantity                525461 non-null  int64  
 4   invoice_date            525461 non-null  object 
 5   price                   525439 non-null  float64
 6   customer_id             417541 non-null  object 
 7   location                525430 non-null  object 
 8   entry_errors            525461 non-null  int64  
 9   product_return          525461 non-null  int64  
 10  lost_sales              525461 non-null  int64  
 11  financial_details       80 non-null      float64
 12  maintenance_adjustment  78 non-null      float64
 13  country                 525430 non-null  object 
dtypes: float64(3), int64

In [98]:
# treating different date formats and converting to ISO 8601
retails_df_stage_III['invoice_date'] = pd.to_datetime(retails_df_stage_III['invoice_date'], errors='coerce')
retails_df_stage_III['invoice_date'] = retails_df_stage_III['invoice_date'].dt.strftime('%Y-%m-%dT%H:%M:%S')

In [99]:
retails_df_stage_III.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525461 entries, 0 to 525460
Data columns (total 14 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   invoice                 525461 non-null  object 
 1   stock_code              525461 non-null  object 
 2   description             522533 non-null  object 
 3   quantity                525461 non-null  int64  
 4   invoice_date            525461 non-null  object 
 5   price                   525439 non-null  float64
 6   customer_id             417541 non-null  object 
 7   location                525430 non-null  object 
 8   entry_errors            525461 non-null  int64  
 9   product_return          525461 non-null  int64  
 10  lost_sales              525461 non-null  int64  
 11  financial_details       80 non-null      float64
 12  maintenance_adjustment  78 non-null      float64
 13  country                 525430 non-null  object 
dtypes: float64(3), int64

In [100]:
memory_usage = retails_df_stage_I.reset_index(drop=True).memory_usage(deep=True) / (1024 * 1024)
bg_logger.info('Old memory usage: %.2f MB x New memory usage: %.2f MB', format_base_memory_usage.sum(), memory_usage.sum())

2024-11-26T02:38:52 - INFO - [<module>] - [2ac56cea-d927-4779-8a65-fc3fc69ed0f1] - Old memory usage: 188.98 MB x New memory usage: 209.06 MB


---
lineage save: stage III

In [101]:
retails_df_stage_III_path = os.path.join(
    cwd_path,
    'retails_lineage_III.parquet'
)

In [102]:
retails_df_stage_III.to_parquet(
    retails_df_stage_III_path,
    index=False,
    compression='snappy'
)

In [103]:
retails_df_stage = None
del retails_df_stage

# Warehouse definitions