In [1]:
from database_utils import DatabaseConnector
from data_extraction import DataExtractor
from data_cleaning import DataCleaning
from dateutil.parser import parse
import pandas as pd
from decouple import config
import re
import numpy as np


In [2]:
# Database and extraction helpers shared by the cells below.
cred_path = 'db_creds.yaml'
connector = DatabaseConnector()
credentials = connector.read_db_creds(file_path=cred_path)
# init_db_engine hands back two engines; both are kept for later cells.
engine1, engine2 = connector.init_db_engine(credentials)
extractor = DataExtractor()

In [3]:
# Location of the products CSV, read from the environment / .env via decouple.
s3_address = config('S3_ADDRESS')
csv_filepath = config('CSV_FILEPATH')
# AWS credentials must be configured before running this extraction.
products_df, csv_file = extractor.extract_from_s3(s3_address, csv_filepath)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
products_df.info()

Extracted successfully to this path: \Users\mohdi\multinationalrdc\multinational_retail_proj\products.csv 
<class 'pandas.core.frame.DataFrame'>
Index: 1853 entries, 0 to 1852
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   product_name   1849 non-null   object
 1   product_price  1849 non-null   object
 2   weight         1849 non-null   object
 3   category       1849 non-null   object
 4   EAN            1849 non-null   object
 5   date_added     1849 non-null   object
 6   uuid           1849 non-null   object
 7   removed        1849 non-null   object
 8   product_code   1849 non-null   object
dtypes: object(9)
memory usage: 144.8+ KB
None


In [4]:
# Work on an independent (deep) copy so the extracted frame stays untouched.
df = products_df.copy(deep=True)

In [6]:
# Category codes that are not real categories; show the rows that carry them.
gibberish_values = ['S1YB74MLMJ', 'C3NCA2CL35', 'WVPMHZP59U']
subset_df = df.loc[df['category'].isin(gibberish_values)]
subset_df

Unnamed: 0,product_name,product_price,weight,category,EAN,date_added,uuid,removed,product_code
751,VLPCU81M30,XCD69KUI0K,9GO9NZ5JTL,S1YB74MLMJ,OO7KH8P79I,CCAVRB79VV,7QB0Z9EW1G,T3QRRH7SRP,SDAV678FVD
1133,9SX4G65YUX,N9D2BZQX63,Z8ZTDGUZVU,C3NCA2CL35,E8EOGWOY8S,09KREHTMWL,CP8XYQVGGU,BPSADIOQOK,BSDTR67VD90
1400,LB3D71C025,ODPMASE7V7,MX180RYSHX,WVPMHZP59U,BHPF2JTNKQ,PEPWA0NCVH,VIBLHHVPMN,H5N71TV8AY,OPSD21HN67


In [7]:
# List every distinct raw weight string to see which unit formats occur.
print(df['weight'].unique())

['1.6kg' '0.48kg' '590g' '540g' '1.91kg' '0.91kg' '0.46kg' '0.38kg'
 '8.981kg' '1.478kg' '1.2g' '0.66kg' '1.8kg' '1.9kg' '1.725kg' '0.54kg'
 '0.322kg' '0.71kg' '0.88kg' '0.67kg' '11.076kg' '4kg' '0.385kg' '1.38kg'
 '2.57kg' '1.35kg' '0.695kg' '1.15kg' '0.98kg' '1.447kg' '1.3625kg'
 '2.25kg' '0.79kg' '0.8kg' '1.08kg' '2.476kg' '0.137kg' '11.5kg' '0.44kg'
 '2.75kg' '0.911kg' '0.33kg' '1kg' '0.5kg' '0.45kg' '0.7kg' '0.41kg'
 '1.3kg' '2kg' '0.34kg' '0.37kg' '0.76kg' '1.18kg' '0.685kg' '1.59kg'
 '1.4kg' '1.66kg' '13.5kg' '0.745kg' '1.44kg' '0.74kg' '0.660kg' '0.419kg'
 '0.418kg' '0.470kg' '0.353kg' '0.350kg' '0.96kg' '1.20kg' '1.21kg'
 '1.02kg' '0.365kg' '0.677kg' '0.55kg' '0.43kg' '0.11kg' '1.23kg' '1.03kg'
 '0.87kg' '0.39kg' '0.35kg' '0.42kg' '0.27kg' '726g' '0.61kg' '0.864kg'
 '0.667kg' '0.63kg' '0.72kg' '0.58kg' '0.627kg' '0.3kg' '0.32kg' '0.01kg'
 '0.650kg' '0.68kg' '0.36kg' '1.395kg' '0.9kg' '0.468kg' '0.687kg'
 '0.955kg' '0.700kg' '0.900kg' '0.06kg' '0.967kg' '0.03kg' '1.041kg'
 '0.6

In [None]:
def _to_kilograms(amount, units):
    """Convert ``amount`` given in ``units`` to kilograms; NaN for an unknown unit."""
    unit = units.lower()
    if unit in ('g', 'gram', 'grams', 'ml', 'milliliter', 'milliliters'):
        # Millilitres share the gram factor, as in the original conversion table.
        return amount / 1000
    if unit in ('kg', 'kilogram', 'kilograms'):
        return amount  # Already in kilograms.
    if unit in ('oz', 'ounce', 'ounces'):
        return amount * 0.0283495
    return np.nan


def convert_weight(products_df):
    """Convert a single raw weight string to kilograms.

    Despite its name, ``products_df`` is one cell value (for example
    ``'1.6kg'``, ``'590g'`` or ``'12 x 100g'``), not a DataFrame. The name is
    kept so existing callers keep working.

    Parameters
    ----------
    products_df : str
        Raw weight text. A multi-pack of the form ``'<count> x <amount><unit>'``
        is converted to its total weight.

    Returns
    -------
    float
        Weight in kilograms rounded to 3 decimal places, or ``np.nan`` when the
        input is not a string, cannot be parsed, or uses an unrecognised unit.
    """
    value = products_df
    if not isinstance(value, str):
        # Missing values arrive as float NaN / None; there is nothing to parse.
        return np.nan
    value = value.strip()

    try:
        if 'x' in value:
            # Multi-pack, e.g. "12 x 100g": count, optional unit, "x", amount, unit.
            match = re.match(
                r"([\d.]+)\s*([a-zA-Z]*)\s*x\s*([\d.]+)\s*([a-zA-Z]*)", value
            )
            if match is None:
                return np.nan
            count, units1, each, units2 = match.groups()
            # The unit normally follows the second number; fall back to it
            # when the first number carries none.
            units = units1 or units2
            amount = float(count) * float(each)
        else:
            match = re.match(r"([\d.]+)\s*([a-zA-Z]*)", value)
            if match is None:
                return np.nan
            number, units = match.groups()
            amount = float(number)
    except ValueError:
        # Numeric part such as "1.2.3" or "." is not a valid float.
        return np.nan

    kilograms = _to_kilograms(amount, units)
    if kilograms != kilograms:  # NaN: unit not recognised.
        return np.nan
    return round(kilograms, 3)

In [None]:
# Instantiate the project's DataCleaning helper and invoke its product-cleaning method.
# NOTE(review): clean_products_data() is called with no arguments and its return
# value is discarded — confirm the method's signature (it presumably needs the
# products frame) and whether the cleaned result should be assigned.
cleaner = DataCleaning()
cleaner.clean_products_data()