In [None]:
import pandas as pd
import re
import json
from enum import Enum, auto

In [None]:
# Excel workbook that is read into `cash_flow_dataset` further down.
INPUT_DATASET_PATH = "data/cash_flow_statements/input_dataset_v.1.xlsx"
# CSV whose 'Unit' column supplies the unit names used to build the extraction regex.
UNITS_OF_MEASUREMENT_DATASET_PATH = "data/units_of_measurement.csv"
# CSV merged into the dataset by standardize_units_of_measurement(); the code
# reads its 'TargetUnit' and 'ConversionRate' columns.
CONVERSION_RATES_PATH = "data/conversion_rates.csv"
# JSON document loaded into `unit_variations` and handed to standardize_units().
UNITS_OF_MEASUREMENT_VARIATIONS_PATH = "data/units_of_measurement_variations.json"

class NumberNotations(Enum):
    """Supported notations for numbers that are written as text."""

    # The explicit values are the ones ``auto()`` hands out: counting up
    # from 1 in declaration order.
    EU = 1
    US = 2

In [None]:
# Load the mapping that standardize_units() receives as `unit_variations`.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that
# non-ASCII unit spellings decode the same way on every platform instead of
# depending on the locale's default encoding.
with open(UNITS_OF_MEASUREMENT_VARIATIONS_PATH, encoding="utf-8") as json_reader:
    unit_variations = json.load(json_reader)

In [None]:
def standardize_units(text:str, unit_variations:dict)->str:
    """Replace unit-of-measurement variations in the given text.

    Each key of ``unit_variations`` that occurs in ``text`` is replaced by its
    value. The replacements are plain substring substitutions applied one
    after another in the dictionary's iteration order, so a later entry also
    sees the output of the earlier ones.

    Args:
        text (str): input text where variations will be replaced. A value
            that is not a string (for example the NaN pandas uses for an
            empty cell) is returned unchanged instead of raising.
        unit_variations (dict): dictionary with (variation:standard_unit).

    Returns:
        str: the text with every variation replaced by its standard unit, or
        the original value when ``text`` is not a string.
    """
    # Missing cells reach this function as float NaN / None when it is applied
    # to a DataFrame column; there is nothing to rewrite in them.
    if not isinstance(text, str):
        return text

    # str.replace already returns the string untouched when the variation is
    # absent, so no separate membership test is needed.
    for variation, standard_unit in unit_variations.items():
        text = text.replace(variation, standard_unit)

    return text


def parse_amount_field(dataset:pd.DataFrame, field:str, number_format:NumberNotations=NumberNotations.US)->pd.DataFrame:
    """Convert a textual amount column into a numeric one.

    The column is overwritten in place on ``dataset`` and the same frame is
    returned. Text cells have their separators normalised first:

    * ``NumberNotations.US``: every ``,`` is removed.
    * any other value (i.e. EU): every ``.`` is removed, then ``,`` becomes
      ``.``.

    Cells that are not strings (numbers already parsed by the reader, NaN,
    None) skip the separator clean-up and go straight to the numeric
    conversion, so a numeric or mixed column no longer fails or loses values.

    Args:
        dataset (DataFrame): Dataset with the column to be corrected.
        field (str): column to be corrected.
        number_format (NumberNotations): US|EU notation.

    Returns:
        DataFrame: dataframe with chosen column converted to numeric; values
        that cannot be parsed become NaN.
    """
    use_us_notation = number_format == NumberNotations.US

    def _normalize_separators(value):
        # Only text carries locale-specific separators. Touching an already
        # numeric value would be wrong (e.g. stripping '.' from 12.5).
        if not isinstance(value, str):
            return value
        if use_us_notation:
            return value.replace(',', '')
        # Order matters: drop the grouping dots before the decimal comma is
        # turned into a dot.
        return value.replace('.', '').replace(',', '.')

    cleaned = dataset[field].map(_normalize_separators)
    # errors='coerce' turns anything still unparsable into NaN.
    dataset[field] = pd.to_numeric(cleaned, errors='coerce')
    return dataset


def standardize_units_of_measurement(dataset:pd.DataFrame, conversion_rates:"pd.DataFrame | None"=None)->pd.DataFrame:
    """Convert all amounts to the standard unit of measurement.

    The dataset is left-joined with a conversion table; ``Amount`` is
    multiplied by ``ConversionRate`` and ``Unit`` is replaced by
    ``TargetUnit``. Rows that find no entry in the conversion table keep
    their original ``Unit`` and ``Amount`` instead of being blanked to NaN.

    Args:
        dataset (DataFrame): Dataset to standardize. Must contain the
            ``Unit`` and ``Amount`` columns.
        conversion_rates (DataFrame, optional): conversion table with the
            ``TargetUnit`` and ``ConversionRate`` columns plus the join
            column(s) it shares with ``dataset``. When omitted, the table is
            read from ``CONVERSION_RATES_PATH``.

    Returns:
        DataFrame: a new dataframe with standardized amounts; the helper
        columns ``TargetUnit`` and ``ConversionRate`` are removed.
    """
    if conversion_rates is None:
        conversion_rates = pd.read_csv(CONVERSION_RATES_PATH)

    # No `on=` is given, so pandas joins on every column name the two frames
    # have in common. how='left' keeps all dataset rows, matched or not.
    result = pd.merge(dataset, conversion_rates, how='left')

    # Unmatched rows have NaN in both helper columns: fall back to the
    # original unit and a neutral factor of 1 so their data survives.
    result['Unit'] = result['TargetUnit'].fillna(result['Unit'])
    result['Amount'] = result['Amount'] * result['ConversionRate'].fillna(1)

    return result.drop(['TargetUnit', 'ConversionRate'], axis=1)


# def extract_description(dataset:pd.DataFrame)->pd.DataFrame:
#     filtered_dataset = dataset.loc[dataset['GL_Account (General Ledger)'].isin(['Raw Materials Expense','Utilities Expense'])]

In [None]:
def _lowercase_text(value):
    """Return ``value`` lower-cased when it is a string, otherwise unchanged."""
    # isinstance (rather than an exact type comparison) also covers str
    # subclasses, so every textual cell is normalised.
    return value.lower() if isinstance(value, str) else value


# Lower-case every text cell of both tables so that later text matching does
# not depend on capitalisation; non-text cells are left as they are.
cash_flow_dataset = pd.read_excel(INPUT_DATASET_PATH).map(_lowercase_text)

units_of_measurment_dataset = pd.read_csv(UNITS_OF_MEASUREMENT_DATASET_PATH).map(_lowercase_text)

In [None]:
# Rewrite each description so that unit spellings use their standard form.
# Series.apply forwards extra keyword arguments to the function it calls.
cash_flow_dataset['Description'] = cash_flow_dataset['Description'].apply(
    standardize_units, unit_variations=unit_variations
)

In [None]:
# One regex alternation of all known unit names. Each name is escaped so that
# characters such as '.', '/', '^' or '(' inside a unit are matched literally
# instead of being read as regex syntax; missing values are skipped because
# str.join accepts strings only.
all_units = '|'.join(re.escape(unit) for unit in units_of_measurment_dataset['Unit'].dropna())
# "<number> <unit>": a run of digits, dots and commas, optional whitespace,
# then one of the units ending at a word boundary.
pattern = rf'([\d.,]+)\s*({all_units})\b'

# str.extract keeps the first match of each row; rows without a match get NaN
# in both new columns.
cash_flow_dataset[['Amount', 'Unit']] = cash_flow_dataset['Description'].str.extract(pattern, flags=re.IGNORECASE)

In [None]:
# Turn both textual amount columns into numbers, using the parser's default
# notation.
for amount_column in ('Amount', 'Amount_EUR'):
    cash_flow_dataset = parse_amount_field(dataset=cash_flow_dataset, field=amount_column)

# Bring every amount to its target unit.
cash_flow_dataset = standardize_units_of_measurement(cash_flow_dataset)

# Keep only the columns of interest, in this order.
report_columns = ['Description', 'Amount', 'Unit', 'Amount_EUR']
cash_flow_dataset = cash_flow_dataset[report_columns]

In [None]:
# Bare expression with no assignment; presumably the last line of a notebook
# cell, where it makes the final table show up as the cell output.
cash_flow_dataset