# Notes

In this notebook we will:

- Preprocess the filenames to something more standardized
- Evaluate the file contents to assess how 'unclean' they are

In [8]:
import pandas as pd
import numpy as np
import os
import re

## Get the file list

This part will get the original file list

In [9]:
# List the raw quotation files. Only `.tsv` entries are kept, so OS artefacts
# such as macOS' `.DS_Store` (or any other stray file) never reach the
# filename-parsing step, which expects every name to end in `.tsv`.
files = [
    name
    for name in os.listdir('budget-data/budget-data/')
    if name.endswith('.tsv')
]

# Cheap sanity check: a directory listing is not expected to repeat a name.
aux_files = set(files)
if len(aux_files) != len(files):
    print('There are duplicated files')

files[0:5]

['Cotação nº 18604 Sotécnica.tsv',
 'Cotação Nº 2206291344.C.Auchan.tsv',
 'Cotacao nº 22262 TRM.tsv',
 'Cotação nº 18612 Azinor.tsv',
 'Cotação nº 18564 Race.tsv']

## Extract data from the file names

Every file name contains an invoice ID and a company name. We will extract both in order to build the new, standardized file names.

In [11]:
# The company name is whatever sits between the first run of digits (plus one
# separator character) and the literal ".tsv" extension. The extension dot is
# escaped so that it only matches a real dot.
company_pattern = re.compile(r'\d+.(.*)\.tsv$')
# The invoice number is the first run of digits in the name.
invoice_pattern = re.compile(r'\d+')

files_converted = []

for file in files:
    company_match = company_pattern.search(file)
    invoice_match = invoice_pattern.search(file)

    # Fail loudly, naming the offending file, instead of an opaque IndexError.
    if company_match is None or invoice_match is None:
        raise ValueError(f'Unexpected file name, cannot extract invoice/company: {file!r}')

    # One (company, invoice) tuple per entry of `files`, in the same order.
    files_converted.append((company_match.group(1), invoice_match.group(0)))

files_converted[:5]

[('Sotécnica', '18604'),
 ('C.Auchan', '2206291344'),
 ('TRM', '22262'),
 ('Azinor', '18612'),
 ('Race', '18564')]

## Create the new files

This will create files with the new names.

The name structure will be as follows: `<companyName>_<invoiceID>.tsv`

In [19]:
# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs('budget-data/budget-data-new/', exist_ok=True)

# `files_converted` holds one (company, invoice) tuple per entry of `files`,
# in the same order; make that assumption explicit before pairing them up.
assert len(files) == len(files_converted), 'files and files_converted are out of sync'

for original_name, (company_name, invoice_number) in zip(files, files_converted):
    old_file_path = 'budget-data/budget-data/' + original_name
    new_file_path = 'budget-data/budget-data-new/' + company_name + '_' + invoice_number + '.tsv'

    # Copy the contents as UTF-8 text; a file that is not valid UTF-8 raises here.
    with open(old_file_path, 'r', encoding='utf-8') as f:
        data = f.read()

    with open(new_file_path, 'w', encoding='utf-8') as f:
        f.write(data)

# Keep only the .tsv files so that OS artefacts such as `.DS_Store` are never
# handed to the validation step (mirrors the filtering of the source folder).
new_file_names = [
    name
    for name in os.listdir('budget-data/budget-data-new/')
    if name.endswith('.tsv')
]

## Validation

This part will validate the files' contents to assess how 'unclean' they are.

In [15]:
def validate_description(description):
    """Return True when `description` looks like a well-formed product description.

    A description is accepted only when it is a string that

    - contains at least two dots, i.e. splits into three or more
      dot-separated parts, and
    - mentions one of the keywords 'filtro' / 'filtros' / 'rolo' / 'rolos'
      (case-insensitive).

    Any non-string value (numbers, NaN, None, ...) is rejected.

    Parameters
    ----------
    description : object
        The raw cell value to check.

    Returns
    -------
    bool
        True if the value passes every check, False otherwise.
    """
    # Reject non-strings up front so the string operations below are always safe.
    if not isinstance(description, str):
        return False

    # At least two dots <=> at least three dot-separated parts.
    if description.count('.') < 2:
        return False

    lowered = description.lower()
    # The plural keywords contain the singular ones, so two checks cover all four.
    return 'filtro' in lowered or 'rolo' in lowered


def validate_quantity(quantity):
    """Return True when `quantity` is an integer number.

    Accepts Python `int` and any NumPy integer type (not only `np.int64`).
    Booleans, floats (including NaN), strings and None are rejected.

    Parameters
    ----------
    quantity : object
        The raw cell value to check.

    Returns
    -------
    bool
        True if the value is an integer, False otherwise.
    """
    # bool is a subclass of int, so it has to be excluded explicitly.
    if isinstance(quantity, bool):
        return False

    return isinstance(quantity, (int, np.integer))


def validate_price(price):
    """Return True when `price` is an actual number.

    Accepts Python `int` / `float` and any NumPy integer or floating type.
    NaN is rejected even though it is technically a float, as are booleans,
    strings and None.

    Parameters
    ----------
    price : object
        The raw cell value to check.

    Returns
    -------
    bool
        True if the value is a non-NaN number, False otherwise.
    """
    # bool is a subclass of int, so it has to be excluded explicitly.
    if isinstance(price, bool):
        return False

    if isinstance(price, (float, np.floating)):
        # NaN passes the type check but is not a usable price.
        return not np.isnan(price)

    return isinstance(price, (int, np.integer))

def process_files(files, data_dir='budget-data/budget-data-new/'):
    """Validate the given TSV files and collect the products of the clean ones.

    Each file is read as a tab-separated table. It must contain the columns
    `dimensions`, `qtd` and `punit` and have no more than three columns,
    otherwise it is reported with the reason "columns". Rows are then checked
    in order with `validate_description`, `validate_quantity` and
    `validate_price`; the first failing check rejects the whole file (none of
    its rows are kept) and is recorded as the reason.

    Parameters
    ----------
    files : iterable of str
        File names (not paths) located inside `data_dir`. The part of the name
        before the last underscore is used as the company and the part after
        it (without the extension) as the invoice ID.
    data_dir : str, optional
        Directory that contains the files.

    Returns
    -------
    new_df : pandas.DataFrame
        One row per product of every fully valid file, with the columns
        `invoice_id`, `company`, `description`, `quantity`, `price_per_unit`.
    files_with_errors : list of tuple
        `(file, reason)` pairs, where reason is one of "columns",
        "description", "quantity" or "price".
    """
    output_columns = ['invoice_id', 'company', 'description', 'quantity', 'price_per_unit']
    required_columns = ['dimensions', 'qtd', 'punit']
    # (error reason, source column, validator), in the order the checks run.
    checks = [
        ('description', 'dimensions', validate_description),
        ('quantity', 'qtd', validate_quantity),
        ('price', 'punit', validate_price),
    ]

    records = []  # Rows of every valid file; turned into a DataFrame once at the end
    files_with_errors = []

    for file in files:
        df = pd.read_csv(os.path.join(data_dir, file), sep='\t', encoding='utf-8')

        # Too many columns, or an expected column missing: report it instead
        # of failing later on the column lookup.
        if len(df.columns) > 3 or not set(required_columns).issubset(df.columns):
            files_with_errors.append((file, "columns"))
            continue

        # File names follow `<companyName>_<invoiceID>.tsv`.
        stem = os.path.splitext(file)[0]
        company, separator, invoice_id = stem.rpartition('_')
        if not separator:
            # No underscore in the name: keep the whole stem as the company.
            company, invoice_id = stem, None

        column_values = [df[column] for _, column, _ in checks]

        products_to_add = []  # Rows of this file, kept only if every row is valid
        failure_reason = None

        for i in range(len(df)):
            # .iloc is used on purpose: it returns the NumPy scalar types
            # (e.g. np.int64) that the validators test for.
            values = [series.iloc[i] for series in column_values]

            for (reason, _, validator), value in zip(checks, values):
                if not validator(value):
                    failure_reason = reason
                    break

            if failure_reason is not None:
                break

            products_to_add.append([invoice_id, company] + values)

        if failure_reason is not None:
            files_with_errors.append((file, failure_reason))
            continue

        records.extend(products_to_add)

    new_df = pd.DataFrame(records, columns=output_columns)

    return new_df, files_with_errors

In [20]:
# Validate every renamed file. The raw (file, reason) pairs get their own name
# so that `files_with_errors` is only ever bound to the DataFrame shown below.
new_df, error_records = process_files(new_file_names)

files_with_errors = pd.DataFrame(error_records, columns=['file', 'reason'])
files_with_errors

Unnamed: 0,file,reason
0,Taguspark SA_18610.tsv,description
1,C.TECCI_2207061721.tsv,description
2,C.upK_2207291200.tsv,description
3,ProAir Industrial_18586.tsv,description
4,Frostline ANEXO_18589.tsv,description
5,ATM_22267.tsv,description
6,ATM _18576.tsv,description
7,Turbogas_18578.tsv,description
8,IKEA Loulé_22264.tsv,description
9,C.Sotecnica_2207251735.tsv,description
