In [7]:
import numpy as np
import pandas as pd
import pickle 
import re

In [8]:
old_invoice_path = 'expired_invoices.txt'
new_invoice_path = "invoices_new.pkl"

In [9]:
class DataExtractor:
    """
    This class extracts and transforms invoice data from new and old invoice files.

    Attributes:
        old_invoices: A list of expired invoice IDs.
        new_invoices_row: A list of new invoices.
        conversion_table: conversion table for type
        number_words: dict from string to number(0-20)
    """

    def __init__(self, new_invoice_path, old_invoice_path):
        self.old_invoices = self.extract_old_invoices(old_invoice_path)
        self.new_invoices_row = self.extract_new_invoices(new_invoice_path)
        self.conversion_table = {0: 'Material', 1: 'Equipment', 2: 'Service', 3: 'Other'}
        self.number_words = {"one": 1, "two": 2, "three": 3, "four": 4, "five": 5, "six": 6, "seven": 7,
                             "eight": 8, "nine": 9, "ten": 10, "eleven": 11, "twelve": 12, "thirteen": 13,
                             "fourteen": 14, "fifteen": 15, "sixteen": 16, "seventeen": 17, "eighteen": 18,
                             "nineteen": 19, "twenty": 20}

    def extract_new_invoices(self, new_invoice_path):
        """
        Extracts data from the new invoice file path.
        Args:
            new_invoice_path: The path to the new invoice data file.
        Returns:
            list of loaded invoice data.
        Raises:
            pickle.UnpicklingError
            FileNotFoundError
        """
        try:
            with open(new_invoice_path, "rb") as file:
                new_row_data = pickle.load(file)
                return new_row_data
        except pickle.UnpicklingError:
            print("Unpickling Error")
        except FileNotFoundError:
            print(f"File path does not exist")

    def extract_old_invoices(self, old_invoice_path):
        """
        Extracts expired invoice IDs from the file path.
        Args:
            old_invoice_path: The path to the file containing expired invoice IDs.
        Returns:
            list of integers representing the expired invoice IDs.
        """
        with open(old_invoice_path, 'rb') as file:
            expired_invoices = file.read()
        expired_invoices_list = expired_invoices.split()
        expired_invoices_list = [int(invoice[:-1]) for invoice in expired_invoices_list]
        return expired_invoices_list
    
    def clean_and_convert(self, s):
        if type(s) is int:
            return int(s)
        s = s.replace('O', '0') 
        s = re.sub(r'\D', '', s)  
        return int(s) if s.isdigit() else None

    def transform(self): 
        new_invoice_data = []
        for invoice in self.new_invoices_row:
            invoice_total = 0
            invoice_data = {}

            inv_id = self.clean_and_convert(invoice["id"])
            invoice_data["invoice_id"] = inv_id
            invoice_data["created_on"] = invoice["created_on"]
            invoice_data["is_expired"] = (inv_id in self.old_invoices)
            if "items" in invoice:
                for i in invoice["items"]:
                    item = i["item"]
                    invoice_data["invoiceitem_id"] = self.clean_and_convert(item["id"])
                    invoice_data["invoiceitem_name"] = item["name"]
                    try:
                        item_type_id = self.clean_and_convert(item["type"])
                    except ValueError as e:
                        print(f"Error converting string")

                    invoice_data["type"] = self.conversion_table[item_type_id]
                    invoice_data["unit_price"] = int(item["unit_price"])
                    try:
                        invoice_data["total_price"] = int(invoice_data["unit_price"] * i["quantity"])
                    except ValueError as e:
                        quantity = self.number_words[i["quantity"]]
                        invoice_data["total_price"] = int(invoice_data["unit_price"] * quantity)
                    invoice_total += invoice_data["total_price"]
                    invoice_data["percentage_in_invoice"] = float(invoice_data["total_price"]) / invoice_total

                    new_invoice_data.append(invoice_data)
        df = pd.DataFrame(new_invoice_data)
        df['created_on'] = pd.to_datetime(df['created_on'], errors='coerce')
        return df


In [10]:
data_extractor = DataExtractor(new_invoice_path, old_invoice_path)
df = data_extractor.transform()
df.head()

Unnamed: 0,invoice_id,created_on,is_expired,invoiceitem_id,invoiceitem_name,type,unit_price,total_price,percentage_in_invoice
0,3653710,2019-05-27,False,154019,ii_154019,Equipment,176,176,0.138801
1,3653710,2019-05-27,False,154019,ii_154019,Equipment,176,176,0.138801
2,389528,2019-03-19,True,189841,ii_189841,Equipment,171,684,0.119393
3,389528,2019-03-19,True,189841,ii_189841,Equipment,171,684,0.119393
4,389528,2019-03-19,True,189841,ii_189841,Equipment,171,684,0.119393


In [11]:
df.dtypes

invoice_id                        int64
created_on               datetime64[ns]
is_expired                         bool
invoiceitem_id                    int64
invoiceitem_name                 object
type                             object
unit_price                        int64
total_price                       int64
percentage_in_invoice           float64
dtype: object

In [12]:
df.to_csv('extracted_invoice_data.csv', index=False)