In [23]:
import os
import env
import json
import openai
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.docstore.document import Document
import re

# Load variables from the local ".env" file (python-dotenv); the return value
# is bound to "_" and not used afterwards.
_ = load_dotenv(".env")
# Hand the key to the openai client. Indexing os.environ raises KeyError when
# OPENAI_API_KEY is missing, so a misconfigured environment fails here.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [24]:
# Directory passed to Chroma as `persist_directory` when the category store is built.
PERSIST_DIRECTORY = "CategoriesDB"

In [25]:
# Category labels (in Spanish); each label becomes one document in the vector store.
categorias = [
    "Restaurantes y bares",
    "Compras",
    "Cultura",
    "Deporte",
    "Educación",
    "Salud",
    "Servicios",
    "Transporte",
    "Turismo",
]
# One Document per label, with the label as page_content and empty metadata.
documents = [Document(page_content=categoria, metadata={}) for categoria in categorias]

# Build the Chroma store from the label documents using OpenAI embeddings.
# NOTE(review): because a persist_directory is given, re-running this cell may
# add the same labels again to an already-persisted store — confirm, and
# de-duplicate (or load the existing store) if that is the case.
embedding = OpenAIEmbeddings()
smalldb = Chroma.from_documents(
    documents=documents, embedding=embedding, persist_directory=PERSIST_DIRECTORY
)

In [33]:
def get_category(text):
    """Return the category label of the first similarity-search hit for ``text``.

    Queries the module-level ``smalldb`` store with ``k=3`` and returns the
    ``page_content`` of the first result; the other hits are not used.

    Args:
        text: Free-form description to classify.

    Returns:
        The ``page_content`` of the first returned document.

    Raises:
        IndexError: If the search returns no results.
    """
    nearest = smalldb.similarity_search(text, k=3)
    best_match = nearest[0]
    return best_match.page_content


# Number followed by a standalone "k" (thousands), optionally followed by "usd".
# The trailing \b keeps unit words such as "kg", "km" or "kilos" from matching.
_AMOUNT_K_RE = re.compile(r"(\d+\.?\d*)\s*k(?:\s*(usd))?\b", re.IGNORECASE)
# Number followed by a standalone "usd".
_AMOUNT_USD_RE = re.compile(r"(\d+\.?\d*)\s*usd\b", re.IGNORECASE)


def get_amount(text):
    """Extract a monetary amount and its currency from free-form text.

    Recognised forms (case-insensitive, optional whitespace before the suffix):

    * ``<number>k``      -> number * 1000, currency ``"pesos"``
    * ``<number>k usd``  -> number * 1000, currency ``"usd"``
    * ``<number>usd``    -> number, currency ``"usd"``

    The "k" form takes precedence over the plain "usd" form when both appear.
    The suffix must end at a word boundary, so "2 kg" or "3 km" are not read
    as thousands.

    Args:
        text: The string to scan.

    Returns:
        A ``(amount, currency)`` tuple where ``amount`` is a float and
        ``currency`` is ``"pesos"`` or ``"usd"``. When nothing matches the
        result is ``(0.0, "pesos")``.
    """
    match_k = _AMOUNT_K_RE.search(text)
    if match_k:
        # Group 2 is only set when "usd" directly follows the "k" multiplier.
        currency = "usd" if match_k.group(2) else "pesos"
        return (float(match_k.group(1)) * 1000, currency)

    match_usd = _AMOUNT_USD_RE.search(text)
    if match_usd:
        return (float(match_usd.group(1)), "usd")

    return (0.0, "pesos")


def process_text(text):
    """Classify ``text`` and extract its amount in one call.

    Calls ``get_category`` first and ``get_amount`` second, then combines
    both results.

    Args:
        text: Free-form expense description, e.g. ``"6k futbol"``.

    Returns:
        A dict with the keys ``"amount"``, ``"currency"`` and ``"category"``.
    """
    label = get_category(text)
    value, unit = get_amount(text)
    return dict(amount=value, currency=unit, category=label)

In [34]:
# Smoke check: run the pipeline on a sample input; the bare last expression
# makes the notebook display the resulting dict.
text = "6k futbol"
process_text(text)

{'amount': 6000.0, 'currency': 'pesos', 'category': 'Deporte'}