# Document preprocessing

In [2]:
import os

# Known topic names. TOPIC is used below as the directory that is scanned for
# source documents.
topic_choices = ['adtech','walmart_annual_reports','nutrition','crypto']

# Topic processed by this run.
TOPIC = 'adtech'

# Collect the PDF and plain-text sources for the topic.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the processing order (and therefore the chunk order written later)
# reproducible across runs and machines.
files = sorted(
    name for name in os.listdir(TOPIC) if name.endswith((".pdf", ".txt"))
)

In [3]:
from helpers.text_preprocess import preprocess
import pandas as pd
from tqdm.auto import tqdm

# Running count of chunks across every file handled so far.
total_chunks = 0
# One entry per processed file, holding whatever preprocess() returned for it.
chunk_list = []

with tqdm(total=len(files), desc="Processing files", unit="file") as pbar:
    for file in files:
        file_chunks = preprocess(f"{TOPIC}/{file}")
        chunk_count = len(file_chunks)
        total_chunks = total_chunks + chunk_count
        chunk_list.append(file_chunks)

        # Advance the bar, then show the file that was just handled.
        pbar.update(1)
        pbar.set_postfix(file=file, chunks=chunk_count, total_chunks=total_chunks)

print(f"Total chunks: {total_chunks}")

# Flatten the per-file results into one list, preserving file order.
pages_and_chunks = []
for per_file in chunk_list:
    pages_and_chunks.extend(per_file)

# Persist every chunk as a CSV inside the topic folder.
chunks_frame = pd.DataFrame(pages_and_chunks)
chunks_frame.to_csv(f"{TOPIC}/text_chunks.csv", escapechar="\\")

Processing files:   0%|          | 0/2 [00:00<?, ?file/s]

Total chunks: 2241


# Embeddings

In [4]:
from sentence_transformers import SentenceTransformer
import torch

# Use the GPU when PyTorch reports one as available; otherwise fall back to
# the CPU. (Assign 'cpu' here instead to force CPU execution.)
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [5]:
# Instantiate the sentence-embedding model on the selected device.
embedding_model = SentenceTransformer(
    model_name_or_path='all-mpnet-base-v2',
    device=device,
)

In [6]:
from helpers.embeddings import embed

# Destination for the serialized embeddings, inside the topic folder.
embeddings_file = f"{TOPIC}/embeddings.pt"

# Run the embed() helper over every chunk, then save its result to disk.
embeddings = embed(embedding_model, pages_and_chunks, device)
torch.save(embeddings, embeddings_file)

Embedding: 100%|██████████| 2241/2241 [00:16<00:00, 132.37chunk/s]


# RAG (retrieval-augmented generation)

In [1]:
import torch 
import pandas as pd
import random
from helpers.rag import ask

# Topic folder holding the artifacts (text_chunks.csv, embeddings.pt) to query.
TOPIC = 'adtech'

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the chunk table and turn it into one dict per CSV row.
# NOTE(review): the preprocessing cell writes this CSV with the default index
# and escapechar="\\", while this read uses pandas defaults, so the records
# presumably carry an extra 'Unnamed: 0' key -- confirm ask() tolerates that.
text_chunks = pd.read_csv(f"{TOPIC}/text_chunks.csv")
pages_and_chunks = text_chunks.to_dict(orient="records")

# map_location remaps the saved tensor storage onto `device` while
# deserializing. Without it, a file saved from a CUDA session cannot be loaded
# on a machine where CUDA is unavailable. The trailing .to(device) is kept so
# the result is on `device` regardless of how the file was saved.
embeddings = torch.load(f"{TOPIC}/embeddings.pt", map_location=device).to(device)



In [2]:
# Question that is passed to ask() further down.
query = "can you give a brief overview of the subject of algorithmic marketing"

In [3]:
print(f"Query: {query}")

# Arguments for ask(), gathered in one place so they are easy to tweak.
ask_settings = dict(
    query=query,
    embeddings=embeddings,
    llm_model_id="google/gemma-2b-it",
    embedding_model_id="all-mpnet-base-v2",
    pages_and_chunks=pages_and_chunks,
    n_resources_to_return=6,
    temperature=0.90,
    max_new_tokens=2048,
    return_answer_only=False,
)

# Answer the query; the result is unpacked into the answer and its context items.
answer, context_items = ask(**ask_settings)

print("Answer:\n")
print(answer)
print("Context items:")
# Bare last expression so the notebook renders the context items.
context_items

Query: can you give a brief overview of the subject of algorithmic marketing


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Answer:

Sure, here's a brief overview of the subject of algorithmic marketing from the
context items:  - **Algorithmic marketing is a marketing process that is auto-
mated to such a degree that it can be steered by setting a business objective in
a marketing software system.** This implies that the market- ing system should
be intelligent and knowledgeable enough to understand and execute a sequence of
business actions that will achieve the desired objective.   - **The sub- ject of
algorithmic marketing mainly concerns the processes that can be found in the
four areas of the marketing mix and the automation of these processes by using
data-driven techniques and econometric methods.** The book mainly focuses on the
automation of activities related to product analysis, promotions,
recommendations, and pricing.   - **The history of algorithmic marketing shows
that the concept was laid in the 1970s with the development of the internet and
the spread of multimedia websites.** The transitio

[{'page_number': 17,
  'sentence_chunk': 'adtech/algorithmic-marketing-ai-for-marketing-operations-r1.8ga.pdf:Some of these publications are mainly focused on the technology and implementation aspects, whereas others dive deep into mathematical modeling, optimization, and econometrics. In practice, both aspects are important for the successful creation and operation of a marketing system. Many of these results are also based on or related to models developed in scientiﬁc marketing by academic researchers.1.1 the subject of algorithmic marketing One of the traditional deﬁnitions of marketing describes it as the activ- ity of deﬁning products and services offered by a company and com- municating them to existing or potential customers. This activity can',
  'score': tensor(0.8529)},
 {'page_number': 18,
  'sentence_chunk': 'adtech/algorithmic-marketing-ai-for-marketing-operations-r1.8ga.pdf:1.1 the subject of algorithmic marketing 3 be broken down into several streams that are typically 