In [1]:
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
import requests
from bs4 import BeautifulSoup
import re
import math

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. Get the raw text, remove non-ascii non-polish characters
# 2. Split it on \n and remove empty strings
# 3. Split it into chunks of N tokens
# 4. Run semantic search on each chunk, get the most similar one
# 5. Run QA on the most similar chunk
# 6. Standardize the answer into a given format (e.g. single number, single date, single name, etc.)

In [3]:
# 1. Get the raw text, remove non-ascii non-polish characters

url = 'https://www.santanderconsumer.pl/oszczednosci/rachunek-oszczednosciowy,1.html'

resp = requests.get(url)
print(resp.encoding)
resp = resp.text

soup = BeautifulSoup(resp, 'lxml').text
print(len(soup))

soup = ''.join(letter for letter in soup if letter.isascii() or letter.lower() in {'ś', 'ż', 'ź', 'ć', 'ą', 'ę', 'ł', 'ń', 'ó'})
print(len(soup))

utf-8
21048
20991


In [44]:
# 2. Split it on \n and remove empty strings
# splitted = re.split('\n', soup[soup.find('oprocentowanie promocyjne'):])
splitted = re.split('\n', soup)
splitted = [s for s in splitted if s != '' and not s.isspace()]
print(len(splitted))
splitted[:5]

288


['\t\tSantander Consumer Bank S.A. -  Bank od kredytów / Oszczędności / Rachunek oszczędnościowy\t',
 'Kontakt i pomoc',
 'Mapa oddziałów',
 '!',
 'Wiadomości - top']

In [36]:
model_name = "henryk/bert-base-multilingual-cased-finetuned-polish-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(tokenizer.model_max_length)

pipe = pipeline("question-answering", model=model_name)

512


Some weights of the model checkpoint at henryk/bert-base-multilingual-cased-finetuned-polish-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [37]:
# 3. Split it into chunks of N tokens
enc_splitted = [tokenizer.encode(s) for s in splitted]

max_len = 400
stride_len = 100

# Ensure that the chunks are not longer than the model's max length
enc_splitted_processed = []
for s in enc_splitted:
    n_splits = 1
    while math.ceil(len(s) / n_splits)  > len(s):
        n_splits += 1
    
    l = len(s)
    len_per_split = l // n_splits
    for i in range(n_splits):
        idx = i * len_per_split
        if i == n_splits - 1:
            enc_splitted_processed.append(s[idx:])
        else:
            enc_splitted_processed.append(s[idx: idx + len_per_split])

chunks = []
buffer = []
buffer_len = 0
for i, current in enumerate(enc_splitted_processed):
    assert len(current) <= max_len, f"Current chunk is too long: {len(current)}"

    if buffer_len + len(current) <= max_len:
        # Add current to the buffer as long as it fits in max_len
        buffer.append(current)
        buffer_len += len(current)
    else:
        current_chunk = '\n'.join(tokenizer.decode(b) for b in buffer)
        current_chunk = current_chunk.replace('[CLS]', '').replace('[SEP]', '')
        # current_chunk = current_chunk.replace(' .', '.').replace(' ,', ',')
        # current_chunk = current_chunk.replace(' ?', '?').replace(' !', '!')
        # current_chunk = current_chunk.replace(' :', ':').replace(' ;', ';')
        current_chunk = re.sub('(?<=\d), (?=\d)', ',', current_chunk)
        current_chunk = current_chunk.strip()
        chunks.append(current_chunk)

        # delete appropriate number of tokens from the buffer from left
        to_delete = max(stride_len, buffer_len + len(current) - max_len)
        deleted = 0
        while deleted < to_delete:
            deleted += len(buffer[0])
            buffer = buffer[1:]
        buffer_len -= deleted

        buffer.append(current)
        buffer_len += len(current)


In [38]:
print(len(enc_splitted))
print(len(enc_splitted_processed))
print(len(chunks))

288
288
47


In [39]:
chunks[:5]

['Santander Consumer Bank S. A. - Bank od kredytów / Oszczędności / Rachunek oszczędnościowy \n Kontakt i pomoc \n Mapa oddziałów \n! \n Wiadomości - top \n Weź Kredyt Online Zaloguj \n Weź kredyt online \n Kredyty gotówkowe \n Kredyty samochodowe \n Karty kredytowe \n Oszczędności \n Lokata przez telefon \n Lokaty dla firm \n Lokaty przez internet \n Lokaty w oddziale \n Rachunek oszczędnościowy \n Finansowanie zakupów \n Ubezpieczenia \n Bankowość Elektroniczna \n Internetowy odnawialny limit & nbspkredytowy \n! \n Wiadomości \n Kontakt \n Mapa oddziałów \n Jeśli już masz \n Facebook \n LinkedIn \n Goldenline \n YouTube \n Blog \n Strona główna \n > \n Oszczędności > Rachunek oszczędnościowy > \n Menu \n Załóż Rachunek \n Rachunek oszczędnościowy \n 6 % dla nowych klientów na rachunku oszczędnościowym. \n Załóż Rachunek \n Korzyści \n Jak założyć Rachunek? \n Ważne informacje \n Oprocentowanie Rachunku \n Przydatne wskazówki \n Dokumenty \n FAQ \n Informacje dodatkowe \n Jeśli już ma

In [40]:
import logging
from pathlib import Path
from typing import List, Tuple
import time

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document


LOGGER = logging.getLogger(__name__)


class SemanticSearch():
    """Class containing modules for the semantic search.
    """

    model_name: str
    model: HuggingFaceEmbeddings

    def __init__(self,
                 model_name: str = "sdadas/st-polish-paraphrase-from-distilroberta",
                 **kwargs
        ) -> None:
        self.model_name = model_name
        self.model = HuggingFaceEmbeddings(model_name=self.model_name, **kwargs)

    def vectorize_doc(self, doc: Path, vectordb_dir: Path) -> None:
        """Transform a doc containing all the information into a VectorDB.

        Args:
            doc (Path): File path containing the information. doc is a .txt file with /n/n/n separator.
            vectordb_path (Path, optional): _description_. Defaults to config.VECTORDB_PATH.
        """
        if doc.exists():
            with open(doc, "r") as f:
                text = f.read()
            texts = text.split("\n\n\n")
            LOGGER.info(f'Number of chunks: {len(texts)}')
            Chroma.from_texts(texts=texts, 
                              embedding=self.model, 
                              persist_directory=str(vectordb_dir) # Need to be a string
            )
            LOGGER.info(f"VectorDB correctly created at {vectordb_dir}")
        else:
            raise FileNotFoundError(f"{doc} does not exist.")
        
    def vectorize_text(self, strings: str, vectordb_dir: Path) -> None:
        """Transform a doc containing all the information into a VectorDB.

        Args:
            doc (str): string containing the information.
            vectordb_path (Path, optional): _description_. Defaults to config.VECTORDB_PATH.
        """
        LOGGER.info(f'Number of chunks: {len(strings)}')
        Chroma.from_texts(
            texts=strings, 
            embedding=self.model,
            persist_directory=str(vectordb_dir) # Need to be a string
        )
        LOGGER.info(f"VectorDB correctly created at {vectordb_dir}")
        

    def search(self, query: str, vectordb_dir: str = str('../data/vectordb'),
            k: int = 1) -> List[Tuple[Document, float]]:
        """From a query, find the elements corresponding based on personal information stored in vectordb.
        Euclidian distance is used to find the closest vectors.

        Args:
        query (str): Question asked by the user.
        vectordb_dir (str, optional): Path to the vectordb. Defaults to config.VECTORDB_DIR.

        Returns:
        List[Tuple[Document, float]]: Elements corresponding to the query based on semantic search, associated
        with their respective score.
        """
        timestamp = time.time()
        vectordb = Chroma(persist_directory=vectordb_dir, embedding_function=self.model)
        results = vectordb.similarity_search_with_score(query=query, k=k)
        LOGGER.info(f"It took {time.time() - timestamp} to search elements with semantic search.")
        return results

In [55]:
semantic_search = SemanticSearch()

semantic_search.vectorize_text(strings=chunks, vectordb_dir='../data/vectordb')
result = semantic_search.search("lokata, konto oszczędnościowe, oprocentowanie", k=5)
print(result)

[(Document(page_content='Rachunek obsługiwany jest w Bankowości internetowej, \n możesz posiadać tylko jeden Rachunek oszczędnościowy. \n \n Oprocentowanie Rachunku \n Oprocentowanie standardowe \n Do 50 000 PLN włącznie \n 1,90 % \n Nadwyżka ponad 50 000 do 200 000 PLN włącznie \n 1,95 % \n Nadwyżka ponad 200 000 PLN \n 2,00 % \n Oprocentowanie promocyjne * \n Do 50 000 PLN włącznie \n 5,50 % \n Nadwyżka ponad 50 000 do 200 000 PLN włącznie \n 5,20 % \n Nadwyżka ponad 200 000 PLN \n 5,00 % \n Saldo na rachunku \n Oprocentowanie standardowe \n Oprocentowanie promocyjne * \n Oprocentowanie promocyjne podwyższone * * \n Do 50 000 PLN włącznie \n 1,90 % \n 5,50 % \n 6,00 % \n Nadwyżka ponad 50 000 do 200 000 PLN włącznie \n 1,95 % \n 5,20 % \n 5,20 % \n Nadwyżka ponad 200 000 PLN \n 2,00 % \n 5,00 % \n 5,00 % \n Oprocentowanie standardowe środków zgromadzonych na Rachunku oszczędnościowym jest zmienne. Odsetki kapitalizowane są w okresach miesięcznych.'), 70.24005126953125), (Document(pag

In [56]:
# best_context = result[0][0].page_content

In [57]:
best_contexts = [r[0].page_content for r in result]

In [61]:
question = "Jaka jest wysokość oprocentowania promocyjnego na lokacie?"
# question = 'Do kogo jest skierowana oferta?'

preds = []
for context in best_contexts:
    # generate 3 answers to the question
    pred = pipe(question=question, context=context, do_sample=False, top_k=3)
    preds.append(pred)

In [64]:
[[p['answer'] for p in pred] for pred in preds]

[['2,00 %', '1,95 %', '2,00 % \n Oprocentowanie promocyjne'],
 ['2,00 %', '1,95 %', '2,00 % \n Oprocentowanie promocyjne'],
 ['2,00 %', '1,95 %', '2,00 % \n Oprocentowanie promocyjne'],
 ['2,00 %', '1,95 %', '2,00 % \n Oprocentowanie promocyjne'],
 ['2,00 %', '1,95 %', '2,00 % \n Oprocentowanie promocyjne']]

In [13]:
# 1. Get the raw text, remove non-ascii non-polish characters
# 2. Split it on \n or . and remove empty strings
# 3. Split it into chunks of N characters
# 4. Run semantic search on each chunk, get the most similar one
# 5. Run QA on the most similar chunk
# 6. Standardize the answer into a given format (e.g. single number, single date, single name, etc.)


In [14]:
# import plotly.express as px
# px.histogram([len(s) for s in splitted])

In [15]:
max([len(s) for s in splitted])

578

In [16]:
soup

'\n\n\n\n\n\t\tSantander Consumer Bank S.A. -  Bank od kredytów / Oszczędności / Rachunek oszczędnościowy\t\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\nKontakt i pomoc\n\n\n\n\n\n\nMapa oddziałów\n\n\n\n\n\n!\nWiadomości - top\n\n\n\nWeź Kredyt Online Zaloguj\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\nWeź kredyt online\n\n\n\n\nKredyty gotówkowe\n\n\n\n\nKredyty samochodowe\n\n\n\n\nKarty kredytowe\n\n\n\n\nOszczędności\n\n\n\n\n\t\t\t\t\t\t\t\tLokata przez telefon\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\tLokaty dla firm\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\tLokaty przez internet\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\tLokaty w oddziale\t\t\t\t\t\t\t\n\n\n\n\t\t\t\t\t\t\t\tRachunek oszczędnościowy\t\t\t\t\t\t\t\n\n\n\n\n\nFinansowanie zakupów\n\n\n\n\nUbezpieczenia\n\n\n\n\nBankowość Elektroniczna\n\n\n\n\nInternetowy odnawialny limit&nbspkredytowy\n\n\n\n \n\n\n!\nWiadomości\n\n\n\n\n\n\n\nKontakt\n\n\n\n\n\nMapa oddziałów\n\n\n\n\n

In [17]:
context[41928-10: ]

''

In [18]:
len(resp)

95817

In [19]:
# resp[47663:]

In [20]:
# with open('../data/test_data.txt', 'w') as f:
#     f.write(resp)

In [21]:
import logging
from pathlib import Path
from typing import List, Tuple
import time

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document


LOGGER = logging.getLogger(__name__)


class SemanticSearch():
    """Class containing modules for the semantic search.
    """

    model_name: str
    model: HuggingFaceEmbeddings

    def __init__(self,
                 model_name: str = "sdadas/st-polish-paraphrase-from-distilroberta",
                 **kwargs
        ) -> None:
        self.model_name = model_name
        self.model = HuggingFaceEmbeddings(model_name=self.model_name, **kwargs)

    def vectorize_doc(self, doc: Path, vectordb_dir: Path) -> None:
        """Transform a doc containing all the information into a VectorDB.

        Args:
            doc (Path): File path containing the information. doc is a .txt file with /n/n/n separator.
            vectordb_path (Path, optional): _description_. Defaults to config.VECTORDB_PATH.
        """
        if doc.exists():
            with open(doc, "r") as f:
                text = f.read()
            texts = text.split("\n\n\n")
            LOGGER.info(f'Number of chunks: {len(texts)}')
            Chroma.from_texts(texts=texts, 
                              embedding=self.model, 
                              persist_directory=str(vectordb_dir) # Need to be a string
            )
            LOGGER.info(f"VectorDB correctly created at {vectordb_dir}")
        else:
            raise FileNotFoundError(f"{doc} does not exist.")
        

    def search(self, query: str, vectordb_dir: str = str('../data/vectordb.txt'),
            k: int = 1) -> List[Tuple[Document, float]]:
        """From a query, find the elements corresponding based on personal information stored in vectordb.
        Euclidian distance is used to find the closest vectors.

        Args:
        query (str): Question asked by the user.
        vectordb_dir (str, optional): Path to the vectordb. Defaults to config.VECTORDB_DIR.

        Returns:
        List[Tuple[Document, float]]: Elements corresponding to the query based on semantic search, associated
        with their respective score.
        """
        timestamp = time.time()
        vectordb = Chroma(persist_directory=vectordb_dir, embedding_function=self.model)
        results = vectordb.similarity_search_with_score(query=query, k=k)
        LOGGER.info(f"It took {time.time() - timestamp} to search elements with semantic search.")
        return results

In [22]:
semantic_search = SemanticSearch()
input_data = Path('../data/test_data.txt')

semantic_search.vectorize_doc(doc=input_data, vectordb_dir='../data/vectordb.txt')
result = semantic_search.search("oprocentowanie promocyjne dla nowych klientów")
print(result)

[(Document(page_content='<!DOCTYPE html>\n<html lang="pl" class="desktop">\n\n<head>\n\t<meta http-equiv="X-UA-Compatible" content="IE=edge"> \n\t<meta charset="UTF-8" />\n\t<title>\n\t\tRachunek oszczędnościowy / Oszczędności / Rachunek oszczędnościowy / Oszczędności\t</title>\n\t<meta name="viewport" content="width=device-width, initial-scale=1.0" />\n\t<meta name="author" content="ideo - www.ideo.pl" />\n\t<meta name="generator" content="edito - www.edito.pl" />\n\t\t<meta name="Keywords" content="kredyt samochodowy, kredyty samochodowe, kredyt gotówkowy, kredyty gotówkowe, karta kredytowa, lokata, lokaty, kredyty gospodarcze, ubezpieczenia do kredytów" />\n\t\t\t<meta name="Description" content="Sprawdź ofertę Santander Consumer Banku - kredyty samochodowe, karty kredytowe, kredyty gotówkowe, lokaty, ubezpieczenia, kredyty ratalne, kredyty gospodarcze." />\n\t\t\t<meta name="robots" content="index, follow" />\n\t<script type="text/javascript">\n//<![CDATA[\nwindow["_csrf_"] = "0816

In [23]:
len(result[0][0].page_content)

95646

In [24]:
HuggingFaceEmbeddings(model_name='sdadas/st-polish-paraphrase-from-distilroberta')

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
), model_name='sdadas/st-polish-paraphrase-from-distilroberta', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False)