In [1]:
import sys
sys.path.append('../')

In [2]:
from scraping.pdf_handling import pdf_to_text
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
from scraping.text_processing import get_raw_content, remove_non_ascii, split_on_newline, TextChunker
from scraping.semantic_search import SemanticSearch

import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df_urls = pd.read_excel('../Bank_list.xlsx', index_col=0)
df_urls.head(3)

Unnamed: 0,Name,Individual,Corporation
1.0,Powszechna Kasa Oszczędności Bank Polski SA,https://www.pkobp.pl/oplaty-i-oprocentowanie/o...,https://www.pkobp.pl/oplaty-i-oprocentowanie/o...
2.0,Bank Handlowy w Warszawie SA,https://www.citibank.pl/dokumenty/\n[zakładki:...,
3.0,ING Bank Śląski SA,https://www.ing.pl/indywidualni/tabele-i-regul...,https://www.ing.pl/male-firmy/tabele-i-regulam...


In [4]:
bank_name = 'Bank Pocztowy SA'
client_type = 'Individual'

In [5]:
# urls

In [6]:
model_name = "henryk/bert-base-multilingual-cased-finetuned-polish-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(tokenizer.model_max_length)

pipe = pipeline("question-answering", model=model_name)

chunker = TextChunker(tokenizer=tokenizer, max_len=400, stride_len=100)

512


Some weights of the model checkpoint at henryk/bert-base-multilingual-cased-finetuned-polish-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
data = pdf_to_text(f'../bank_data/{client_type}/{bank_name}')
len(data)

4

In [8]:
urls = df_urls.loc[df_urls['Name'] == bank_name, 'Individual'].dropna().tolist()
for url in urls:
    print('---')
    soup = get_raw_content(url)
    print(len(soup))
    soup = remove_non_ascii(soup)
    print(len(soup))

    splitted = split_on_newline(soup)
    data[url] = splitted


---
40489
40368


In [9]:
all_chunks = []

for file, content in data.items(): 
    print('---')
    print(len(content))
    content = remove_non_ascii(content)
    print(len(content))

    splitted = split_on_newline(content)

    chunks = chunker.split_text(splitted)
    all_chunks.extend(chunks)
    print(len(chunks))

Token indices sequence length is longer than the specified maximum sequence length for this model (745 > 512). Running this sequence through the model will result in indexing errors


---
3539
3525
10
---
3599
3598
8
---
3281
3274
6
---
5498
5490
17
---
271
1694
1


In [12]:
len(all_chunks)

42

In [10]:
semantic_search = SemanticSearch()

semantic_search.vectorize_text(strings=all_chunks)
result = semantic_search.search("wysokość oprocentowania promocyjnego na lokacie w %", k=5)
print(len(result))

best_contexts = [r[0].page_content for r in result]

question = "Jaka jest wysokość oprocentowania promocyjnego na lokacie?"
# question = 'Do kogo jest skierowana oferta?'

preds = []
for context in best_contexts:
    # generate 3 answers to the question
    pred = pipe(question=question, context=context, do_sample=False, top_k=3)

    pred = [p['answer'] for p in pred]
    preds.append(pred)

5


In [11]:
preds

[['indywidualni15,00 %', '22,50 %', '5,75 % 22,50 %'],
 ['towanie15,00 % \n 13,99 %', '13,99 %', 'towanie15,00 %'],
 ['9,25 %', '9,25 % wynosi 15,00 %', '15,00 %'],
 ['22,51 %', '1 056,00 zł', '1 056,00'],
 ['22,51 %', '22,51', '1 056,00 zł']]