In [1]:
import sys
# Add the parent directory to the module search path. The `scraping.*`
# imports in the next cell presumably rely on this (assumes the notebook is
# started from a sub-directory of the project -- TODO confirm).
sys.path.append('../')

In [2]:
from scraping.pdf_handling import pdf_to_text
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
from scraping.text_processing import get_raw_content, remove_non_ascii, split_on_newline, TextChunker
from scraping.semantic_search import SemanticSearch

import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the bank list workbook, using its first column as the row index,
# and preview the first three rows.
df_urls = pd.read_excel(io='../Bank_list.xlsx', index_col=0)
df_urls.head(n=3)

Unnamed: 0,Name,Individual,Corporation
1.0,Powszechna Kasa Oszczędności Bank Polski SA,https://www.pkobp.pl/oplaty-i-oprocentowanie/o...,https://www.pkobp.pl/oplaty-i-oprocentowanie/o...
2.0,Bank Handlowy w Warszawie SA,https://www.citibank.pl/dokumenty/\n[zakładki:...,
3.0,ING Bank Śląski SA,https://www.ing.pl/indywidualni/tabele-i-regul...,https://www.ing.pl/male-firmy/tabele-i-regulam...


In [4]:
# Run parameters. Together they select the input folder
# ../bank_data/<date>/<client_type>/<bank_name> read by the PDF-loading cell.
date = '18-11-2023'  # snapshot folder name (looks like DD-MM-YYYY -- confirm)
client_type = 'Individual'  # also the name of a column in df_urls
bank_name = 'Santander Consumer Bank SA'  # compared against df_urls['Name']

In [5]:
# urls

In [6]:
# Extractive question-answering model (Polish SQuAD2 fine-tune of multilingual BERT).
model_name = "henryk/bert-base-multilingual-cased-finetuned-polish-squad2"

# Chunking parameters handed to TextChunker. The printed model_max_length is
# 512, so 400 leaves headroom (assumes max_len is counted in tokens and that
# the headroom is meant for the question + special tokens -- TODO confirm).
MAX_CHUNK_LEN = 400
CHUNK_STRIDE = 100

tokenizer = AutoTokenizer.from_pretrained(model_name)
print(tokenizer.model_max_length)

# Hand the already-loaded tokenizer to the pipeline so the chunker and the
# QA model share one tokenizer instance instead of loading it a second time.
pipe = pipeline("question-answering", model=model_name, tokenizer=tokenizer)

chunker = TextChunker(tokenizer=tokenizer, max_len=MAX_CHUNK_LEN, stride_len=CHUNK_STRIDE)

512


Some weights of the model checkpoint at henryk/bert-base-multilingual-cased-finetuned-polish-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Extract the text of every PDF in the selected bank folder, then replace each
# entry of `data` with its cleaned, newline-split form.
# Prints the number of documents, then per document: a separator, the raw
# length and the length after remove_non_ascii.
data = pdf_to_text(f'../bank_data/{date}/{client_type}/{bank_name}')
print(len(data))
for pdf_name in list(data):  # snapshot of the keys; values are rewritten below
    raw_text = data[pdf_name]
    print('---')
    print(len(raw_text))
    ascii_text = remove_non_ascii(raw_text)
    print(len(ascii_text))

    data[pdf_name] = split_on_newline(ascii_text)

10
---
1598
1598
---
2579
2564
---
10904
10804
---
986
985
---
5682
5671
---
5105
5100
---
1858
1858
---
2838
2836
---
1190
1190
---
1754
1751


In [9]:
# Fetch the web pages listed for this bank and add them to `data`, keyed by
# URL, in the same cleaned / newline-split form as the PDF entries.
#
# * The column is selected with `client_type` (instead of a hard-coded
#   'Individual') so it always matches the folder used for the PDFs.
# * URLs are stripped: spreadsheet cells can carry stray whitespace or a
#   trailing newline, which would otherwise end up in the request and in the
#   `data` key.
urls = [
    raw_url.strip()
    for raw_url in df_urls.loc[df_urls['Name'] == bank_name, client_type].dropna()
]
for url in urls:
    print('---')
    page_text = get_raw_content(url)
    print(len(page_text))  # length as fetched
    page_text = remove_non_ascii(page_text)
    print(len(page_text))  # length after remove_non_ascii

    data[url] = split_on_newline(page_text)


---
4564
4563
---
14624
14589


In [10]:
# Run every source (PDF or URL) through the chunker and pool the chunks in
# one flat list. For each source, prints a separator and then the last 20
# characters of its name, its number of lines and its number of chunks.
all_chunks = []

for source_name in data:
    source_lines = data[source_name]
    print('---')
    source_chunks = chunker.split_text(source_lines)
    all_chunks += source_chunks

    print(source_name[-20:], len(source_lines), len(source_chunks))

---
spozycja_wyplaty.pdf 32 2
---
c_zdjecie_dowodu.pdf 30 5
---
g.o_w.2.0_online.pdf 117 27
---
eraktywny_201118.pdf 26 1
---
_o_lokate_online.pdf 83 15
---
a_lokaty_on-line.pdf 52 10
---
okat_terminowych.pdf 71 5
---
o.d_odst_a_w.8.0.pdf 24 9
---
ywarka_depozytow.pdf 45 1
---
ych_interaktywny.pdf 39 3
---
czednosciowy,1.html
 101 8
---
rzez-internet,2.html 287 30


In [11]:
# Total number of chunks pooled from all sources.
len(all_chunks)

116

In [12]:
# Retrieve-then-read: index all chunks, fetch the k=5 best hits for a
# keyword-style query, then run the extractive QA model on each hit.
semantic_search = SemanticSearch()

semantic_search.vectorize_text(strings=all_chunks)
result = semantic_search.search("wysokość oprocentowania promocyjnego na lokacie w %", k=5)
print(len(result))

# Element [0] of each hit is the object carrying the chunk text
# (presumably a (document, score) pair -- confirm against SemanticSearch.search).
best_contexts = [hit[0].page_content for hit in result]

question = "Jaka jest wysokość oprocentowania promocyjnego na lokacie?"
# question = 'Do kogo jest skierowana oferta?'

# preds[i] holds up to 3 answer strings extracted from best_contexts[i].
preds = []
for context in best_contexts:
    # NOTE(review): do_sample looks like a text-generation option and is
    # probably ignored by the question-answering pipeline -- confirm, then drop.
    answers = pipe(question=question, context=context, do_sample=False, top_k=3)

    # The pipeline returns a bare dict (not a list) when only a single answer
    # survives; normalise so the comprehension below never iterates over the
    # keys of that dict.
    if isinstance(answers, dict):
        answers = [answers]

    preds.append([answer['answer'] for answer in answers])

5


In [13]:
# Show the candidate answers: one inner list per retrieved context.
preds

[['w stosunku rocznym', '0 %', '0 % w stosunku rocznym'],
 ['1 000 - 400 000 PLN \n 1 mies',
  '1 000 - 400 000',
  '1 000 - 400 000 PLN \n 1 mies. \n 2. 00 %'],
 ['5,00 %', '5,00 % na 6 miesięcy', '5,00 % na 6'],
 ['0 %',
  '0 % w stosunku rocznym',
  '1 000 - 400 000 PLN \n 1 mies. \n 1. 00 %'],
 ['0 %', '0 % w stosunku rocznym', '0 % w stosunku rocznym.']]

In [14]:
# Baseline without retrieval: ask the same question against every chunk
# joined into a single context and show the top 3 answers.
# NOTE(review): the chunker is configured with stride_len=100, so chunks
# presumably overlap and the joined context may repeat text -- which would
# explain identical answers at different offsets. Confirm.
full_context = '\n'.join(all_chunks)
pipe(question=question, context=full_context, do_sample=False, top_k=3)

[{'score': 0.22213014960289001,
  'start': 74561,
  'end': 74569,
  'answer': '4,6150 %'},
 {'score': 0.18798911571502686,
  'start': 73894,
  'end': 73902,
  'answer': '4,6150 %'},
 {'score': 0.06678435206413269,
  'start': 78375,
  'end': 78394,
  'answer': '520. 000. 000,00 zł'}]