In [None]:
import json
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from datasets import load_dataset

def similarity(a: str, b: str) -> float:
    """Score how similar two strings are according to the module-level ``encoder``.

    Each string is embedded on its own with ``encoder.encode``; the two embeddings
    are then compared with ``encoder.similarity``.

    Args:
        a: First string.
        b: Second string.

    Returns:
        The single similarity value, converted to ``float`` and rounded to three
        decimal places.
    """
    vec_a, vec_b = (encoder.encode(text) for text in (a, b))
    score = encoder.similarity(vec_a, vec_b).item()
    return round(float(score), 3)

# streaming=True: records are iterated as they arrive rather than materialising the split first.
dataset = load_dataset('IlyaGusev/librusec_full', split='train', streaming=True)

with open('All_annotations.json', encoding='utf-8') as annotations_file:
    all_a = json.load(annotations_file)

# Titles are kept as a set because they are only used for membership tests and counting.
with open('titles.json', encoding='utf-8') as titles_file:
    needed_titles = set(json.load(titles_file))

# Index the annotation records by title; if a title repeats, the last record wins.
annotations_dict = {entry['title']: entry for entry in all_a}

# Minimum `similarity` score for an author name to be accepted as a match.
SIM_TH = 0.65
# title -> (sections, authors) for every matched record.
texts_dict = {}
encoder = SentenceTransformer('deepvk/USER-bge-m3')

# Scan the streamed records and keep the first Russian-language record for each needed
# title whose author matches the annotation's author (exactly, or by embedding
# similarity above SIM_TH).
# 496858 is only the progress-bar total; the loop itself ends when the stream is
# exhausted or every needed title has been found.
with tqdm(total=496858, desc="Processing records") as pbar:
    pbar.set_postfix({'found': len(texts_dict)})
    for record in dataset:
        title = record.get("title", "")
        authors = record.get("authors", [""])
        lang = record.get("lang", "")
        # .get: a needed title without an annotation is skipped instead of raising
        # KeyError and aborting the whole (multi-hour) scan.
        annotation = annotations_dict.get(title)

        is_candidate = (
            title in needed_titles
            and title not in texts_dict
            and lang in ('ru', 'rus')
            and annotation is not None
        )
        if is_candidate:
            expected_author = annotation['author']
            # Cheap exact match first; any() stops at the first author that is
            # similar enough, so no extra embeddings are computed after a hit.
            if expected_author in authors or any(
                similarity(author, expected_author) > SIM_TH for author in authors
            ):
                texts_dict[title] = (record.get("sections", ""), authors)
                # The counter only changes here, so the postfix is refreshed here
                # rather than forcing a redraw on every record.
                pbar.set_postfix({'found': len(texts_dict)})

        pbar.update(1)

        if len(texts_dict) == len(needed_titles):
            break
            
# Merge each matched text with its annotation record into one flat dict per title.
# texts_dict values are (sections, authors) tuples.
combined = [
    {
        'title': book_title,
        'author': annotations_dict[book_title]['author'],
        'authors': matched[1],
        'annotation': annotations_dict[book_title]['annotation'],
        'text': matched[0],
        'categories': annotations_dict[book_title]['categories'],
    }
    for book_title, matched in texts_dict.items()
]

Processing records: 100%|██████████| 496858/496858 [4:05:20<00:00, 33.75it/s, found=634]


In [1]:
pip install -r requirements.txt

Collecting asyncio (from -r requirements.txt (line 1))
  Using cached asyncio-3.4.3-py3-none-any.whl.metadata (1.7 kB)
Collecting openai (from -r requirements.txt (line 2))
  Downloading openai-1.93.0-py3-none-any.whl.metadata (29 kB)
Collecting torch (from -r requirements.txt (line 3))
  Downloading torch-2.7.1-cp310-cp310-win_amd64.whl.metadata (28 kB)
Collecting scipy (from -r requirements.txt (line 4))
  Downloading scipy-1.15.3-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting nltk (from -r requirements.txt (line 5))
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting transformers (from -r requirements.txt (line 6))
  Downloading transformers-4.53.0-py3-none-any.whl.metadata (39 kB)
Collecting tqdm (from -r requirements.txt (line 7))
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting evaluate (from -r requirements.txt (line 8))
  Using cached evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting sentence_transformers (from -r req

  DEPRECATION: Building 'rouge-score' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'rouge-score'. Discussion can be found at https://github.com/pypa/pip/issues/6334


In [1]:
import sys
sys.path.append('methods')

from methods import Summarisation

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dagri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\dagri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
  from torch.distributed.optim import ZeroRedundancyOptimizer


In [2]:
# Access_key.txt must contain exactly two whitespace-separated tokens,
# unpacked in order as `url` and `key`.
with open('Access_key.txt', encoding='utf-8') as key_file:
    credentials = key_file.read()
url, key = credentials.split()

In [3]:
# Benchmark object used by the cells below (its collection and summarisation methods).
bench = Summarisation(
    URL=url,
    KEY=key,
    model_name='RefalMachine/RuadaptQwen2.5-7B-Lite-Beta',
)

In [4]:
from utils import chunk_text

# Join the 'text' parts of collection entry 3 with newlines, then split the result into chunks.
sample_entry = bench.collection[3]
text = "\n".join(sample_entry['text'])
chunks = chunk_text(text)

In [5]:
# Length of the joined text in characters.
print(len(text))

870892


In [6]:
# Number of chunks produced by chunk_text for this text.
print(len(chunks))

109


In [7]:
import time

In [None]:
print('Blueprint Cluster')
for _ in range(3):
    start = time.perf_counter()
    s = await bench.cluster_blueprint.run(chunks)
    end = time.perf_counter()
    count = end - start
    print(f"Time: {count:.2f}")

In [8]:
print('Blueprint')
for _ in range(3):
    start = time.perf_counter()
    s = await bench.blueprint.run(chunks)
    end = time.perf_counter()
    count = end - start
    print(f"Time: {count:.2f}")

Blueprint
Time: 186.62
Time: 182.65
Time: 213.71


In [8]:
print('Hierarchical')
for _ in range(3):
    start = time.perf_counter()
    s = await bench.hierarchical.run(chunks, initial_word_limit=500, filtered=False)
    end = time.perf_counter()
    count = end - start
    print(f"Time: {count:.2f}")

Hierarchical
Time: 184.15
Time: 188.51
Time: 175.25


In [8]:
print('Hierarchical')
for _ in range(3):
    start = time.perf_counter()
    s = await bench.hierarchical.run(chunks, initial_word_limit=500, filtered=True)
    end = time.perf_counter()
    count = end - start
    print(f"Time: {count:.2f}")

Hierarchical
chunks len:  109
sum len:  82
count:  0
waiting...
Done!
len of new sum:  14
count:  1
waiting...
Done!
len of new sum:  4
count:  2
waiting...
Done!
len of new sum:  2
Time: 213.54
chunks len:  109
sum len:  82
count:  0
waiting...
Done!
len of new sum:  15
count:  1
waiting...
Done!
len of new sum:  3
count:  2
waiting...
Done!
len of new sum:  1
Time: 192.43
chunks len:  109
sum len:  82
count:  0
waiting...
Done!
len of new sum:  13
count:  1
waiting...
Done!
len of new sum:  3
count:  2
waiting...
Done!
len of new sum:  1
Time: 199.93


In [None]:
pip install ipywidgets