Просто тестим код

In [1]:
from datetime import datetime, timedelta
import sys
import os

base_path, _ = os.path.split(os.getcwd())
sys.path.append(base_path)
from news_parser import get_financial_news, DEFAULT_FINANCE_FEEDS
from dedupe import dedupe_articles

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ------------- fetch news (last 24h) -------------
end = datetime.utcnow()
start = end - timedelta(hours=24)   # последние 24 часа

print("Fetching news...")
news = get_financial_news(start.isoformat(), end.isoformat(),
                          feed_urls=DEFAULT_FINANCE_FEEDS,
                          max_workers=6,
                          fetch_text=True)

print(f"Fetched {len(news)} items")

  end = datetime.utcnow()


Fetching news...


fetching articles: 100%|██████████| 61/61 [00:13<00:00,  4.59it/s]

Fetched 61 items





In [3]:
# print a few items
for n in news[:2]:
    print(
        n.get('published'),
        n.get('source'),
        n.get('title'),
        n.get('url')[:20],
        n.get('text')[:120],
        sep='\n',
        end='\n\n'
    )


2025-10-04 14:09:45+00:00
All Articles on Seeking Alpha
Do Not Take Profits On PetroChina, Despite Its Spectacular Rally
https://seekingalpha


2025-10-04 14:03:00+00:00
All Articles on Seeking Alpha
American States Water: Scoop Up This Dividend King Now
https://seekingalpha




In [10]:
sources = set()
for n in news:
    source = n.get('source')
    sources.add(source)
print(sources)

{'Stock Market News', 'Yahoo Finance', 'UK homepage', 'All Articles on Seeking Alpha'}


In [4]:
# ------------- prepare articles list for dedupe -------------
articles = []
for n in news:
    articles.append({
        'title': n.get('title') or "",
        'text': n.get('text') or "",
        'url': n.get('url') or "",
        'published': n.get('published'),
        'source': n.get('source') or ""
    })


In [5]:
# ------------- run dedupe -------------
print("Running semantic dedupe (dedupe.dedupe_articles)... this may take a while for many items.")
annotated_articles, clusters_meta = dedupe_articles(
    articles,
    similarity_threshold=0.75,
    model_name="all-MiniLM-L6-v2",
    use_sentence_transformers=True
)

Running semantic dedupe (dedupe.dedupe_articles)... this may take a while for many items.


In [6]:
print("Clusters meta:", clusters_meta)

Clusters meta: {0: {'size': 1, 'rep_article_index': 0, 'sources': ['All Articles on Seeking Alpha'], 'earliest': datetime.datetime(2025, 10, 4, 14, 9, 45, tzinfo=tzutc()), 'latest': datetime.datetime(2025, 10, 4, 14, 9, 45, tzinfo=tzutc()), 'avg_similarity': 1.0, 'backend': 'sentence-transformers'}, 1: {'size': 1, 'rep_article_index': 1, 'sources': ['All Articles on Seeking Alpha'], 'earliest': datetime.datetime(2025, 10, 4, 14, 3, tzinfo=tzutc()), 'latest': datetime.datetime(2025, 10, 4, 14, 3, tzinfo=tzutc()), 'avg_similarity': 1.0, 'backend': 'sentence-transformers'}, 2: {'size': 1, 'rep_article_index': 2, 'sources': ['Yahoo Finance'], 'earliest': datetime.datetime(2025, 10, 4, 14, 0, tzinfo=tzutc()), 'latest': datetime.datetime(2025, 10, 4, 14, 0, tzinfo=tzutc()), 'avg_similarity': 1.0, 'backend': 'sentence-transformers'}, 3: {'size': 1, 'rep_article_index': 3, 'sources': ['Yahoo Finance'], 'earliest': datetime.datetime(2025, 10, 4, 14, 0, tzinfo=tzutc()), 'latest': datetime.dateti

In [7]:
for group_id, meta_inf in clusters_meta.items():
    print(group_id)
    for k, v in meta_inf.items():
        print(f"{k}: {v}")

0
size: 1
rep_article_index: 0
sources: ['All Articles on Seeking Alpha']
earliest: 2025-10-04 14:09:45+00:00
latest: 2025-10-04 14:09:45+00:00
avg_similarity: 1.0
backend: sentence-transformers
1
size: 1
rep_article_index: 1
sources: ['All Articles on Seeking Alpha']
earliest: 2025-10-04 14:03:00+00:00
latest: 2025-10-04 14:03:00+00:00
avg_similarity: 1.0
backend: sentence-transformers
2
size: 1
rep_article_index: 2
sources: ['Yahoo Finance']
earliest: 2025-10-04 14:00:00+00:00
latest: 2025-10-04 14:00:00+00:00
avg_similarity: 1.0
backend: sentence-transformers
3
size: 1
rep_article_index: 3
sources: ['Yahoo Finance']
earliest: 2025-10-04 14:00:00+00:00
latest: 2025-10-04 14:00:00+00:00
avg_similarity: 1.0
backend: sentence-transformers
4
size: 1
rep_article_index: 4
sources: ['Yahoo Finance']
earliest: 2025-10-04 14:00:00+00:00
latest: 2025-10-04 14:00:00+00:00
avg_similarity: 1.0
backend: sentence-transformers
5
size: 1
rep_article_index: 5
sources: ['All Articles on Seeking Alpha']

In [8]:
for a in annotated_articles:
    print(a['dedup_group_id'], a['title'][:80], a['source'])


0 Do Not Take Profits On PetroChina, Despite Its Spectacular Rally All Articles on Seeking Alpha
1 American States Water: Scoop Up This Dividend King Now All Articles on Seeking Alpha
2 The Best Way to Invest in Bitcoin Without Actually Buying Cryptocurrency Yahoo Finance
3 I’m single and making $61K/year. I really want to buy my first home — but can I  Yahoo Finance
4 Sports-betting stocks face growing threat from prediction rivals Yahoo Finance
5 Morgan Stanley Direct Lending: Buy The Dip Opportunity All Articles on Seeking Alpha
6 Scotiabank: Fundamentals Justify A Buy Rating, But Technicals Suggest Some Cauti All Articles on Seeking Alpha
7 Ecuador revokes environmental license for Canada’s DPM to develop gold project Stock Market News
8 AppLovin Is Positioned To Capitalize On Ad-Tech With AXON All Articles on Seeking Alpha
9 Viking Therapeutics: A Speculative Buy All Articles on Seeking Alpha
10 Fair Issac Vs. Equifax: The Credit Score War Has Begun All Articles on Seeking Alpha
1