Skip to content

Commit

Permalink
Identify article using only the first 400 chars of its ID
Browse files (browse the repository at this point in the history)
Some feeds, like Google News, have very long article IDs because they
contain base64 encoded information. 400 chars seems enough to avoid
duplicates while maintaining a reasonably sized index.
  • Loading branch information
NicolasLM committed May 6, 2019
1 parent 100a2c7 commit a810953
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions reader/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def synchronize_parsed_feed(feed: models.Feed, parsed_feed: ParsedFeed):
articles_to_uncache = list()
existing_articles = (
models.Article.objects.filter(feed=feed)
- .filter(id_in_feed__in={a.id for a in parsed_feed.articles})
+ .filter(id_in_feed__in={a.id[:400] for a in parsed_feed.articles})
.select_related('feed')
.prefetch_related('attachment_set')
)
Expand All @@ -136,7 +136,7 @@ def synchronize_parsed_feed(feed: models.Feed, parsed_feed: ParsedFeed):
article, created, modified = create_or_update_if_needed(
models.Article,
existing_articles,
- id_in_feed=parsed_article.id,
+ id_in_feed=parsed_article.id[:400],
feed=feed,
defaults={
'uri': parsed_article.link or '',
Expand Down

0 comments on commit a810953

Please sign in to comment.