fix(main): fix continue fetching logic
This commit fixes the continue-fetching logic in the `fetchFeed` function. Previously, it incorrectly used the `FetchedUntil` timestamp field to decide whether an item should be skipped. Now it keeps a hashset of fetched URLs and checks it to determine whether an item has already been fetched.
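For context, here is a minimal standalone sketch of the URL-set deduplication pattern the commit switches to, using the same `github.com/emirpasic/gods/sets/hashset` package (the links are hypothetical; this is an illustration, not code from the repository):

package main

import (
	"fmt"

	"github.com/emirpasic/gods/sets/hashset"
)

func main() {
	seen := hashset.New() // tracks links that were already emitted

	// Hypothetical item links; the duplicate simulates re-fetching a feed.
	links := []string{
		"https://example.com/post-1",
		"https://example.com/post-2",
		"https://example.com/post-1",
	}

	for _, link := range links {
		if seen.Contains(link) {
			continue // already fetched, skip it
		}
		fmt.Println("new item:", link)
		seen.Add(link)
	}
	// Output:
	// new item: https://example.com/post-1
	// new item: https://example.com/post-2
}

Compared with the previous high-water-mark timestamp, a set of fetched URLs also skips correctly when several items share a published time or when a feed backfills older entries.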
NOBLES5E committed Aug 4, 2023
1 parent 89c3b7d commit de6df43
Showing 3 changed files with 14 additions and 14 deletions.
1 change: 1 addition & 0 deletions go.mod
@@ -3,6 +3,7 @@ module github.com/NOBLES5E/FeedFlux
 go 1.20
 
 require (
+	github.com/emirpasic/gods v1.18.1
 	github.com/mmcdole/gofeed v1.2.1
 	github.com/sirupsen/logrus v1.9.3
 	github.com/urfave/cli/v2 v2.25.7
2 changes: 2 additions & 0 deletions go.sum
@@ -7,6 +7,8 @@ github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46t
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc=
+github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
 github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
 github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
 github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
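(As a side note, both files can be updated in one step with `go get github.com/emirpasic/gods@v1.18.1`, which records the module requirement in go.mod and its checksums in go.sum.)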
25 changes: 11 additions & 14 deletions main.go
@@ -2,9 +2,11 @@ package main
 
 import (
 	"context"
+
 	"crypto/sha256"
 	"encoding/json"
 	"fmt"
+	"github.com/emirpasic/gods/sets/hashset"
 	"github.com/mmcdole/gofeed"
 	log "github.com/sirupsen/logrus"
 	"github.com/urfave/cli/v2"
@@ -13,8 +15,8 @@ import (
 )
 
 type FeedProgress struct {
-	URL          string `json:"url"`
-	FetchedUntil int64  `json:"last_fetch_time"` // Unix timestamp
+	URL         string       `json:"url"`
+	FetchedUrls *hashset.Set `json:"fetchedUrls"`
 }
 
 func main() {
@@ -104,7 +106,7 @@ func readRecordFile(url string, recordDirPath string) (*FeedProgress, error) {
 }
 
 func fetchFeed(url string, results chan *gofeed.Item, recordDirPath string, continueFetch bool, timeoutSeconds int) {
-	defer close(results)
+	defer close(results)
 	fp := gofeed.NewParser()
 	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeoutSeconds)*time.Second)
 	defer cancel()
@@ -115,13 +117,8 @@ func fetchFeed(url string, results chan *gofeed.Item, recordDirPath string, continueFetch bool, timeoutSeconds int) {
 	}
 	// FetchedUntil is the largest published timestamp of all items
 	record := FeedProgress{
-		URL:          url,
-		FetchedUntil: 0,
-	}
-	for _, item := range feed.Items {
-		if item.PublishedParsed.Unix() > record.FetchedUntil {
-			record.FetchedUntil = item.PublishedParsed.Unix()
-		}
+		URL:         url,
+		FetchedUrls: hashset.New(),
 	}
 	// record file path is sha256 of url
 	fileName := sha256.Sum256([]byte(url))
@@ -133,24 +130,24 @@ func fetchFeed(url string, results chan *gofeed.Item, recordDirPath string, continueFetch bool, timeoutSeconds int) {
 		"items": len(feed.Items),
 	}).Infof("fetched %s", url)
 
-	lastTimeStamp := int64(0)
 	if continueFetch {
 		// If continue fetching, read record file
 		recordFile, err := readRecordFile(url, recordDirPath)
 		if err != nil {
 			log.Warnf("error reading record file %s: %s, continue fetching from scratch", recordFilePath, err)
 		} else {
-			log.Infof("continue fetching %s from %s", url, time.Unix(recordFile.FetchedUntil, 0))
-			lastTimeStamp = recordFile.FetchedUntil
+			log.Infof("continue fetching %s", url)
+			record = *recordFile
 		}
 	}
 
 	for _, item := range feed.Items {
 		// If continue fetching, skip items that are fetched before
-		if continueFetch && item.PublishedParsed.Unix() <= lastTimeStamp {
+		if continueFetch && record.FetchedUrls.Contains(item.Link) {
 			continue
 		}
 		results <- item
+		record.FetchedUrls.Add(item.Link)
 	}
 
 	if recordDirPath != "" {
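Because `FeedProgress` now stores a `*hashset.Set`, the on-disk record file format changes as well. Below is a minimal sketch of how such a record round-trips through `encoding/json`, assuming gods' built-in JSON support for sets (they serialize as a plain JSON array); the struct mirrors main.go and the URLs are hypothetical:

package main

import (
	"encoding/json"
	"fmt"

	"github.com/emirpasic/gods/sets/hashset"
)

// Mirrors the FeedProgress struct from main.go.
type FeedProgress struct {
	URL         string       `json:"url"`
	FetchedUrls *hashset.Set `json:"fetchedUrls"`
}

func main() {
	record := FeedProgress{
		URL:         "https://example.com/feed.xml", // hypothetical feed URL
		FetchedUrls: hashset.New(),
	}
	record.FetchedUrls.Add("https://example.com/post-1")

	// gods sets implement json.Marshaler, so the set serializes
	// as a JSON array of its values.
	data, err := json.Marshal(record)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(data))
	// {"url":"https://example.com/feed.xml","fetchedUrls":["https://example.com/post-1"]}

	// Reading the record back restores the set, so Contains works
	// for the continue-fetch check.
	var loaded FeedProgress
	if err := json.Unmarshal(data, &loaded); err != nil {
		panic(err)
	}
	fmt.Println(loaded.FetchedUrls.Contains("https://example.com/post-1")) // true
}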

