From 6ffcaf409d40b4804bcf60c4ddea2410a2c8cac9 Mon Sep 17 00:00:00 2001 From: zoranbogoevski Date: Fri, 23 Feb 2024 13:56:27 +0100 Subject: [PATCH] Add some new features --- README.md | 21 +++++++++++++++++ config/rssfeed.php | 18 +++++++++----- src/RssFeed.php | 59 ++++++++++++++++++++++++---------------------- 3 files changed, 64 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index b18f834..e3af1f8 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,26 @@ php artisan vendor:publish --provider="Kalimeromk\Rssfeed\RssFeedServiceProvider This will publish a rssfeed.php config file to your config directory. Here you can set the XPaths for content elements. +```php +return [ + 'domain_xpaths' => [ + [ + 'domain' => 'mistagogia.mk', + 'content_element_xpaths' => [ + '//div[@class="single_post"]', + ], + ], + ], + 'min_image_width' => 300, + 'image_storage_path' => 'images', +]; + +``` +### In this configuration file: + +* domain_xpaths: Defines specific XPaths for content elements based on the domain. This allows for precise targeting of content within the RSS feed items for each domain. +* min_image_width: Sets the minimum width for images to be considered for storage, ensuring that only images of adequate size are saved. +* image_storage_path: Specifies the path where images from RSS feed items should be stored. ## Credits This package was created by KalimeroMK. @@ -78,3 +98,4 @@ use Kalimeromk\Rssfeed\Jobs\RssFeedJob; $feedUrls = ['https://example.com/rss']; RssFeedJob::dispatch($feedUrls); +``` \ No newline at end of file diff --git a/config/rssfeed.php b/config/rssfeed.php index 0879a2a..5e12040 100644 --- a/config/rssfeed.php +++ b/config/rssfeed.php @@ -1,9 +1,15 @@ [ - '//div[@class="post-content"]', - '//div[@class="article-body"]', - '//div[@class="td-post-content"]', - '//div[contains(concat(" ", normalize-space(@class), " "), " post-single-content ") and contains(concat(" ", normalize-space(@class), " "), " box ") and contains(concat(" ", normalize-space(@class), " "), " mark-links ")]', + 'domain_xpaths' => [ + [ + 'domain' => 'mistagogia.mk', + 'content_element_xpaths' => [ + '//div[@class="single_post"]', + ], + ], ], -]; \ No newline at end of file + 'min_image_width' => 300, + 'image_storage_path' => 'images', +]; + diff --git a/src/RssFeed.php b/src/RssFeed.php index bf151c2..60f5c6a 100644 --- a/src/RssFeed.php +++ b/src/RssFeed.php @@ -46,7 +46,7 @@ public function parseRssFeeds(array $feedUrls, $jobId = null): array $fullContent = $this->retrieveFullContent($itemLink); // Save the image to storage - $images = $this->saveImageToStorage($fullContent); + $images = $this->saveImagesToStorage($fullContent['images']); // Add the extracted item data to the parsedItems array $parsedItems[] = [ @@ -71,15 +71,16 @@ public function parseRssFeeds(array $feedUrls, $jobId = null): array * @return array|bool * @throws CantOpenFileFromUrlException */ - public function saveImageToStorage(array $images): array + public function saveImagesToStorage(array $images): array { $savedImageNames = []; + $imageStoragePath = config('rssfeed.image_storage_path', 'images'); foreach ($images as $image) { - $file = UrlUploadedFile::createFromUrl($image); - $imageName = Str::random(15) . '.' . $file->extension(); - $file->storeAs('images', $imageName, 'public'); - $savedImageNames[] = $imageName; + $file = UrlUploadedFile::createFromUrl($image); + $imageName = Str::random(15) . '.' . $file->extension(); + $file->storeAs($imageStoragePath, $imageName, 'public'); + $savedImageNames[] = $imageName; } @@ -92,54 +93,54 @@ public function saveImageToStorage(array $images): array */ public function retrieveFullContent(string $postLink): bool|array { - // Fetch the HTML content using cURL - $html = $this->fetchContentUsingCurl($postLink); // Use the previously defined cURL fetching function - - if ($html === false) { - return false; // Handle the error as appropriate - } - - // Load the HTML content into DOMDocument + $html = $this->fetchContentUsingCurl($postLink); + $parsedUrl = parse_url($postLink); + $host = $parsedUrl['host'] ?? ''; + $domains = config('rssfeed.domain_xpaths'); + $contentElementXPaths = collect($domains)->firstWhere('domain', $host)['content_element_xpaths']; $dom = new DOMDocument(); @$dom->loadHTML($html); - - // Use DOMXPath to work with the DOM $xpath = new DOMXPath($dom); - // Initialize an array to hold the image URLs $imageUrls = []; $selectedContent = ''; + $minImageWidth = config('rssfeed.min_image_width', 600); // Retrieve the minimum image width from config - // Process each XPath query in the configuration - foreach ($config['content_element_xpaths'] as $xpathQuery) { + foreach ($contentElementXPaths as $xpathQuery) { $elements = $xpath->query($xpathQuery); - - // Check if elements were found for the current XPath query if ($elements->length > 0) { foreach ($elements as $element) { - // Extract and concatenate the HTML of each matching element $selectedContent .= $dom->saveHTML($element); - - // Find and store all tags within the current element $images = $xpath->query('.//img', $element); foreach ($images as $img) { $src = $img->getAttribute('src'); - $imageUrls[] = $src; + // Get image dimensions + list($width, $height) = getimagesize($src); + if ($width >= $minImageWidth) { + // Add to array if not already added + if (!in_array($src, $imageUrls)) { + $imageUrls[] = $src; + } + } } } } } - // Optionally, you might want to remove