From 6ffcaf409d40b4804bcf60c4ddea2410a2c8cac9 Mon Sep 17 00:00:00 2001
From: zoranbogoevski <zoran@orangemelon.com>
Date: Fri, 23 Feb 2024 13:56:27 +0100
Subject: [PATCH] Add per-domain XPaths, minimum image width and configurable image storage path

---
 README.md          | 21 +++++++++++++++++
 config/rssfeed.php | 18 +++++++++-----
 src/RssFeed.php    | 59 ++++++++++++++++++++++++----------------------
 3 files changed, 64 insertions(+), 34 deletions(-)

diff --git a/README.md b/README.md
index b18f834..e3af1f8 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,26 @@ php artisan vendor:publish --provider="Kalimeromk\Rssfeed\RssFeedServiceProvider
 
 This will publish a rssfeed.php config file to your config directory. Here you can set the XPaths for content elements.
 
+```php
+return [
+    'domain_xpaths' => [
+        [
+            'domain' => 'mistagogia.mk',
+            'content_element_xpaths' => [
+                '//div[@class="single_post"]',
+            ],
+        ],
+    ],
+    'min_image_width' => 300,
+    'image_storage_path' => 'images',
+];
+
+```
+### In this configuration file:
+
+* `domain_xpaths`: Defines the XPaths used to locate content elements for each domain, so the content of a feed item's page can be targeted precisely per site.
+* `min_image_width`: Sets the minimum width (in pixels) an image must have to be stored; narrower images are skipped.
+* `image_storage_path`: Specifies the path on the `public` disk where images from RSS feed items are stored.
 ## Credits
 
 This package was created by KalimeroMK.
@@ -78,3 +98,4 @@ use Kalimeromk\Rssfeed\Jobs\RssFeedJob;
 $feedUrls = ['https://example.com/rss'];
 
 RssFeedJob::dispatch($feedUrls);
+```
\ No newline at end of file
diff --git a/config/rssfeed.php b/config/rssfeed.php
index 0879a2a..5e12040 100644
--- a/config/rssfeed.php
+++ b/config/rssfeed.php
@@ -1,9 +1,15 @@
 <?php
+
 return [
-    'content_element_xpaths' => [
-        '//div[@class="post-content"]',
-        '//div[@class="article-body"]',
-        '//div[@class="td-post-content"]',
-        '//div[contains(concat(" ", normalize-space(@class), " "), " post-single-content ") and contains(concat(" ", normalize-space(@class), " "), " box ") and contains(concat(" ", normalize-space(@class), " "), " mark-links ")]',
+    'domain_xpaths' => [
+        [
+            'domain' => 'mistagogia.mk',
+            'content_element_xpaths' => [
+                '//div[@class="single_post"]',
+            ],
+        ],
     ],
-];
\ No newline at end of file
+    'min_image_width' => 300,
+    'image_storage_path' => 'images',
+];
+
diff --git a/src/RssFeed.php b/src/RssFeed.php
index bf151c2..60f5c6a 100644
--- a/src/RssFeed.php
+++ b/src/RssFeed.php
@@ -46,7 +46,7 @@ public function parseRssFeeds(array $feedUrls, $jobId = null): array
                 $fullContent = $this->retrieveFullContent($itemLink);
 
                 // Save the image to storage
-                $images = $this->saveImageToStorage($fullContent);
+                $images = $this->saveImagesToStorage($fullContent['images']);
 
                 // Add the extracted item data to the parsedItems array
                 $parsedItems[] = [
@@ -71,15 +71,16 @@ public function parseRssFeeds(array $feedUrls, $jobId = null): array
      * @return array|bool
      * @throws CantOpenFileFromUrlException
      */
-    public function saveImageToStorage(array $images): array
+    public function saveImagesToStorage(array $images): array
     {
         $savedImageNames = [];
+        $imageStoragePath = config('rssfeed.image_storage_path', 'images');
 
         foreach ($images as $image) {
-                $file = UrlUploadedFile::createFromUrl($image);
-                $imageName = Str::random(15) . '.' . $file->extension();
-                $file->storeAs('images', $imageName, 'public');
-                $savedImageNames[] = $imageName;
+            $file = UrlUploadedFile::createFromUrl($image);
+            $imageName = Str::random(15) . '.' . $file->extension();
+            $file->storeAs($imageStoragePath, $imageName, 'public');
+            $savedImageNames[] = $imageName;
 
         }
 
@@ -92,54 +93,54 @@ public function saveImageToStorage(array $images): array
      */
     public function retrieveFullContent(string $postLink): bool|array
     {
-        // Fetch the HTML content using cURL
-        $html = $this->fetchContentUsingCurl($postLink); // Use the previously defined cURL fetching function
-
-        if ($html === false) {
-            return false; // Handle the error as appropriate
-        }
-
-        // Load the HTML content into DOMDocument
+        $html = $this->fetchContentUsingCurl($postLink);
+        $parsedUrl = parse_url($postLink);
+        $host = $parsedUrl['host'] ?? '';
+        $domains = config('rssfeed.domain_xpaths');
+        $contentElementXPaths = collect($domains)->firstWhere('domain', $host)['content_element_xpaths'];
         $dom = new DOMDocument();
         @$dom->loadHTML($html);
-
-        // Use DOMXPath to work with the DOM
         $xpath = new DOMXPath($dom);
 
-        // Initialize an array to hold the image URLs
         $imageUrls = [];
         $selectedContent = '';
+        $minImageWidth = config('rssfeed.min_image_width', 600); // Retrieve the minimum image width from config
 
-        // Process each XPath query in the configuration
-        foreach ($config['content_element_xpaths'] as $xpathQuery) {
+        foreach ($contentElementXPaths as $xpathQuery) {
             $elements = $xpath->query($xpathQuery);
-
-            // Check if elements were found for the current XPath query
             if ($elements->length > 0) {
                 foreach ($elements as $element) {
-                    // Extract and concatenate the HTML of each matching element
                     $selectedContent .= $dom->saveHTML($element);
-
-                    // Find and store all <img> tags within the current element
                     $images = $xpath->query('.//img', $element);
                     foreach ($images as $img) {
                         $src = $img->getAttribute('src');
-                        $imageUrls[] = $src;
+                        // Get image dimensions
+                        list($width, $height) = getimagesize($src);
+                        if ($width >= $minImageWidth) {
+                            // Add to array if not already added
+                            if (!in_array($src, $imageUrls)) {
+                                $imageUrls[] = $src;
+                            }
+                        }
                     }
                 }
             }
         }
 
-        // Optionally, you might want to remove <script> and <style> from $selectedContent
         $selectedContent = preg_replace('/<script\b[^>]*>(.*?)<\/script>/is', "", $selectedContent);
         $selectedContent = preg_replace('/<style\b[^>]*>(.*?)<\/style>/is', "", $selectedContent);
 
-        // Return the content and images
         return [
             'content' => trim($selectedContent),
-            'images' => $imageUrls, // This is an array of image URLs found in the selected content
+            'images' => $imageUrls,
         ];
     }
+
+
+
+
+
+    // Fetch the content of a URL using cURL; returns false unless the HTTP status code is 200.
     private function fetchContentUsingCurl(string $url): bool|string
     {
         $ch = curl_init();
@@ -153,4 +154,6 @@ private function fetchContentUsingCurl(string $url): bool|string
 
         return $httpCode === 200 ? $data : false;
     }
+
+
 }