Skip to content

Commit

Permalink
Add some new features
Browse files Browse the repository at this point in the history
  • Loading branch information
zoranbogoevski committed Feb 23, 2024
1 parent 266d7d9 commit 6ffcaf4
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 34 deletions.
21 changes: 21 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,26 @@ php artisan vendor:publish --provider="Kalimeromk\Rssfeed\RssFeedServiceProvider

This publishes a `rssfeed.php` config file to your `config` directory. In it you can set the per-domain XPaths used to locate content elements, the minimum image width, and the image storage path.

```php
return [
    // One entry per source domain: the XPaths that locate the article body.
    'domain_xpaths' => [
        [
            'domain' => 'mistagogia.mk',
            'content_element_xpaths' => [
                '//div[@class="single_post"]',
            ],
        ],
    ],

    // Images narrower than this are not stored.
    'min_image_width' => 300,

    // Path under which downloaded images are stored.
    'image_storage_path' => 'images',
];

```
### In this configuration file:

* `domain_xpaths`: Defines the XPaths for content elements on a per-domain basis, so the content of each domain's RSS feed items can be targeted precisely.
* `min_image_width`: Sets the minimum width, in pixels, an image must have to be stored, so that only adequately sized images are saved.
* `image_storage_path`: Specifies the path where images from RSS feed items are stored.

## Credits

This package was created by KalimeroMK.
Expand Down Expand Up @@ -78,3 +98,4 @@ use Kalimeromk\Rssfeed\Jobs\RssFeedJob;
$feedUrls = ['https://example.com/rss'];

RssFeedJob::dispatch($feedUrls);
```
18 changes: 12 additions & 6 deletions config/rssfeed.php
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
<?php

return [
    // Extraction rules per source domain. `domain` is compared with the host
    // of each feed item's link; `content_element_xpaths` lists the XPath
    // queries whose matching elements make up the item's full content.
    'domain_xpaths' => [
        [
            'domain' => 'mistagogia.mk',
            'content_element_xpaths' => [
                '//div[@class="single_post"]',
            ],
        ],
    ],

    // Minimum width an image found in the content must have to be kept.
    'min_image_width' => 300,

    // Path (on the "public" disk) under which kept images are stored.
    'image_storage_path' => 'images',
];

59 changes: 31 additions & 28 deletions src/RssFeed.php
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public function parseRssFeeds(array $feedUrls, $jobId = null): array
$fullContent = $this->retrieveFullContent($itemLink);

// Save the image to storage
$images = $this->saveImageToStorage($fullContent);
$images = $this->saveImagesToStorage($fullContent['images']);

// Add the extracted item data to the parsedItems array
$parsedItems[] = [
Expand All @@ -71,15 +71,16 @@ public function parseRssFeeds(array $feedUrls, $jobId = null): array
* @return array|bool
* @throws CantOpenFileFromUrlException
*/
public function saveImageToStorage(array $images): array
public function saveImagesToStorage(array $images): array
{
$savedImageNames = [];
$imageStoragePath = config('rssfeed.image_storage_path', 'images');

foreach ($images as $image) {
$file = UrlUploadedFile::createFromUrl($image);
$imageName = Str::random(15) . '.' . $file->extension();
$file->storeAs('images', $imageName, 'public');
$savedImageNames[] = $imageName;
$file = UrlUploadedFile::createFromUrl($image);
$imageName = Str::random(15) . '.' . $file->extension();
$file->storeAs($imageStoragePath, $imageName, 'public');
$savedImageNames[] = $imageName;

}

Expand All @@ -92,54 +93,54 @@ public function saveImageToStorage(array $images): array
*/
public function retrieveFullContent(string $postLink): bool|array
{
    // Downloads the page at $postLink, keeps only the HTML matched by the
    // XPaths configured for the link's host (`rssfeed.domain_xpaths`), and
    // collects the URLs of the images inside that HTML that are wide enough.
    //
    // Returns false when the page cannot be fetched; otherwise an array with
    //   'content' => matched HTML with <script>/<style> blocks removed,
    //   'images'  => unique image URLs at least `rssfeed.min_image_width` wide.
    // A host without a configured entry yields empty content and no images.

    $html = $this->fetchContentUsingCurl($postLink);

    // DOMDocument::loadHTML() refuses an empty document, so an empty body is
    // handled the same way as a failed request.
    if (!is_string($html) || $html === '') {
        return false;
    }

    $parsedUrl = parse_url($postLink);
    $host = $parsedUrl['host'] ?? '';

    // firstWhere() returns null for an unknown host; `??` turns that (and a
    // missing key) into "no XPaths" instead of a null dereference.
    $domainConfig = collect(config('rssfeed.domain_xpaths', []))->firstWhere('domain', $host);
    $contentElementXPaths = $domainConfig['content_element_xpaths'] ?? [];

    // Real-world HTML is rarely valid; collect libxml's complaints internally
    // rather than silencing them with `@`, then restore the previous setting.
    $previousLibxmlState = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlState);

    $xpath = new DOMXPath($dom);

    $imageUrls = [];
    $selectedContent = '';
    $minImageWidth = config('rssfeed.min_image_width', 600);

    foreach ($contentElementXPaths as $xpathQuery) {
        $elements = $xpath->query($xpathQuery);

        // query() yields false for a malformed expression.
        if ($elements === false) {
            continue;
        }

        foreach ($elements as $element) {
            $selectedContent .= $dom->saveHTML($element);

            foreach ($xpath->query('.//img', $element) as $img) {
                $src = $img->getAttribute('src');

                // Skip empty sources and URLs already accepted, before paying
                // for another download of the image.
                if ($src === '' || in_array($src, $imageUrls, true)) {
                    continue;
                }

                // getimagesize() emits a warning and returns false for
                // unreachable or non-image URLs; one broken image must not
                // abort the whole article, so the warning is suppressed and
                // the false result is checked explicitly.
                $size = @getimagesize($src);

                if ($size !== false && $size[0] >= $minImageWidth) {
                    $imageUrls[] = $src;
                }
            }
        }
    }

    // Strip <script> and <style> blocks; preg_replace() returns null on a
    // PCRE failure, in which case the unstripped content is kept.
    $selectedContent = preg_replace(
        [
            '/<script\b[^>]*>(.*?)<\/script>/is',
            '/<style\b[^>]*>(.*?)<\/style>/is',
        ],
        '',
        $selectedContent,
    ) ?? $selectedContent;

    return [
        'content' => trim($selectedContent),
        'images' => $imageUrls,
    ];
}





// The cURL fetching function from previous examples
private function fetchContentUsingCurl(string $url): bool|string
{
$ch = curl_init();
Expand All @@ -153,4 +154,6 @@ private function fetchContentUsingCurl(string $url): bool|string

return $httpCode === 200 ? $data : false;
}


}

0 comments on commit 6ffcaf4

Please sign in to comment.