Skip to content

Commit

Permalink
Refactor of the HTML meta helper (#122)
Browse files Browse the repository at this point in the history
Move the HTML meta helper into a new class and refactor the actual parsing of title and description. The helper now uses one single call to get the HTML and then parses the title and description from it.
The helper also now handles invalid URLs correctly. This includes URLs without a proper protocol as they can't be correctly queried for the meta data.
  • Loading branch information
Kovah committed Apr 29, 2020
1 parent 89dfdf3 commit 0a1f6b3
Show file tree
Hide file tree
Showing 7 changed files with 216 additions and 98 deletions.
134 changes: 134 additions & 0 deletions app/Helper/HtmlMeta.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
<?php

namespace App\Helper;

use GuzzleHttp\Exception\RequestException;
use Illuminate\Http\Client\ConnectionException;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Facades\Log;

/**
 * Class HtmlMeta
 *
 * Fetches the HTML of a URL once and extracts the page title and description from it.
 *
 * @package App\Helper
 */
class HtmlMeta
{
    /**
     * Get the title and description of a URL.
     *
     * Returned array:
     * array [
     *   'success' => bool,
     *   'title' => string,
     *   'description' => string|null,
     * ]
     *
     * 'success' is false if the URL is invalid or its HTML could not be fetched. In that
     * case the title falls back to the host of the URL, or to the URL itself if no host
     * can be determined.
     *
     * @param $url
     * @return array
     */
    public static function getFromUrl($url): array
    {
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            return [
                'success' => false,
                'title' => $url,
                'description' => null,
            ];
        }

        $fallback = [
            'success' => false,
            // parse_url() returns null for valid URLs without a host component (e.g. "mailto:"),
            // so fall back to the URL itself to always provide a string title.
            'title' => parse_url($url, PHP_URL_HOST) ?: $url,
            'description' => null,
        ];

        $html = self::getHtmlContent($url);

        if ($html === null) {
            return $fallback;
        }

        $title = self::parseTitle($html);
        $metaTags = self::getMetaTags($html);

        // Prefer the standard description, then Open Graph, then Twitter
        $description = $metaTags['description']
            ?? $metaTags['og:description']
            ?? $metaTags['twitter:description']
            ?? $fallback['description'];

        return [
            'success' => true,
            'title' => $title ?? $fallback['title'],
            'description' => $description,
        ];
    }

    /**
     * Try to get the HTML content of a URL.
     * If a connection or request error occurs, a warning is flashed and logged and null
     * is returned. Null is also returned if the response was not successful. Otherwise
     * the response body is returned as a string.
     *
     * @param string $url
     * @return string|null
     */
    protected static function getHtmlContent(string $url): ?string
    {
        try {
            $response = Http::timeout(5)->get($url);
        } catch (ConnectionException $e) {
            flash(trans('link.added_connection_error'), 'warning');
            Log::warning($url . ': ' . $e->getMessage());

            return null;
        } catch (RequestException $e) {
            flash(trans('link.added_request_error'), 'warning');
            Log::warning($url . ': ' . $e->getMessage());

            return null;
        }

        if (!$response->successful()) {
            return null;
        }

        return $response->body();
    }

    /**
     * Parses the meta tags from HTML by using regular expressions.
     *
     * Every <meta> tag that carries a "name" or "property" attribute together with a
     * "content" attribute is collected, regardless of the attribute order and of whether
     * the values are double-quoted, single-quoted or unquoted. Keys are lower-cased so
     * lookups such as 'description' also match name="Description". If a key occurs more
     * than once, the last occurrence wins.
     *
     * Returns an array of all found meta tags or an empty array if no tags were found.
     *
     * @param string $html
     * @return array
     */
    protected static function getMetaTags(string $html): array
    {
        if (!preg_match_all('/<\s*meta\b([^>]*)>/si', $html, $tagMatches)) {
            return [];
        }

        // Group 1: attribute name, groups 2-4: double-quoted, single-quoted or unquoted value
        $attributePattern = '/([a-z][a-z0-9:_.-]*)\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))/si';
        $metaTags = [];

        foreach ($tagMatches[1] as $attributeString) {
            // Drop the slash of self-closing tags so it does not end up in an unquoted value
            $attributeString = rtrim($attributeString, "/ \t\r\n");

            if (!preg_match_all($attributePattern, $attributeString, $attributeMatches, PREG_SET_ORDER)) {
                continue;
            }

            $attributes = [];

            foreach ($attributeMatches as $match) {
                // Trailing unmatched groups are omitted by preg_match_all, so check from the last group
                $attributes[strtolower($match[1])] = $match[4] ?? $match[3] ?? $match[2];
            }

            $key = $attributes['name'] ?? $attributes['property'] ?? null;
            $content = $attributes['content'] ?? null;

            if ($key === null || $content === null) {
                continue;
            }

            $key = strtolower(trim($key));

            if ($key === '') {
                continue;
            }

            $metaTags[$key] = $content;
        }

        return $metaTags;
    }

    /**
     * Try to parse the title tag from the HTML by using regex.
     * The title tag may carry attributes. If a title tag was found, excessive whitespace
     * and newlines are removed from the string. Null is returned if no title tag was found
     * or if the title is empty, so the caller can apply its fallback.
     *
     * @param string $html
     * @return string|null
     */
    protected static function parseTitle(string $html): ?string
    {
        if (!preg_match('/<title\b[^>]*>(.*?)<\/title\s*>/si', $html, $titleMatches)) {
            return null;
        }

        $title = trim((string) preg_replace('/\s+/', ' ', $titleMatches[1]));

        return $title !== '' ? $title : null;
    }
}
84 changes: 3 additions & 81 deletions app/Helper/LinkAce.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,103 +2,25 @@

namespace App\Helper;

use GuzzleHttp\Exception\RequestException;
use Illuminate\Http\Client\ConnectionException;
use Illuminate\Support\Facades\Http;
use Illuminate\Support\Facades\Log;

/**
* Class LinkAce
*
* Static helper methods of the application: reading title and description
* meta data of a URL and generating the bookmarklet code.
*
* NOTE(review): this listing appears to contain both the removed and the added
* lines of a diff (see generateBookmarkletCode) - confirm the intended state.
*
* @package App\Helper
*/
class LinkAce
{
/**
* Get the title and description of a website from its URL.
*
* Returns an array with the keys 'title' and 'description'. If the URL is
* invalid, the URL itself is used as the title. If the page cannot be
* fetched or parsed, the host of the URL is used as the title. The
* description is null whenever no description meta tag was found.
*
* @param string $url
* @return array
*/
public static function getMetaFromURL(string $url): array
{
if (!filter_var($url, FILTER_VALIDATE_URL)) {
return [
'title' => $url,
'description' => null,
];
}

// Used whenever the page cannot be fetched or its meta tags cannot be read
$fallback = [
'title' => parse_url($url, PHP_URL_HOST),
'description' => null,
];

// Try to get the HTML content of that URL
try {
$response = Http::timeout(5)->get($url);
} catch (ConnectionException $e) {
flash(trans('link.added_connection_error'), 'warning');
Log::warning($url . ': ' . $e->getMessage());

return $fallback;
} catch (RequestException $e) {
flash(trans('link.added_request_error'), 'warning');
Log::warning($url . ': ' . $e->getMessage());

return $fallback;
}

if (!$response->successful()) {
return $fallback;
}

$html = $response->body();

if (empty($html)) {
return $fallback;
}

// Try to get the meta tags of that URL.
// get_meta_tags() is given the URL, not the HTML fetched above, so it opens the URL again.
try {
$tags = get_meta_tags($url);
} catch (\Exception $e) {
return $fallback;
}

// Parse the HTML for the title
$res = preg_match("/<title>(.*)<\/title>/siU", $html, $title_matches);

if ($res) {
// Clean up title: remove EOL's and excessive whitespace.
$title = preg_replace('/\s+/', ' ', $title_matches[1]);
$title = trim($title);
}

// Get the description or the og:description tag or the twitter:description tag
$description = $tags['description']
?? $tags['og:description']
?? $tags['twitter:description']
?? $fallback['description'];

// $title is only set if a title tag was found, otherwise the host is used
return [
'title' => $title ?? $fallback['title'],
'description' => $description,
];
}

/**
* Generate the code for the bookmarklet.
*
* Builds a "javascript:" snippet that opens a popup window with the current
* page's URL and title as the query parameters "u" and "t". The ##URL##
* placeholder is replaced with the URL of the 'bookmarklet-add' route.
*
* @return string
*/
public static function generateBookmarkletCode(): string
{
// NOTE(review): the following statements exist twice, once with $bm_code and once with
// $bmCode. This looks like the old and new side of a variable rename - confirm which to keep.
$bm_code = 'javascript:javascript:(function(){var%20url%20=%20location.href;' .
$bmCode = 'javascript:javascript:(function(){var%20url%20=%20location.href;' .
"var%20title%20=%20document.title%20||%20url;window.open('##URL##?u='%20+%20encodeURIComponent(url)" .
"+'&t='%20+%20encodeURIComponent(title),'_blank','menubar=no,height=720,width=600,toolbar=no," .
"scrollbars=yes,status=no,dialog=1');})();";

$bm_code = str_replace('##URL##', route('bookmarklet-add'), $bm_code);
$bmCode = str_replace('##URL##', route('bookmarklet-add'), $bmCode);

return $bm_code;
return $bmCode;
}
}
3 changes: 2 additions & 1 deletion app/Http/Controllers/App/ImportController.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

namespace App\Http\Controllers\App;

use App\Helper\HtmlMeta;
use App\Helper\LinkAce;
use App\Helper\LinkIconMapper;
use App\Http\Controllers\Controller;
Expand Down Expand Up @@ -60,7 +61,7 @@ public function doImport(DoImportRequest $request)
continue;
}

$linkMeta = LinkAce::getMetaFromURL($link['uri']);
$linkMeta = HtmlMeta::getFromUrl($link['uri']);

$title = $link['title'] ?: $linkMeta['title'];

Expand Down
9 changes: 8 additions & 1 deletion app/Repositories/LinkRepository.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

namespace App\Repositories;

use App\Helper\HtmlMeta;
use App\Helper\LinkAce;
use App\Helper\LinkIconMapper;
use App\Jobs\SaveLinkToWaybackmachine;
Expand Down Expand Up @@ -29,13 +30,19 @@ class LinkRepository
*/
public static function create(array $data): Link
{
$linkMeta = LinkAce::getMetaFromURL($data['url']);
$linkMeta = HtmlMeta::getFromUrl($data['url']);

$data['title'] = $data['title'] ?: $linkMeta['title'];
$data['description'] = $data['description'] ?: $linkMeta['description'];
$data['user_id'] = auth()->user()->id;
$data['icon'] = LinkIconMapper::mapLink($data['url']);

// If the meta helper was not successful, disable future checks and set the status to broken
if ($linkMeta['success'] === false) {
$data['check_disabled'] = true;
$data['status'] = Link::STATUS_BROKEN;
}

$link = Link::create($data);

if (isset($data['tags'])) {
Expand Down
21 changes: 21 additions & 0 deletions tests/Feature/Controller/Models/LinkControllerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,27 @@ public function testFullStoreRequest(): void
$this->assertEquals($tag->name, $databaseLink->tags->first()->name);
}

/**
 * Storing a link with a URL that has no protocol must still create the link,
 * but with disabled checks, the broken status and the URL as its title.
 */
public function testStoreRequestWithInvalidUrl(): void
{
    $payload = [
        'url' => 'example.com',
        'title' => null,
        'description' => null,
        'lists' => null,
        'tags' => null,
        'is_private' => '0',
    ];

    $this->post('links', $payload)
        ->assertStatus(302)
        ->assertRedirect('links/1');

    $storedLink = Link::first();

    $this->assertTrue($storedLink->check_disabled);
    $this->assertEquals(Link::STATUS_BROKEN, $storedLink->status);
    $this->assertEquals('example.com', $storedLink->title);
}

public function testStoreRequestWithContinue(): void
{
$response = $this->post('links', [
Expand Down
Loading

0 comments on commit 0a1f6b3

Please sign in to comment.