-
Notifications
You must be signed in to change notification settings - Fork 1
/
WWW.php
executable file
·102 lines (96 loc) · 2.92 KB
/
WWW.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
<?php
/**
* WWW parser.
*
* @addtogroup Parsers
* @author Michał "Hołek" Połtyn
* @copyright © 2009 Michał Połtyn
* @license GNU General Public License 2.0 or later
*/
class WWW extends Parser {
    /**
     * Host-like name matched out of the fetched URL; exposed as '__refname'.
     *
     * @var string|null
     */
    private $refname;

    /**
     * URL of the last page whose title was parsed successfully; exposed as
     * 'url' and '__sourceurl'.
     *
     * @var string|null
     */
    private $url;

    /**
     * Opens $url, reads the beginning of the document and extracts its
     * <title>, converting it to UTF-8 when a single-byte charset is declared.
     *
     * Results are stored on the object rather than returned:
     *  - $this->title   parsed title, or false when nothing usable was found
     *  - $this->url     set to $url when a title was found
     *  - $this->refname first host-like token found in $url (null if none)
     *  - $this->errors  gets array('WWW-error', $url) when $url cannot be opened
     *
     * $this->title and $this->errors are not declared in this class;
     * presumably they are inherited from Parser.
     *
     * @param string $url anything fopen() can open for reading
     * @return void
     */
    public function fetch($url)
    {
        $fh = fopen($url, 'r');
        if (!$fh) { // resource could not be opened
            $this->errors[] = array('WWW-error', $url);
            $this->title = false;
            return;
        }

        $foundTitle = false;   // a closing </title> has been seen
        $foundCharset = false; // a charset declaration has been seen
        $text = '';            // everything read so far

        // Read in chunks of at most 20 KiB, each ending at '</head>' when it occurs.
        // Reading stops as soon as EITHER marker has been seen (or at end of stream).
        // NOTE(review): possibly "until both are found" was intended - confirm
        // before changing, as that would read whole documents lacking a charset.
        while ((!$foundTitle && !$foundCharset) && !feof($fh)) {
            $line = stream_get_line($fh, 20 * 1024, '</head>');
            if ($line === false) {
                // Read failure without EOF: bail out instead of spinning forever.
                break;
            }
            $text .= $line;

            // stripos() returns 0 for a match at the very start, so compare with false.
            if (stripos($line, '</title>') !== false) {
                $foundTitle = true;
            }
            if (stripos($line, 'content="text/html; charset=') !== false
                || preg_match('/<\?xml(.*?) encoding="/is', $line)) {
                $foundCharset = true;
            }
        }
        fclose($fh);

        // The 's' modifier lets the title span several lines; whitespace runs
        // (spaces, tabs, line breaks) are collapsed into a single space.
        $rawTitle = preg_match('/<title>(.*?)<\/title>/is', $text, $titleMatch) ? $titleMatch[1] : '';
        $this->title = trim(strip_tags(preg_replace('/[ \t\r\n]+/', ' ', $rawTitle)));

        $charset = 'UTF-8'; // default when nothing usable is declared
        if ($foundCharset) {
            $declared = '';
            if (preg_match('/("|\')text\/html;\s+charset=(.*?)("|\')/is', $text, $metaMatch)) {
                $declared = strtoupper($metaMatch[2]);
            }
            if (!strlen($declared)
                && preg_match('/<\?xml(.*?) encoding=("|\')(.*?)("|\')/is', $text, $xmlMatch)) {
                $declared = strtoupper($xmlMatch[3]);
            }
            if (strlen($declared)) {
                $charset = $declared; // keep the UTF-8 default if extraction failed
            }

            // Charsets in this list are left untouched; every other charset is
            // treated as single-byte: cleaned of control bytes and converted.
            $untouched = array('UTF-8','UTF-7','UTF-16','UTF-32','BIG-5','EUC-JP','EUC-KR','EUC-TW','JIS','ISO-2022-JP','ISO-2022-JP-MS');
            if (!in_array($charset, $untouched, true)) {
                // Byte-wise removal (no /u modifier) of bytes 0-31 and 127-159.
                // NOTE(review): in Windows-125x code pages bytes 0x80-0x9F are
                // printable characters, so this may drop real letters - confirm.
                $this->title = preg_replace('/[\x00-\x1F\x7F-\x9F]/', '', $this->title);
                // iconv() returns false on failure; the check below then
                // reports the title as missing.
                $this->title = iconv($charset, 'UTF-8//TRANSLIT', $this->title);
            }
        }

        // The reference name depends only on the URL, so it is computed
        // regardless of whether a charset declaration was found.
        $this->refname = preg_match('/((?:[a-z][a-z\.\d\-]+)\.(?:[a-z][a-z\-]+))(?![\w\.])/is', $url, $urlParts)
            ? $urlParts[1]
            : null;

        if (is_string($this->title) && $this->title !== '') {
            $this->url = $url; // remember the source; the parsed title is kept as is
        } else {
            $this->title = false;
        }
    }

    /**
     * URL info getter
     *
     * Returns the values gathered by fetch(). The order of the keys is the
     * order in which the fields are shown in the generated template.
     *
     * @return array with keys 'title', 'url', '__refname' and '__sourceurl'
     */
    public function getOutput()
    {
        // Here you can sort which fields are meant to be shown first
        // at the generated template. Simply the first one goes first. ;)
        return array(
            'title' => $this->title,
            'url' => $this->url,
            '__refname' => $this->refname,
            '__sourceurl' => $this->url
        );
    }
}
?>