-
Notifications
You must be signed in to change notification settings - Fork 40
/
WebDocumentParser.cs
148 lines (137 loc) · 5.37 KB
/
WebDocumentParser.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
//BSD, 2014-present, WinterDev
//ArthurHub, Jose Manuel Menendez Poo
// "Therefore those skilled at the unorthodox
// are infinite as heaven and earth,
// inexhaustible as the great rivers.
// When they come to an end,
// they begin again,
// like the days and months;
// they die and are reborn,
// like the four seasons."
//
// - Sun Tsu,
// "The Art of War"
using System.Collections.Generic;
using LayoutFarm.WebDom;
using LayoutFarm.WebDom.Parser;
namespace LayoutFarm.Composers
{
/// <summary>
/// Supplies a stream of html nodes produced outside this library.
/// WebDocumentParser.ParseDocument enumerates the stream once, in order, to build a document.
/// </summary>
public abstract class ExternalHtmlTreeWalker
{
/// <summary>
/// Returns the nodes to replay. ParseDocument interprets each node by its
/// <see cref="ExternalHtmlNode.HtmlNodeKind"/>, so the order of the yielded nodes
/// determines the shape of the resulting tree.
/// </summary>
public abstract IEnumerable<ExternalHtmlNode> GetHtmlNodeIter();
}
/// <summary>
/// Kind of a node yielded by an <see cref="ExternalHtmlTreeWalker"/>.
/// The per-member notes describe how WebDocumentParser.ParseDocument reacts to each kind.
/// </summary>
public enum ExternalHtmlNodeKind
{
/// <summary>an element: created from HtmlElementName and added to the current parent</summary>
Element,
/// <summary>the document node: ParseDocument takes no action for it</summary>
Document,
/// <summary>a text node: created from CurrentTextNodeContent and added to the current parent</summary>
TextNode,
/// <summary>an attribute: read via GetAttributeNameAndValue and set on the most recently created element</summary>
Attribute,
/// <summary>structural marker: saves the current parent and makes the most recently created element (if any) the parent</summary>
EnterChildContext,//special, creates no dom node
/// <summary>structural marker: restores the parent saved by the matching EnterChildContext</summary>
ExitChildContext,//special, creates no dom node
}
/// <summary>
/// One item of the node stream yielded by an <see cref="ExternalHtmlTreeWalker"/>.
/// Which members are meaningful depends on <see cref="HtmlNodeKind"/>.
/// </summary>
public abstract class ExternalHtmlNode
{
/// <summary>
/// Not read by any code in this file; presumably the underlying node object of the
/// external html library (TODO confirm with implementers).
/// </summary>
public abstract object ActualHtmlNode { get; }
/// <summary>element name; read by ParseDocument when the kind is Element</summary>
public abstract string HtmlElementName { get; }
/// <summary>kind of this node; selects how ParseDocument handles it</summary>
public abstract ExternalHtmlNodeKind HtmlNodeKind { get; }
/// <summary>text content; read by ParseDocument when the kind is TextNode</summary>
public abstract string CurrentTextNodeContent { get; }
/// <summary>
/// Only referenced by commented-out debug output in this file, where it is used as an
/// indent width; presumably the nesting depth of the node (TODO confirm).
/// </summary>
public abstract int Level { get; }
/// <summary>
/// Outputs the attribute name and value; called by ParseDocument when the kind is Attribute.
/// </summary>
/// <param name="name">receives the attribute name</param>
/// <param name="value">receives the attribute value</param>
public abstract void GetAttributeNameAndValue(out string name, out string value);
}
/// <summary>
/// Entry points for building an html dom, either by parsing html source text
/// or by replaying a node stream from an <see cref="ExternalHtmlTreeWalker"/>.
/// Text parsers are pooled and reused between calls.
/// </summary>
public static class WebDocumentParser
{
    /// <summary>
    /// Parses html source text into a new document.
    /// </summary>
    /// <param name="htmlHost">host passed to the constructor of the new document</param>
    /// <param name="snapSource">the html source to parse</param>
    /// <returns>the newly created document</returns>
    public static HtmlDocument ParseDocument(LayoutFarm.HtmlBoxes.HtmlHost htmlHost, TextSource snapSource)
    {
        HtmlParser parser = GetHtmlParser();
        //------------------------
        HtmlDocument newdoc = new HtmlDocument(htmlHost);
        parser.Parse(snapSource, newdoc, newdoc.RootNode);
        //the parser goes back to the pool only after a successful parse;
        //a parser whose Parse() threw is simply dropped
        FreeHtmlParser(parser);
        return newdoc;
    }

    /// <summary>
    /// Builds a new document by replaying the node stream of an external tree walker.
    /// </summary>
    /// <param name="htmlHost">host passed to the constructor of the new document</param>
    /// <param name="externalTreeWalker">source of the node stream; enumerated once, in order</param>
    /// <returns>the newly created document</returns>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="externalTreeWalker"/> is null
    /// </exception>
    /// <exception cref="System.InvalidOperationException">
    /// the stream yields an Attribute before any Element, or an ExitChildContext
    /// that has no matching EnterChildContext
    /// </exception>
    public static HtmlDocument ParseDocument(LayoutFarm.HtmlBoxes.HtmlHost htmlHost, ExternalHtmlTreeWalker externalTreeWalker)
    {
        if (externalTreeWalker == null)
        {
            throw new System.ArgumentNullException(nameof(externalTreeWalker));
        }

        HtmlDocument newdoc = new HtmlDocument(htmlHost);
        //start from root; domElem is always the parent that receives new children
        HtmlElement domElem = (HtmlElement)newdoc.RootNode;
        //parents saved by EnterChildContext, restored by ExitChildContext
        Stack<HtmlElement> elemStack = new Stack<HtmlElement>();
        //most recently created element: target of Attribute nodes and
        //the parent selected by the next EnterChildContext
        HtmlElement newDomElem = null;

        foreach (ExternalHtmlNode node in externalTreeWalker.GetHtmlNodeIter())
        {
            switch (node.HtmlNodeKind)
            {
                case ExternalHtmlNodeKind.EnterChildContext:
                    {
                        elemStack.Push(domElem);
                        //when no element was created yet, stay on the current parent
                        if (newDomElem != null)
                        {
                            domElem = newDomElem;
                        }
                    }
                    break;
                case ExternalHtmlNodeKind.ExitChildContext:
                    {
                        if (elemStack.Count == 0)
                        {
                            throw new System.InvalidOperationException(
                                "ExitChildContext received without a matching EnterChildContext.");
                        }
                        domElem = elemStack.Pop();
                    }
                    break;
                case ExternalHtmlNodeKind.Attribute:
                    {
                        if (newDomElem == null)
                        {
                            throw new System.InvalidOperationException(
                                "Attribute node received before any Element node.");
                        }
                        node.GetAttributeNameAndValue(out string attrName, out string attrValue);
                        DomAttribute attr = newdoc.CreateAttribute(attrName, attrValue);
                        newDomElem.SetAttribute(attr);
                    }
                    break;
                case ExternalHtmlNodeKind.Element:
                    {
                        newDomElem = (HtmlElement)newdoc.CreateElement(node.HtmlElementName);
                        domElem.AddChild(newDomElem);
                    }
                    break;
                case ExternalHtmlNodeKind.TextNode:
                    {
                        DomTextNode textnode = newdoc.CreateTextNode(node.CurrentTextNodeContent.ToCharArray());
                        domElem.AddChild(textnode);
                    }
                    break;
                case ExternalHtmlNodeKind.Document:
                    //nothing to do, the document root already exists
                    break;
            }
        }
        return newdoc;
    }

    /// <summary>
    /// Parses html source text into an existing document, under the given parent element.
    /// </summary>
    /// <param name="snapSource">the html source to parse</param>
    /// <param name="htmldoc">
    /// target document; it is cast to LayoutFarm.WebDom.Impl.HtmlDocument,
    /// so any other implementation fails with an invalid cast
    /// </param>
    /// <param name="parentElement">element handed to the parser as the parse root</param>
    public static void ParseHtmlDom(TextSource snapSource, IHtmlDocument htmldoc, WebDom.DomElement parentElement)
    {
        HtmlParser parser = GetHtmlParser();
        //------------------------
        parser.Parse(snapSource, (LayoutFarm.WebDom.Impl.HtmlDocument)htmldoc, parentElement);
        //same policy as ParseDocument: pooled again only after a successful parse
        FreeHtmlParser(parser);
    }

    //idle parsers waiting for reuse; every access is guarded by s_sharedParserLock1
    static readonly Queue<HtmlParser> s_sharedParsers = new Queue<HtmlParser>();
    static readonly object s_sharedParserLock1 = new object();

    /// <summary>
    /// Takes an idle parser from the pool, or creates a new HtmlKit-based parser when the pool is empty.
    /// </summary>
    static HtmlParser GetHtmlParser()
    {
        lock (s_sharedParserLock1)
        {
            if (s_sharedParsers.Count == 0)
            {
                return HtmlParser.CreateHtmlParser(ParseEngineKind.HtmlKitParser);
            }
            return s_sharedParsers.Dequeue();
        }
    }

    /// <summary>
    /// Resets the parser and puts it back into the pool.
    /// </summary>
    /// <param name="parser">a parser previously obtained from GetHtmlParser</param>
    static void FreeHtmlParser(HtmlParser parser)
    {
        //reset before taking the lock, the parser is not shared at this point
        parser.ResetParser();
        lock (s_sharedParserLock1)
        {
            s_sharedParsers.Enqueue(parser);
        }
    }
}
}