In [246]:
from pypeg2 import *

#tools

class Digit(int):
    """Grammar element matching exactly one decimal digit."""

    # DIGIT = %x30-39
    grammar = re.compile(r"[0-9]")

class Alpha(str):
    """Grammar element matching exactly one ASCII letter (either case)."""

    # ALPHA = %x41-5A / %x61-7A
    grammar = re.compile(r"[a-zA-Z]")
    
class HexDig:
    """One hexadecimal digit: a ``Digit`` or one of the upper-case letters A-F."""

    # Ordered choice: Digit first, then each upper-case hex letter as a literal.
    grammar = [Digit] + list("ABCDEF")
    
class SubDelimiter(Keyword):
    """RFC 3986 ``sub-delims``: one of ``! $ & ' ( ) * + , ; =``."""

    # pyPEG2 scans Symbol/Keyword classes with their ``regex`` class attribute,
    # whose default only matches word characters. Without this override none of
    # the punctuation values listed in the Enum below could ever be parsed.
    regex = re.compile(r"[!$&'()*+,;=]")

    grammar = Enum(
        K("!"), K("$"), K("&"), K("'"), K("("), K(")"),
        K("*"), K("+"), K(","), K(";"), K("="),
    )
    
class Unreserved(str):
    """RFC 3986 ``unreserved`` character: a letter, a digit, or one of ``- . _ ~``."""

    # Ordered choice between the two character classes and the four literal marks.
    grammar = [Alpha, Digit] + list("-._~")
    
class PctEncoded:
    """Percent-encoded octet: a literal ``%`` followed by two ``HexDig`` elements."""

    grammar = (
        "%",
        HexDig,
        HexDig,
    )
    
class PChar(str):
    """RFC 3986 ``pchar``: unreserved, pct-encoded, sub-delim, ``:`` or ``@``."""

    grammar = [
        Unreserved,
        PctEncoded,
        SubDelimiter,
        ":",
        "@",
    ]
#endtools

class Method(Keyword):
    """HTTP request method, restricted to the eight keywords listed below."""

    # One keyword per supported method, created in the same order as before.
    grammar = Enum(*map(K, (
        "GET",
        "HEAD",
        "POST",
        "PUT",
        "DELETE",
        "TRACE",
        "CONNECT",
        "OPTIONS",
    )))
    
class DecOctet(str):
    """RFC 3986 ``dec-octet``: a decimal number in the range 0-255, no leading zeros."""

    # A single regex with the longest alternatives first. The previous ordered
    # choice tried the one-digit alternative first, so a multi-digit octet such
    # as "192" only ever consumed "1" and the following "." could not match.
    # Using a regex also stops whitespace being skipped between the digits.
    grammar = re.compile(r"25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9]")
    
class IPAddress:
    """Dotted-quad IPv4 address; each octet is stored as ``ip_field_1`` .. ``ip_field_4``."""

    grammar = (
        attr("ip_field_1", DecOctet),
        ".",
        attr("ip_field_2", DecOctet),
        ".",
        attr("ip_field_3", DecOctet),
        ".",
        attr("ip_field_4", DecOctet),
    )

# reg-name    = *( unreserved / pct-encoded / sub-delims )   (RFC 3986, section 3.2.2)
class RegName(str):
    """RFC 3986 ``reg-name``: ``*( unreserved / pct-encoded / sub-delims )``.

    The name may be empty, as the RFC allows.
    """

    # First alternative: unreserved characters plus the sub-delims.
    # Second alternative: "%" followed by exactly two hex digits, either case.
    # The previous pattern accepted a single hex digit after "%" and had no
    # sub-delims at all.
    grammar = re.compile(r"([a-zA-Z0-9\-._~!$&'()*+,;=]|%[0-9A-Fa-f]{2})*")
    
class UserInfo:
    """User-info part of an authority: one or more of the alternatives below."""

    grammar = some(
        [
            Unreserved,
            PctEncoded,
            SubDelimiter,
            ":",
        ]
    )
    
class Port(str):
    """Port number written as a non-empty run of decimal digits."""

    # No range check is done here; any digit sequence is accepted.
    grammar = re.compile(r"[0-9]+")

# A regex has to be used here because `some` takes whitespace into account
# Segment = *pchar
class Segment(str):
    """One path segment: ``*pchar`` (RFC 3986), possibly empty."""

    # pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
    # A percent escape must be "%" plus exactly two hex digits (either case).
    # The previous pattern accepted a single hex digit and rejected lower-case
    # hex in the first position.
    grammar = re.compile(r"([a-zA-Z0-9\-._~:@!$&'()*+,;=]|%[0-9A-Fa-f]{2})*")
    
class Path(List):
    """Absolute path: one or more ``/`` + ``Segment`` pairs, collected as a list."""

    grammar = some(
        "/",
        Segment,
    )
    
class Authority:
    """URI authority: ``[userinfo "@"] host [":" port]``.

    Parsed parts are stored in the ``userinfo``, ``host`` and ``port`` attributes;
    the host is tried as an ``IPAddress`` first and as a ``RegName`` otherwise.
    """

    grammar = (
        optional(attr("userinfo", UserInfo), "@"),
        attr("host", [IPAddress, RegName]),
        optional(":", attr("port", Port)),
    )

class HierPart:
    """Hierarchical part of a URI.

    Either ``"//" authority [path]`` or a bare path; results are stored in the
    ``authority`` and ``path`` attributes.
    """

    grammar = [
        # Form with an authority, optionally followed by a path.
        (
            "//",
            attr("authority", Authority),
            optional(attr("path", Path)),
        ),
        # Path-only form.
        attr("path", Path),
    ]
    
class Scheme:
    """URI scheme: a letter followed by any number of letters, digits, ``+``, ``-`` or ``.``."""

    grammar = (
        Alpha,
        maybe_some([Alpha, Digit] + list("+-.")),
    )
    
# A regex has to be used here because `some` takes whitespace into account
class Query_Fragment(str):
    """Query or fragment component: ``*( pchar / "/" / "?" )`` (RFC 3986), possibly empty."""

    # The character class covers unreserved, sub-delims, ":" and "@" (i.e. pchar)
    # plus "/" and "?". The previous pattern lacked the sub-delims, ":" and "@",
    # so a common query such as "a=1&b=2" was rejected.
    # A percent escape is "%" plus exactly two hex digits (either case).
    grammar = re.compile(r"([a-zA-Z0-9\-._~!$&'()*+,;=:@/?]|%[0-9A-Fa-f]{2})*")
    
class URI:
    """Complete URI: ``[scheme ":"] hier-part ["?" query] ["#" fragment]``.

    Parsed parts are stored in the ``scheme``, ``hier_part``, ``query`` and
    ``fragment`` attributes.
    """

    grammar = (
        optional(attr("scheme", Scheme), ":"),
        attr("hier_part", HierPart),
        optional(("?", attr("query", Query_Fragment))),
        optional(("#", attr("fragment", Query_Fragment))),
    )

class HTTPVersion(str):
    """Protocol version: the literal ``HTTP/`` followed by ``Digit "." Digit``."""

    grammar = (
        "HTTP/",
        Digit,
        ".",
        Digit,
    )
    
class ReqLine:
    """Request line: method, request URI and HTTP version, then an end of line.

    The three parts are stored in the ``method``, ``request_uri`` and
    ``http_version`` attributes.
    """

    grammar = (
        attr("method", Method),
        blank,
        attr("request_uri", URI),
        blank,
        attr("http_version", HTTPVersion),
        endl,
    )
    
#MESSAGE_HEADER
# token = [^separators]+
class Token(str):
    """HTTP ``token``: one or more characters that are neither CTLs nor separators."""

    # Excluded: control characters and space (\x00-\x20), DEL (\x7f) and the
    # separators ( ) < > @ , ; : \ " / [ ] ? = { }.
    # The previous pattern did not exclude whitespace or control characters, so
    # a token could run across spaces and line breaks into the next header.
    grammar = re.compile(r'[^\x00-\x20\x7f()<>@,;:\\"/\[\]?={}]+')

"""
quoted-string   = <"> *(qdtext) <">
qdtext          = [^"]+
"""
class QuotedString:
    """HTTP ``quoted-string``: text between two double quotes, possibly empty."""

    # The previous pattern was the plain string "r[^\"]+" (the raw-string prefix
    # had ended up inside the literal), so it only matched text starting with
    # the letter "r". The content is any run of characters other than a double
    # quote; "*" lets the empty quoted string "" parse as well.
    grammar = '"', re.compile(r'[^"]*'), '"'

"""
qvalue = ( "0" [ "." DIGIT+ ] )
       | ( "1" [ "." "0"+ ]   )
"""
class QValue(str):
    """Quality value: ``0`` with optional decimals, or ``1`` with optional zero decimals."""

    # qvalue = ( "0" [ "." DIGIT+ ] ) | ( "1" [ "." "0"+ ] )
    # The dot after "1" is now escaped and followed by one or more zeros; the
    # previous "(.0)+" accepted any character in place of the dot (e.g. "1x0")
    # and rejected "1.00".
    grammar = re.compile(r"(0(\.[0-9]+)?|1(\.0+)?)")

"""
parameter       = attribute "=" value
attribute       = token
value           = token | quoted-string
"""
class Parameter(str):
    """Single ``attribute=value`` pair; the value is a ``Token`` or a ``QuotedString``.

    Both sides are stored in the ``attribute`` and ``value`` attributes.
    """

    grammar = (
        attr("attribute", Token),
        "=",
        attr("value", [Token, QuotedString]),
    )
    
class Parameters(List):
    """List of one or more consecutive ``Parameter`` elements."""

    grammar = some(
        Parameter
    )

class Accept(str):
    """``Accept`` header: ``Accept: type/subtype [parameters] [;q=qvalue]``.

    The media type parts are stored in ``type`` and ``subtype``; the optional
    quality value is stored in ``q_value``.
    """

    grammar = (
        "Accept",
        ":",
        attr("type", Token),
        "/",
        attr("subtype", Token),
        optional(Parameters),
        attr("q_value", optional(";q=", QValue)),
    )

# Single charset entry (with optional q-value); used by AcceptCharset below
class Charset(str):
    """Charset name (word groups joined by ``-``) with an optional ``;q=`` quality value.

    The quality value, when present, is stored in the ``q_value`` attribute.
    """

    grammar = (
        re.compile(r"\w+(-\w+)*"),
        attr("q_value", optional(";q=", QValue)),
    )
# RFC 2978
# Original note: only the 3 most commonly used charsets are kept
# (NOTE: the Charset regex itself accepts any charset name)
class AcceptCharset(List):
    """``Accept-Charset`` header: a comma-separated list of ``Charset`` entries."""

    grammar = (
        "Accept-Charset",
        ":",
        csl(Charset),
    )
    
class LanguageRange(str):
    """Language range with an optional ``;q=`` quality value.

    Accepts the ``*`` wildcard or a primary tag of 1-8 word characters followed
    by any number of ``-subtag`` parts. The quality value, when present, is
    stored in the ``q_value`` attribute.
    """

    # The previous pattern allowed at most one subtag and had no wildcard, so
    # ranges such as "zh-Hant-TW" or "*" could not be parsed in full.
    grammar = (
        re.compile(r"\*|\w{1,8}(-\w{1,8})*"),
        attr("q_value", optional(";q=", QValue)),
    )
    
class AcceptLanguage(List):
    """``Accept-Language`` header: a comma-separated list of ``LanguageRange`` entries."""

    grammar = (
        "Accept-Language",
        ":",
        csl(LanguageRange),
    )

class Authorization:
    """``Authorization`` header; the value (a single word) is stored in ``credentials``."""

    grammar = (
        "Authorization",
        ":",
        attr("credentials", word),
    )
    
class Expect:
    """``Expect`` header; only the literal value ``100-Continue`` is accepted."""

    grammar = (
        "Expect",
        ":",
        attr("value", "100-Continue"),
    )
    
# basic regex for email
class From(str):
    """``From`` header; the e-mail address is stored in the ``mailbox`` attribute."""

    # Deliberately basic e-mail pattern: local part, "@", then two or more
    # dot-separated domain labels.
    # No "^"/"$" anchors: the pattern is applied at the current position of the
    # remaining input, and "$" only matches at the very end of that input, so
    # the anchored version failed whenever another header followed this one.
    grammar = (
        "From",
        ":",
        attr("mailbox", re.compile(r"[^@\s]+@[^@\s.]+(\.[^@\s.]+)+")),
    )

class Host(str):
    """``Host`` header: a host (``IPAddress`` tried first, then ``RegName``) and optional port.

    The parts are stored in the ``host`` and ``port`` attributes.
    """

    grammar = (
        "Host",
        ":",
        attr("host", [IPAddress, RegName]),
        attr("port", optional(":", Port)),
    )
    
class IfMatch:
    """``If-Match`` header; the value (a single word) is stored in ``credentials``."""

    # NOTE(review): the attribute is named "credentials", the same name used by
    # Authorization - confirm whether that name is intended for this header.
    grammar = (
        "If-Match",
        ":",
        attr("credentials", word),
    )
    
class ProxyAuthorization():
    """``Proxy-Authorization`` header; the value (a single word) is stored in ``credentials``."""

    # The attribute name was previously misspelled "crendentials"; it now
    # matches the name used by Authorization.
    grammar = (
        "Proxy-Authorization",
        ":",
        attr("credentials", word),
    )
    
class RequestHeader(List):
    """List of one or more request headers, each being one of the supported header types."""

    grammar = some(
        [
            Accept,
            AcceptCharset,
            AcceptLanguage,
            Authorization,
            Expect,
            From,
            Host,
            IfMatch,
            ProxyAuthorization,
        ]
    )

#END MESSAGE_HEADER

##### REQ
class Request:
    """Whole request: the request line followed by an optional block of headers.

    The two parts are stored in the ``req_line`` and ``request_header`` attributes.
    """

    grammar = (
        attr("req_line", ReqLine),
        endl,
        attr("request_header", optional(RequestHeader)),
        endl,
    )
#### END REQ

In [249]:
# Sample request: a request line followed by four headers. Note the trailing
# space after "HTTP/1.1" and the space before the colon in "Host :".
to_parse = (
    "GET /hello/martin_le.html?boss#tinmar HTTP/1.1 \n"
    "Host : 192.168.1.1:443\n"
    "Accept-Charset: UTF-8 ;q=0.9, ISO-8859-5\n"
    "Accept-Language: en-us, fr-en ;q=0.2\n"
    "Expect: 100-Continue\n"
)

f = parse(to_parse, Request)

# Fourth parsed header (index 3), shown as the cell output.
f.request_header[3]

<__main__.Expect at 0x1a3e3096a00>

<__main__.Expect at 0x1a3e41a50a0>

In [180]:
GET /hello.htm HTTP/1.1
User-Agent: Mozilla/4.0 (compatible; MSIE5.01; Windows NT)
Host: www.tutorialspoint.com
Accept-Language: en-us
Accept-Encoding: gzip, deflate
Connection: Keep-Alive

SyntaxError: invalid syntax (<ipython-input-180-8033c460cb90>, line 1)