/
Blogger.hs
206 lines (183 loc) · 6.84 KB
/
Blogger.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts #-}
module Hakyll.Convert.Blogger
(FullPost(..), readPosts, distill)
where
import Control.Applicative
import Control.Arrow
import Control.Monad
import Data.Binary
import qualified Data.ByteString as B
import Data.Char
import Data.Data
import Data.Function
import Data.List
import Data.List
import Data.Maybe
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import Data.Time (UTCTime)
import Data.Time.Format (parseTimeM, formatTime, defaultTimeLocale)
import Hakyll.Core.Compiler
import Hakyll.Core.Item
import Hakyll.Web.Template.Context
import Text.Atom.Feed
import Text.Atom.Feed.Export
import Text.Atom.Feed.Import
import Text.XML.Light
import Hakyll.Convert.Common
-- | A post and its comments
data FullPost = FullPost
{ fpPost :: Entry
, fpComments :: [Entry]
, fpUri :: String
}
-- | An entry is assumed to be either a post, or a comment.
-- If it's a post, it should be associated with the URI
-- that visitors would use to read the post (on the old blog)
-- If it's a comment, it should be the URI for the corresponding
-- post.
data BloggerEntry =
Post { beUri_ :: String, beEntry :: Entry }
| Comment { beUri_ :: String, beEntry :: Entry }
| Orphan { beEntry :: Entry }
deriving (Show)
beUri :: BloggerEntry -> Maybe String
beUri (Orphan _) = Nothing
beUri (Post u _) = Just u
beUri (Comment u _) = Just u
-- ---------------------------------------------------------------------
-- Feed to helper type
-- ---------------------------------------------------------------------
-- | Returns only published posts
readPosts :: FilePath -> IO (Maybe [FullPost])
readPosts f = do
parseAtomDoc <$> B.readFile f
where
parseAtomDoc x =
select =<< parseXMLDoc (T.decodeUtf8 x)
select =
fmap (extractPosts . feedEntries) . elementFeed . deleteDrafts
-- has to be done on the XML level as our atom lib doesn't understand
-- the blogger-specific XML for drafts
deleteDrafts :: Element -> Element
deleteDrafts e =
e { elContent = filter isInnocent (elContent e) }
where
isInnocent (Elem e) = not (isDraft e)
isInnocent _ = True
isDraft :: Element -> Bool
isDraft e =
isJust $ findElement draft e
where
draft = QName
{ qName = "draft"
, qURI = Just "http://purl.org/atom/app#"
, qPrefix = Just "app"
}
-- | Warning: this silently ignores orphans, templates, settings
extractPosts :: [Entry] -> [FullPost]
extractPosts entries =
map toFullPost blocks
where
toFullPost (uri, entries) = FullPost
{ fpPost = post
, fpComments = comments
, fpUri = uri
}
where
post = case [ e | Post _ e <- entries ] of
[] -> huh "Block of entries with no post?!"
[p] -> p
ps -> huh "Block of entries with more than one post?!"
comments = [ e | Comment _ e <- entries ]
huh msg = error . unlines $ msg : map (txtToString . entryTitle . beEntry) entries
blocks = [ (u,xs) | (Just u, xs) <- blocks_ ] -- drop orphans
blocks_ = buckets beUri
$ map identifyEntry
$ filter isInteresting entries
-- | Contains actual meat (posts, comments; but not eg. templates)
isInteresting :: Entry -> Bool
isInteresting e =
not $ any isBoring cats
where
isBoring c = any (\t -> isBloggerCategoryOfType t c) ["settings", "template"]
cats = entryCategories e
-- | Tag an entry from the blogger feed as either being a post,
-- a comment, or an "orphan" (a comment without an associated post)
identifyEntry :: Entry -> BloggerEntry
identifyEntry e =
if isPost e
then case getLink "self" `mplus` getLink "alternate" of
Just l -> Post (postUrl l) e
Nothing -> entryError e oopsSelf
else case getLink "alternate" of
Just l -> Comment (postUrl l) e
Nothing -> Orphan e
where
isPost = any (isBloggerCategoryOfType "post") . entryCategories
postUrl = takeWhile (/= '?') . linkHref
getLink ty = case filter (isLink ty) $ entryLinks e of
[] -> Nothing
[x] -> Just x
xs -> entryError e (oopsLink ty)
isLink ty l = linkRel l == Just (Right ty) && linkType l == Just "text/html"
oopsSelf = "Was expecting blog posts to have a self link"
oopsLink ty = "Was expecting entries have at most one link of type " ++ ty
isBloggerCategory :: Category -> Bool
isBloggerCategory = (== Just "http://schemas.google.com/g/2005#kind")
. catScheme
isBloggerCategoryOfType :: String -- ^ \"comment\", \"post\", etc
-> Category
-> Bool
isBloggerCategoryOfType ty c =
isBloggerCategory c &&
catTerm c == "http://schemas.google.com/blogger/2008/kind#" ++ ty
-- ---------------------------------------------------------------------
--
-- ---------------------------------------------------------------------
distill :: FullPost -> DistilledPost
distill fp = DistilledPost
{ dpBody = body fpost
, dpUri = fpUri fp
, dpTitle = title fpost
, dpTags = tags fpost
, dpCategories = []
, dpDate = date fpost
}
where
fpost = fpPost fp
--
body = fromContent . entryContent
fromContent (Just (HTMLContent x)) = T.pack x
fromContent _ = error "Hakyll.Convert.Blogger.distill expecting HTML"
--
title p = case txtToString (entryTitle p) of
"" -> Nothing
t -> Just (T.pack t)
tags = map (T.pack . catTerm)
. filter (not . isBloggerCategory)
. entryCategories
date x = T.pack $
case parseTime' =<< entryPublished x of
Nothing -> "1970-01-01"
Just d -> formatTime' d
parseTime' d = msum $ map (\f -> parseTimeM True defaultTimeLocale f d)
[ "%FT%T%Q%z" -- with time zone
, "%FT%T%QZ" -- zulu time
]
formatTime' :: UTCTime -> String
formatTime' = formatTime defaultTimeLocale "%FT%TZ" --for hakyll
-- ---------------------------------------------------------------------
-- odds and ends
-- ---------------------------------------------------------------------
entryError e msg =
error $ msg ++ " [on entry " ++ entryId e ++ "]\n" ++ show e
buckets :: Ord b => (a -> b) -> [a] -> [ (b,[a]) ]
buckets f = map (first head . unzip)
. groupBy ((==) `on` fst)
. sortBy (compare `on` fst)
. map (\x -> (f x, x))