-
Notifications
You must be signed in to change notification settings - Fork 8
/
Main.hs
199 lines (160 loc) · 6.73 KB
/
Main.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
{-# LANGUAGE EmptyDataDecls #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeFamilies #-}
module Main where
import Data.RDF
import qualified Data.Text as T (Text, splitOn, unpack)
import Data.Text.Read (decimal)
import System.FilePath.Glob (glob)
import Control.Monad (liftM)
import Control.Monad.IO.Class (liftIO)
import Database.Persist
import Database.Persist.Sqlite
import Database.Persist.TH
type Year = Int
type Title = T.Text
type URI = T.Text
share [mkPersist sqlSettings, mkMigrate "migrateAll"] [persistLowerCase|
Book
gutId Int
UniqueGutId gutId
title [Title]
author [AuthorId]
toc [T.Text] Maybe
deriving Show
Author
gutId Int
UniqueAuthorGutId gutId
names [T.Text]
dob Year
dod Year
webpage URI
deriving Show
|]
testText :: FilePath
testText = "../gutenberg-meta/rdf-files/cache/epub/10885/pg10885.rdf"
testText2 :: FilePath
testText2 = "../gutenberg-meta/rdf-files/cache/epub/15233/pg15233.rdf"
-- This one has two authors.
testText3 :: FilePath
testText3 = "../gutenberg-meta/rdf-files/cache/epub/1423/pg1423.rdf"
parseRDFXML :: String -> IO (Either ParseFailure (RDF TList))
parseRDFXML = parseFile (XmlParser Nothing Nothing)
-- | Gets a value out of a LNode PlainL in the object of a triple,
-- Where the queryString is the verb.
-- For example: getPlains rdf "dcterms:title" should return the title.
getPlains :: RDF TList -> T.Text -> [T.Text]
getPlains rdf queryString = objects where
triples = getTriples rdf queryString
objects = map (getTitle . objectOf) triples
getTitle node = myTitle where (LNode (PlainL myTitle)) = node
-- | Gets a value out of a LNode PlainL in the object of a triple,
-- given some triples.
-- For example: getPlains triples "dcterms:title" should return the title.
getPlainsFromTriples :: Triples -> T.Text -> [T.Text]
getPlainsFromTriples triples queryString = [name | (LNode (PlainL name)) <- nameObjects] where
nameTriples = [nameTriple | nameTriple <- triples, predicateOf nameTriple == UNode queryString]
nameObjects = map objectOf nameTriples
-- | Gets an integer value out of a LNode TypedL in the object of a triple,
-- given some triples.
-- For example: getIntrFrom triples "pgterms:birthdate" should return the author birth date.
getIntsFromTriples :: Triples -> T.Text -> [Int]
getIntsFromTriples triples queryString =
map parseInt [name | (LNode (TypedL name "http://www.w3.org/2001/XMLSchema#integer")) <- nameObjects] where
nameTriples = [nameTriple | nameTriple <- triples, predicateOf nameTriple == UNode queryString]
nameObjects = map objectOf nameTriples
getURIs :: RDF TList -> T.Text -> [T.Text]
getURIs rdf verb = map getURI nodes where
nodes = getNodes rdf verb
getURI node = objectURI where (UNode objectURI) = node
getURIsFromTriples :: Triples -> T.Text -> [T.Text]
getURIsFromTriples triples verb = map getURI nodes where
nodes = map objectOf [t | t <- triples, predicateOf t == UNode verb]
getURI node = objectURI where (UNode objectURI) = node
getNodes :: RDF TList -> T.Text -> [Node]
getNodes rdf verb = map objectOf triples where
triples = getTriples rdf verb
getTriples :: RDF TList -> T.Text -> Triples
getTriples rdf verb = query rdf Nothing (Just (UNode verb)) Nothing
getTitles :: RDF TList -> [Title]
getTitles rdf = getPlains rdf "dcterms:title"
getTOCs :: RDF TList -> Maybe [T.Text]
getTOCs rdf = let parsed = getPlains rdf "dcterms:tableOfContents" in
case parsed of
[] -> Nothing
_ -> Just parsed
parseInt :: T.Text -> Int
parseInt str = case decimal str of
(Right num) -> fst num
(Left _) -> error "Couldn't parse the integer."
-- | Returns a triples set about each author.
getAuthorsInfo :: RDF TList -> [Triples]
getAuthorsInfo rdf = [query rdf nodes Nothing Nothing |
nodes <- map Just (getNodes rdf "dcterms:creator")]
mkAuthor :: Triples -> Author
mkAuthor ai = Author gutID name dob dod webpage where
gutID = getAuthorGutId ai
name = getAuthorNames ai
dob = getAuthorDOB ai
dod = getAuthorDOD ai
webpage = getAuthorWebpage ai
-- | Just get the author Gutenberg ID.
getAuthorGutIds :: RDF TList -> [Int]
getAuthorGutIds rdf = map (parseInt . getURIpath) $ getURIs rdf "dcterms:creator"
-- | Just get the author Gutenberg ID from a single author's triples.
getAuthorGutId :: Triples -> Int
getAuthorGutId triples = parseInt $ getURIpath agent where
(UNode agent) = subjectOf $ head triples
-- | Takes the triples of info related to a single author,
-- and returns a list of names for that author.
getAuthorNames :: Triples -> [T.Text]
getAuthorNames authorInfo = getPlainsFromTriples authorInfo "pgterms:name"
-- | Takes the triples of info related to a single author,
-- and returns a list of names for that author.
getAuthorDOB :: Triples -> Int
getAuthorDOB authorInfo = head $ getIntsFromTriples authorInfo "pgterms:birthdate"
-- | Takes the triples of info related to a single author,
-- and returns a list of names for that author.
getAuthorDOD :: Triples -> Int
getAuthorDOD authorInfo = head $ getIntsFromTriples authorInfo "pgterms:deathdate"
getAuthorWebpage :: Triples -> URI
getAuthorWebpage authorInfo = head $ getURIsFromTriples authorInfo "pgterms:webpage"
-- | Takes a path, like http://some-URL-here.com/path/to/the-thing and returns the-thing
getURIpath :: T.Text -> T.Text
getURIpath uri = last $ T.splitOn "/" uri
getID :: RDF TList -> Int
getID rdf = bookID where
triples = query rdf Nothing Nothing (Just (UNode "pgterms:ebook"))
ids = map (extractID . subjectOf) triples
extractID node = getURIpath idURI where
(UNode idURI) = node
bookID = case ids of
[_,_] -> error "More than one ID."
[x] -> read $ T.unpack x
_ -> error "No IDs or something wrong with the ID."
-- rdfFiles :: [FilePath]
rdfFiles = glob "../gutenberg-meta/**/*.rdf"
parseMeta :: FilePath -> IO ()
parseMeta file = do
parsed <- parseRDFXML file
let result = fromEither parsed
authorInfo = getAuthorsInfo result
authors = map mkAuthor authorInfo
-- print authors
runSqlite "test.db" $ do
runMigration migrateAll
-- FIXME: This won't work if there's an author that's already in the database.
authorIds <- mapM insert authors
liftIO $ print authorIds
-- testBookId <- insert $ Book (getID result) (getTitles result) authorIds (getTOCs result)
-- testBookResult <- selectList [BookId ==. testBookId] [LimitTo 1]
-- liftIO $ print testBookResult
main :: IO ()
main = do
parseMeta testText3