Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Newer
Older
100644 237 lines (216 sloc) 8.255 kB
59697f5 Created comments and added bundler to satisfy install instructions.
Louis St-Amour authored
1 ###############################################################################
2 # #
3 # # # ### # # #
4 # ### # # ### ## ## # # ### ### # # ### ### ### #
5 # # # # # # # # # # # # ### ## # # # # # # # # # #
6 # # ### ### # # # ### ## # ### ### ### ### # ### #
7 # # ### # # #
8 # #
c3b6999 Mostly done, just have to clean up unique IDs and add sub-sections to…
Louis St-Amour authored
9 # journal2epub.rb 0.1 (c) 2011 Louis St-Amour, MIT Licensed (Expat) #
59697f5 Created comments and added bundler to satisfy install instructions.
Louis St-Amour authored
10 # Command-line tool to convert an issue of code4lib journal into epub format. #
11 # #
12 # Installation (ideally with rvm): gem install bundler && bundle install #
13 # #
c3b6999 Mostly done, just have to clean up unique IDs and add sub-sections to…
Louis St-Amour authored
14 # Usage: ruby journal2epub.rb --issue <number> #
15 # e.g. ruby journal2epub.rb --issue 12 will convert #
59697f5 Created comments and added bundler to satisfy install instructions.
Louis St-Amour authored
16 # http://journal.code4lib.org/issues/issue12 into 12.epub #
17 ###############################################################################
18
c3b6999 Mostly done, just have to clean up unique IDs and add sub-sections to…
Louis St-Amour authored
19 require 'open-uri'
20 require 'fileutils'
21 require 'digest/sha1'
22 require 'erb'
23
59697f5 Created comments and added bundler to satisfy install instructions.
Louis St-Amour authored
24 require 'rubygems'
25 require 'bundler/setup'
26
c3b6999 Mostly done, just have to clean up unique IDs and add sub-sections to…
Louis St-Amour authored
27 require 'nokogiri'
28 require 'trollop'
29 require 'mime/types'
30 require 'zip/zip'
31 require 'tidy_ffi'
32
# Thanks to Sam Stephenson, http://refactormycode.com/codes/281-given-a-hash-of-variables-render-an-erb-template
class Hash
  # Builds a Binding in which every key of this hash is a local variable
  # holding the corresponding value (handy as the argument of ERB#result).
  #
  # object - receiver on which the helper singleton method +binding_for+ is
  #          defined; a fresh Object by default.
  #
  # Returns the Binding captured inside +binding_for+.
  # Raises ArgumentError when a key is not shaped like a local variable
  # name. Keys are spliced into evaluated source, so anything else would
  # either fail with an opaque SyntaxError or execute as code.
  def to_binding(object = Object.new)
    names = keys.map { |key| key.to_s }
    invalid = names.reject { |name| name =~ /\A[a-z_][A-Za-z0-9_]*\z/ }
    unless invalid.empty?
      raise ArgumentError, "cannot bind keys as local variables: #{invalid.inspect}"
    end
    # The generated method takes one parameter per key, so calling it with
    # the values turns each key/value pair into a local of that binding.
    object.instance_eval("def binding_for(#{names.join(',')}) binding end")
    object.binding_for(*values)
  end
end
40
# Parse the command line. The issue number stays a String (the default is a
# String) because it is concatenated into the issue URL, an XPath expression,
# the output file names and the epubcheck shell command further down.
opts = Trollop::options do
  version "journal2epub.rb 0.1 (c) 2011 Louis St-Amour, MIT Licensed (Expat)"
  banner <<-EOS
#{version}
Command-line tool to convert an issue of code4lib journal into epub format.

Usage: ruby journal2epub.rb [options]

Possible [options]
================== ========================
EOS
  opt :issue, "Issue number to download", :default => "12" #:type => :string
end
Trollop::die :issue, "is required" if opts[:issue].nil?
# Digits only: the value is later interpolated unescaped into XPath, a path
# and a backtick command, so anything else must be rejected up front.
Trollop::die :issue, "must be a number" unless opts[:issue].to_s =~ /\A\d+\z/
55
# Look up the requested issue on the index page. The matching element is the
# one whose <a> child text starts with "Issue <n>,"; the part of its text
# after the first ", " is used as the issue date.
issues_url = 'http://journal.code4lib.org/issues'
issues_doc = Nokogiri(open(issues_url).read)
issues_doc.encoding = 'UTF-8'
issue_entry = issues_doc.xpath("//*[starts-with(a,'Issue #{opts[:issue]},')]").first
# Fail with a readable message instead of a NoMethodError on nil when the
# index page has no entry for this issue.
abort "Issue #{opts[:issue]} is not listed at #{issues_url}" if issue_entry.nil?
issue_date = issue_entry.text.split(", ")[1]

# Fetch the issue's own page; everything of interest lives under #content.
url = "http://journal.code4lib.org/issues/issue#{opts[:issue]}"
doc = Nokogiri(open(url).read)
doc.encoding = 'UTF-8'
content = (doc/"#content")

# Metadata for the whole issue. :articles holds one hash per div.article on
# the issue page; :sections and :files are filled in while the articles are
# processed.
issue = {
  :uri      => url,
  :title    => (content/"h1").first.text + ', Code4Lib Journal',
  :uid      => 'urn:uuid:' + Digest::SHA1.hexdigest(url), # Issues are unique per URL?
  :issn     => '19405758',
  :now      => Time.now.strftime("%Y-%m-%d"),
  :date     => issue_date, # e.g. 2010-12-21
  :articles => (content/"div.article").map { |a|
    {
      :id       => a[:id],
      :filename => "#{a[:id]}.xhtml",
      :title    => (a/"h2").first.text,
      :url      => (a/"a").first[:href], # Luckily always exists, always absolute.
      :author   => (a/"p.author").first.text,
      :abstract => (a/"div.abstract").first.inner_html.strip,
      :sections => [],
    }
  },
  :files    => []
}
87
# Work on a throwaway copy of the epub skeleton: remove whatever an earlier
# run left in the working directory, then copy the template tree into it.
dir = 'tmp'
FileUtils.rm_rf(dir)
FileUtils.cp_r('epub_template', dir)
92
# Compile the per-article page template once; it is identical for every
# article, only the binding passed to #result changes.
article_template = ERB.new(File.read("#{dir}/OEBPS/template.html"))

# Downloads image_url into the OEBPS images folder under the next sequential
# name (count of files stored so far + the URL's last dot-suffix), records it
# in issue[:files] and returns the path relative to OEBPS.
store_image = lambda do |image_url|
  image_filename = "images/#{issue[:files].length}.#{image_url.split('.')[-1]}"
  # Binary mode: image bytes must not go through newline translation.
  File.open("#{dir}/OEBPS/#{image_filename}", 'wb') do |f|
    f.write open(image_url).read
  end
  issue[:files] << image_filename
  image_filename
end

# Convert every article of the issue into a cleaned-up XHTML file inside the
# working copy, collecting its section headings and images along the way.
issue[:articles].each do |article|
  article_doc = Nokogiri(open(article[:url]).read)
  article_doc.encoding = 'UTF-8'
  div = (article_doc/"div.article").first
  designation = (div/"#issueDesignation").first
  designation.remove if designation
  article[:html] = div.to_xhtml(:indent => 0, :encoding => 'UTF-8')

  # Render the article through the page template, then let tidy normalise
  # the markup into strict XHTML.
  result = article_template.result(article.to_binding)
  tidy = TidyFFI::Tidy.new(result)
  tidy.options.drop_font_tags = true
  tidy.options.char_encoding = 'utf8'
  tidy.options.clean = true
  tidy.options.output_xhtml = true
  tidy.options.lower_literals = true
  tidy.options.numeric_entities = true
  tidy.options.drop_proprietary_attributes = true
  tidy.options.alt_text = ''
  tidy.options.doctype = 'strict'
  tidy.options.wrap = 0
  doc2 = Nokogiri(tidy.clean)
  doc2.encoding = 'UTF-8'
  #(doc2/'//meta[@name="generator"]').first.remove
  (doc2/"//html[@lang]").remove_attr('lang')
  (doc2/"//pre").remove_attr('name')
  (doc2/"//a").remove_attr('name')

  # Point links to other articles of this issue at the local xhtml files.
  (doc2/"//a[@href]").each do |link|
    if link[:href] =~ %r{^/articles/}
      link['href'] = "http://journal.code4lib.org#{link[:href]}"
    end
    next unless link[:href] =~ %r{^http://journal.code4lib.org/articles/}
    # Compare without the fragment, otherwise a link such as
    # ".../articles/123#note1" can never equal the article URL; re-attach
    # the fragment with its '#' separator afterwards.
    target, fragment = link[:href].split('#', 2)
    linked = issue[:articles].find { |a| a[:url] == target }
    next if linked.nil?
    link['href'] = linked[:filename]
    link['href'] += "##{fragment}" unless fragment.nil? || fragment.empty?
  end

  (doc2/"//*[@align]").remove_attr('align')
  (doc2/"//*[@width]").remove_attr('width')
  (doc2/"//*[@height]").remove_attr('height')

  # Drop whitespace-only <pre> blocks and collapse runs of blank lines at
  # line starts in the others.
  doc2.xpath('//pre').each do |pre|
    if pre.text.strip.empty?
      pre.remove
    else
      pre.inner_html = pre.inner_html.gsub(%r{^\n+}, "\n")
    end
  end

  # Build the article's section tree: each h2 (or the very first heading,
  # whatever its level) opens a section, later h3s nest under the most
  # recent section. Headings without an id get a generated one.
  headings = doc2.xpath("//h2|//h3")
  puts article[:url]
  headings.each_with_index do |heading, j|
    heading['id'] ||= "epubsection#{j+1}"
    if heading.name.downcase == 'h2' || article[:sections].empty?
      article[:sections] << {
        :title => heading.text,
        :id => heading['id'],
        :sections => []
      }
    else
      article[:sections].last[:sections] << {
        :title => heading.text,
        :id => heading['id']
      }
    end
  end

  # Images whose parent links to an upload on the journal site: embed the
  # linked file and replace the link element with the bare image.
  (doc2/'a img').each do |img|
    image_url = img.parent[:href]
    if image_url =~ %r{^http://journal.code4lib.org/wp-content/uploads/}
      img['src'] = store_image.call(image_url)
      img.parent.replace(img)
    end
  end

  # Any image still pointing at an http:// source is downloaded as well.
  (doc2/"img").each do |img|
    image_url = img[:src]
    if image_url =~ %r{^http://}
      img['src'] = store_image.call(image_url)
    end
  end

  # I've no idea why Nokogiri is doing this, but it is, so we must gsub...
  doctype_old = <<-EOS
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<?xml version="1.0" encoding="utf-8"??>
  EOS
  doctype_new = <<-EOS
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
  EOS
  html_old = <<-EOS
<html xmlns="http://www.w3.org/1999/xhtml" xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
  EOS
  html_new = <<-EOS
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
  EOS
  xhtml = doc2.to_xhtml(:indent => 0, :encoding => 'UTF-8')
  xhtml.gsub!(doctype_old, doctype_new)
  xhtml.gsub!(html_old, html_new)
  File.open("#{dir}/OEBPS/#{article[:filename]}", 'w') do |f|
    f.write xhtml
  end
end
208
# The article template is no longer needed; remove it from the working copy
# so it is not picked up when the directory is zipped.
FileUtils.rm_f("#{dir}/OEBPS/template.html")

# Expand toc.ncx and content.opf in place. #result is called without a
# binding argument, so the templates are evaluated with ERB's default one.
%w[toc.ncx content.opf].each do |name|
  path = "#{dir}/OEBPS/#{name}"
  rendered = ERB.new(File.read(path)).result
  File.open(path, 'w') { |out| out.write(rendered) }
end
216
# Thank you mkepub, for inspiring the following rubyzip code -- Louis.
epubname = opts[:issue]+'.epub'
FileUtils.rm_f epubname

# Create the archive with "mimetype" as its first entry, stored without
# compression. The block form closes the stream even if writing fails.
Zip::ZipOutputStream.open(epubname) do |os|
  os.put_next_entry("mimetype", nil, nil, Zlib::NO_COMPRESSION)
  os << "application/epub+zip"
end

# Re-open the archive and add every regular file from the working copy,
# skipping the mimetype file that was already written above. Leaving the
# block closes the archive, which commits the added entries.
Zip::ZipFile.open(epubname) do |zipfile|
  Dir["#{dir}/**/*"].each do |path|
    archive_path = path.sub("#{dir}/", "")
    next if File.directory?(path) || archive_path == "mimetype"
    zipfile.add(archive_path, path)
  end
end
234
cbaf9f9 Works, and output every issue from 1 to 12\!
Louis St-Amour authored
# Run epubcheck against the generated file and store its output (stderr is
# redirected into stdout) in "<issue>.log".
File.open("#{opts[:issue]}.log", 'w') do |log|
  report = `java -jar epubcheck-1.1/epubcheck-1.1.jar #{epubname} 2>&1`
  log.write(report)
end
Something went wrong with that request. Please try again.