Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 194 lines (173 sloc) 6.998 kb
59697f5 Created comments and added bundler to satisfy install instructions.
Louis St-Amour authored
1 ###############################################################################
2 # #
3 # # # ### # # #
4 # ### # # ### ## ## # # ### ### # # ### ### ### #
5 # # # # # # # # # # # # ### ## # # # # # # # # # #
6 # # ### ### # # # ### ## # ### ### ### ### # ### #
7 # # ### # # #
8 # #
c3b6999 Mostly done, just have to clean up unique IDs and add sub-sections to TO...
Louis St-Amour authored
9 # journal2epub.rb 0.1 (c) 2011 Louis St-Amour, MIT Licensed (Expat) #
59697f5 Created comments and added bundler to satisfy install instructions.
Louis St-Amour authored
10 # Command-line tool to convert an issue of code4lib journal into epub format. #
11 # #
12 # Installation (ideally with rvm): gem install bundler && bundle install #
13 # #
c3b6999 Mostly done, just have to clean up unique IDs and add sub-sections to TO...
Louis St-Amour authored
14 # Usage: ruby journal2epub.rb --issue <number> #
15 # e.g. ruby journal2epub.rb --issue 12 will convert #
59697f5 Created comments and added bundler to satisfy install instructions.
Louis St-Amour authored
16 # http://journal.code4lib.org/issues/issue12 into 12.epub #
17 ###############################################################################
18
c3b6999 Mostly done, just have to clean up unique IDs and add sub-sections to TO...
Louis St-Amour authored
19 require 'open-uri'
20 require 'fileutils'
21 require 'digest/sha1'
22 require 'erb'
23
59697f5 Created comments and added bundler to satisfy install instructions.
Louis St-Amour authored
24 require 'rubygems'
25 require 'bundler/setup'
26
c3b6999 Mostly done, just have to clean up unique IDs and add sub-sections to TO...
Louis St-Amour authored
27 require 'nokogiri'
28 require 'trollop'
29 require 'mime/types'
30 require 'zip/zip'
31 require 'tidy_ffi'
32
# Thanks to Sam Stephenson, http://refactormycode.com/codes/281-given-a-hash-of-variables-render-an-erb-template
class Hash
  # Builds a Binding in which every key of this hash is a local variable
  # holding the corresponding value, e.g. for use with ERB#result.
  #
  # A singleton method +binding_for+ is defined on +object+ whose parameters
  # are named after the keys; calling it with the values yields the binding,
  # so +self+ inside that binding is +object+.
  #
  # object - receiver that hosts the generated method (default: a fresh Object).
  #
  # Returns a Binding.
  # Raises ArgumentError if a key is not a plain identifier. The keys are
  # spliced into evaluated source, so anything else would either be a syntax
  # error or run arbitrary code.
  def to_binding(object = Object.new)
    names = keys.map { |key| key.to_s }
    invalid = names.reject { |name| name =~ /\A[[:alpha:]_][[:alnum:]_]*\z/ }
    unless invalid.empty?
      listing = invalid.map { |name| name.inspect }.join(', ')
      raise ArgumentError, "cannot bind non-identifier keys: #{listing}"
    end

    object.instance_eval("def binding_for(#{names.join(",")}) binding end")
    object.binding_for(*values)
  end
end
40
# Parse the command line. --issue is the only option and defaults to "12".
opts = Trollop::options do
  version "journal2epub.rb 0.1 (c) 2011 Louis St-Amour, MIT Licensed (Expat)"
  banner <<-EOS
#{version}
Command-line tool to convert an issue of code4lib journal into epub format.

Usage: ruby journal2epub.rb [options]

Possible [options]
================== ========================
EOS
  opt :issue, "Issue number to download", :default => "12" #:type => :string
end
Trollop::die :issue, "is required" if opts[:issue].nil?
# The value is spliced into an XPath expression, a URL, the output file name
# and a shell command further down, so accept nothing but digits.
Trollop::die :issue, "must be a number" unless opts[:issue] =~ /\A\d+\z/
55
# Look up the publication date of the requested issue on the issue index page.
# The index is searched for an element whose <a> child text starts with
# "Issue <n>,"; the date is the second ", "-separated field of that element's
# text.
# NOTE(review): open-uri's Kernel#open for URLs is gone in Ruby 3.0+; switch
# to URI.open once the minimum supported Ruby allows it.
issues_url = 'http://journal.code4lib.org/issues'
issues_doc = Nokogiri(open(issues_url).read)
issues_doc.encoding = 'UTF-8'
issue_entry = issues_doc.xpath("//*[starts-with(a,'Issue #{opts[:issue]},')]").first
# Without this check an unknown issue number surfaces as NoMethodError on nil.
Trollop::die :issue, "#{opts[:issue]} is not listed at #{issues_url}" unless issue_entry
issue_date = issue_entry.text.split(", ")[1]
61
# Fetch the issue's own page and keep its #content element for scraping.
url = "http://journal.code4lib.org/issues/issue#{opts[:issue]}"
doc = Nokogiri(open(url).read)
doc.encoding = 'UTF-8'
content = doc.search("#content")

# Metadata for the whole issue. One entry per div.article goes into
# :articles, each starting with an empty :sections list; :files starts empty
# and is filled with image paths while the articles are converted.
issue = {
  :uri => url,
  :title => "#{content.search('h1').first.text}, Code4Lib Journal",
  :uid => "urn:uuid:#{Digest::SHA1.hexdigest(url)}", # Issues are unique per URL?
  :issn => '19405758',
  :now => Time.now.strftime("%Y-%m-%d"),
  :date => issue_date, # e.g. 2010-12-21
  :articles => content.search("div.article").map { |entry|
    entry_id = entry[:id]
    {
      :id => entry_id,
      :filename => "#{entry_id}.xhtml",
      :title => entry.search("h2").first.text,
      :url => entry.search("a").first[:href], # Luckily always exists, always absolute.
      :author => entry.search("p.author").first.text,
      :abstract => entry.search("div.abstract").first.inner_html.strip,
      :sections => [],
    }
  },
  :files => []
}
87
# Work on a throwaway copy of the epub skeleton: wipe any previous run's
# directory, then clone the template tree into it.
dir = 'tmp'
FileUtils.rm_rf(dir)
FileUtils.cp_r('epub_template', dir)
92
# Convert every article of the issue into a standalone XHTML file under
# OEBPS/, downloading the images it references into OEBPS/images/.

# The article template is identical for every article, so read it only once.
article_template = File.read("#{dir}/OEBPS/template.html")

# Downloads image_url into the working copy and records it in issue[:files].
# The local name is the running file count plus whatever follows the last dot
# of the URL. Returns the path relative to OEBPS/.
save_image = lambda do |image_url|
  image_filename = "images/#{issue[:files].length}.#{image_url.split('.')[-1]}"
  # Binary mode: image bytes must not pass through newline/encoding translation.
  File.open("#{dir}/OEBPS/#{image_filename}", 'wb') do |f|
    open(image_url) { |remote| f.write(remote.read) }
  end
  issue[:files] << image_filename
  image_filename
end

# I've no idea why Nokogiri is doing this, but it is, so we must gsub...
doctype_old = <<-EOS
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<?xml version="1.0" encoding="utf-8"??>
EOS
doctype_new = <<-EOS
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
EOS
html_old = <<-EOS
<html xmlns="http://www.w3.org/1999/xhtml" xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
EOS
html_new = <<-EOS
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
EOS

issue[:articles].each do |article|
  article_doc = Nokogiri(open(article[:url]).read)
  article_doc.encoding = 'UTF-8'
  div = (article_doc/"div.article").first
  # Drop the issue designation when the page has one.
  designation = (div/"#issueDesignation").first
  designation.remove if designation
  article[:html] = div.to_xhtml(:indent => 0, :encoding => 'UTF-8')

  # Render the article through the template, exposing the article's keys as
  # local variables, then let tidy normalise the markup.
  result = ERB.new(article_template).result(article.to_binding)
  tidy = TidyFFI::Tidy.new(result)
  tidy.options.drop_font_tags = true
  tidy.options.char_encoding = 'utf8'
  tidy.options.clean = true
  tidy.options.output_xhtml = true
  tidy.options.lower_literals = true
  tidy.options.numeric_entities = true
  tidy.options.drop_proprietary_attributes = true
  tidy.options.alt_text = ''
  tidy.options.doctype = 'strict'
  tidy.options.wrap = 0
  doc2 = Nokogiri(tidy.clean)
  doc2.encoding = 'UTF-8'
  #(doc2/'//meta[@name="generator"]').first.remove
  (doc2/"//html[@lang]").remove_attr('lang')
  (doc2/"//pre").remove_attr('name')
  (doc2/"//a").remove_attr('name')
  (doc2/"//*[@align]").remove_attr('align')
  (doc2/"//*[@width]").remove_attr('width')
  (doc2/"//*[@height]").remove_attr('height')

  # Linked images: fetch the link target and put the bare image in place of
  # the link. Images whose parent carries no http(s) href are left for the
  # next pass; the href comes from a remote page, so nothing else is handed
  # to open.
  (doc2/"a img").each do |img|
    link = img.parent
    image_url = link[:href]
    next unless image_url =~ %r{\Ahttps?://}
    img['src'] = save_image.call(image_url)
    link.replace(img)
  end

  # Remaining images that still point at a remote http source.
  (doc2/"img").each do |img|
    image_url = img[:src]
    next unless image_url =~ %r{^http://}
    img['src'] = save_image.call(image_url)
  end

  xhtml = doc2.to_xhtml(:indent => 0, :encoding => 'UTF-8')
  xhtml.gsub!(doctype_old, doctype_new)
  xhtml.gsub!(html_old, html_new)
  File.open("#{dir}/OEBPS/#{article[:filename]}", 'w') do |f|
    f.write xhtml
  end
end
167
# The article template is no longer needed; removing it from the working
# copy keeps it out of the archive built from that directory.
FileUtils.rm_f("#{dir}/OEBPS/template.html")

# Expand the ERB markup inside toc.ncx and content.opf, overwriting each file
# with its rendered output.
%w[toc.ncx content.opf].each do |name|
  filename = "#{dir}/OEBPS/#{name}"
  result = ERB.new(File.read(filename)).result
  File.open(filename, 'w') do |f|
    f.write result
  end
end
175
# Thank you mkepub, for inspiring the following rubyzip code -- Louis.
# Package the working copy as <issue>.epub, replacing any earlier output.
epubname = opts[:issue]+'.epub'
FileUtils.rm_f epubname

# The mimetype entry is written first and on its own. The block form closes
# the stream even when writing raises.
# NOTE(review): Zlib::NO_COMPRESSION sits in what looks like rubyzip's
# compression-method argument; confirm it selects "stored" in the bundled
# rubyzip version.
Zip::ZipOutputStream.open(epubname) do |os|
  os.put_next_entry("mimetype", nil, nil, Zlib::NO_COMPRESSION)
  os << "application/epub+zip"
end

# Add every regular file below the working copy under its path relative to
# that directory, skipping a top-level "mimetype" file since that entry has
# already been written. Leaving the block commits the changes and closes the
# archive.
Zip::ZipFile.open(epubname) do |zipfile|
  Dir["#{dir}/**/*"].each do |path|
    next if File.directory?(path)
    archive_path = path.sub("#{dir}/", "")
    next if archive_path == "mimetype"
    zipfile.add(archive_path, path)
  end
end
193
# Run epubcheck on the finished file and print its output. The file name is
# derived from a command-line argument, so escape it before it reaches the
# shell.
require 'shellwords'
puts `java -jar epubcheck-1.1/epubcheck-1.1.jar #{Shellwords.escape(epubname)}`
Something went wrong with that request. Please try again.