-
Notifications
You must be signed in to change notification settings - Fork 3
/
foo.py
163 lines (127 loc) · 4.74 KB
/
foo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import json
import os
import re
import shutil
import subprocess
import sys

from lxml import etree

from common import *
# Pages under any of these namespaces are excluded from conversion.
ignore_namespaces = [
    'Category', 'Dev', 'File', 'Help', 'MediaWiki',
    'Panda3D Manual', 'Panda3D Wiki', 'Talk',
    'Template', 'User talk', 'User',
]

# Make sure the output directory for converted pages exists.
if not os.path.isdir('pages'):
    os.mkdir('pages')
# Parse the MediaWiki XML dump whose path is given on the command line.
with open(sys.argv[1]) as f:
    root = etree.parse(f)

# XML namespace of the MediaWiki export format; used by every xpath below.
NS = dict(e="http://www.mediawiki.org/xml/export-0.6/")
pages = root.xpath("//e:page", namespaces=NS)

paths = set()     # output paths already written, to catch duplicate pages
num_errors = 0    # pages whose conversion subprocess returned non-zero
num_images = 0    # images copied next to converted pages

# Catalogue all images in the images dir, keyed by their transformed name.
all_images = {}
for image in os.listdir('manual-images'):
    name = transform_title(image)
    # Two files must not collapse to the same transformed name; use an
    # explicit raise rather than assert, which is stripped under `python -O`.
    if name in all_images:
        raise ValueError("duplicate image name: %s" % (name,))
    all_images[name] = os.path.join('manual-images', image)
# The dump's first <page> is expected to be the wiki's main page.
main_page = pages.pop(0)
assert main_page.xpath("e:title/text()", namespaces=NS)[0] == 'Main Page'

# Use the newest revision's text to build the table-of-contents tree,
# and persist it for later passes.
t = main_page.xpath(".//e:text/text()", namespaces=NS)[-1]
parse_toc_tree(t.strip())
write_toc_tree('toctree.json')

# Emit the RST index page: a title header followed by a toctree directive
# listing the main page's direct children.
with open("source/index.rst", "wb") as index:
    header = [
        b'Table of Contents\n',
        b'=================\n',
        b'\n',
        b'.. toctree::\n',
        b' :titlesonly:\n\n',
    ]
    index.writelines(header)
    for entry in get_page_children('Main Page') or []:
        index.writelines([b' ' + entry.encode('utf-8'), b'\n'])
# Map redirect page titles to their targets.  These pages are skipped by
# the conversion loop below, and the mapping is saved for link fix-ups.
redirects = {}
for page in pages:
    title = page.xpath("e:title/text()", namespaces=NS)[0]
    t = page.xpath(".//e:text/text()", namespaces=NS)
    if t:
        # Take the last (newest) revision.
        t = t[-1].strip()
        # A redirect page's body is "#REDIRECT [[Target Title]]".
        if t and t.upper().startswith('#REDIRECT'):
            target = t.split(' ', 1)[-1].strip('[]')
            redirects[title] = target

# Store the redirects to disk.  Use a context manager so the file handle
# is closed deterministically (the original leaked the open file).
with open('redirects.json', 'w') as rf:
    json.dump(redirects, rf)
# Convert every remaining page to RST by piping its wikitext through the
# external ./convert.py script.
for i, page in enumerate(pages):
    # Progress percentage for the console display.  Guard the divisor:
    # with exactly one page, len(pages) - 1 would be zero.
    progress = (100 * i) // max(len(pages) - 1, 1)
    title = page.xpath("e:title/text()", namespaces=NS)[0]

    if title in redirects:
        # Redirects were catalogued earlier; nothing to convert.
        continue

    if ':' in title:
        # Skip pages that live under an excluded namespace.
        namespace = title.split(':', 1)[0]
        if namespace in ignore_namespaces:
            continue

    transformed = transform_title(title)
    path = get_page_path(title)
    if not path:
        # Not in table of contents. Skip.
        continue

    t = page.xpath(".//e:text/text()", namespaces=NS)
    if t:
        # Take the last (newest) revision.
        t = t[-1].strip()
    if not t:
        # Ignore empty page.
        print("Ignoring empty page %s" % (path))
        continue

    # Ensure there are no two non-redirect pages with the same name.
    # Explicit raise instead of assert (asserts vanish under `python -O`).
    if path in paths:
        raise AssertionError("duplicate page path: %s" % (path,))
    paths.add(path)

    # Make sure the parent directory exists (exist_ok avoids a
    # check-then-create race and also covers a missing 'source' dir).
    parent = 'source'
    if '/' in path:
        parent = "source/{}".format(os.path.dirname(path))
    os.makedirs(parent, exist_ok=True)

    # Copy every image referenced on this page next to its RST file.
    for image in re.findall(r'[[][[]Image:([^|\]]+)[|\]]', t):
        name = transform_title(image)
        source = all_images.get(name)
        if not source:
            # BUG FIX: the original printed `source` here, which is always
            # None on this path; report the missing image's name instead.
            print("\nWarning: missing image %s" % (name))
            continue
        target = os.path.join(parent, name)
        shutil.copyfile(source, target)
        num_images += 1

    with open("source/{}.rst".format(path), "wb") as f:
        print("\x1b[1Fconverting [%+3s%%] \x1b[1m%s\x1b[m\x1b[K" % (progress, path))
        # Write an anchor so other pages can cross-reference this one, and
        # flush so the subprocess's stdout lands after it in the file.
        f.write(".. _{}:\n\n".format(transformed).encode('utf-8'))
        f.flush()
        handle = subprocess.Popen(["./convert.py", "-"],
                                  stdin=subprocess.PIPE, stdout=f)
        # Prepend a first-level header containing the page title.
        data = "= {} =\n".format(title.replace('CXX', 'C++')).encode("utf-8")
        data += t.encode("utf-8")
        handle.communicate(data)
        if handle.returncode != 0:
            print()
            num_errors += 1
        # If this page has children, append a toctree linking to them.
        children = get_page_children(title)
        if children:
            f.write(b'\n\n.. toctree::\n')
            f.write(b' :maxdepth: 2\n')
            f.write(b'\n')
            for child in children:
                f.write(b' ' + (child.encode('utf-8')))
                f.write(b'\n')

print("Wrote %s files to source/ (%d had errors). %d images copied." % (len(paths), num_errors, num_images))