forked from ntoll/ntoll.github.io
-
Notifications
You must be signed in to change notification settings - Fork 0
/
migrate.py
77 lines (66 loc) · 2.29 KB
/
migrate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python
"""
Hacky script that munges the dodgy HTML generated by my old textpattern blog
and turns it into something "nicer" while also generating the JSON file used to
build the homepage and articles page.
"""
from bs4 import BeautifulSoup
from bs4.element import Comment, Tag
from os import listdir
from os.path import isfile, join
import json
from datetime import datetime
# Directory holding the HTML exported from the old blog, and the directory
# the regenerated article templates are written to.
path_old = 'old_articles'
path_new = 'site/templates/articles'

# Only regular files ending in ".html" are articles. Anything else in the
# directory (editor backups, OS metadata files, ...) carries no header
# comments and could not be re-opened later as "<slug>.html" anyway.
html_suffix = '.html'
files = [
    name for name in listdir(path_old)
    if name.endswith(html_suffix) and isfile(join(path_old, name))
]

# One dict per article with the keys "title", "slug" and "date".
articles = []
for name in files:
    # Read the whole file and release the handle straight away.
    with open(join(path_old, name)) as raw:
        html = raw.read()

    # NOTE(review): no parser is named here, so BeautifulSoup picks whichever
    # one is installed; consider pinning one once the output has been checked.
    soup = BeautifulSoup(html)

    # The metadata lives in top-level HTML comments: the first one carries
    # "#title:" and the second one "#date:".
    tags = [item for item in soup.contents if isinstance(item, Comment)]
    if len(tags) < 2:
        raise ValueError(
            '%s: expected "#title:" and "#date:" header comments' % name)

    articles.append({
        'title': tags[0].replace('#title:', ''),
        # Strip only the trailing extension so the slug always maps back to
        # the file it came from, even if ".html" occurs elsewhere in the name.
        'slug': name[:-len(html_suffix)],
        'date': tags[1].replace('#date:', ''),
    })
# Newest first. The dates are compared as plain strings, which matches
# chronological order only for a fixed-width, most-significant-first format
# (presumably "YYYY-MM-DD HH:MM:SS" - confirm against the article headers).
articles = sorted(articles, key=lambda article: article['date'], reverse=True)

# Write the index that the homepage and articles page are built from. The
# ``with`` block closes the file even if serialisation fails part-way.
with open('articles.json', 'w') as output:
    json.dump(articles, output, indent=2)
# Skeleton of every generated article page. The doubled "%%" survive the
# %-formatting below as single "%" characters, producing "{% ... %}" tags.
template = u"""{%% extends "base.html" %%}
{%% block content %%}
<h1>%(title)s</h1>
<p class="published_on">%(published)s</p>
%(content)s
{%% endblock content %%}"""

# (old, new) substitutions applied, in order, to each article body:
# typographic quotes and dashes become plain ASCII, and asset URLs are moved
# under /static/.
# NOTE(review): the original chain listed three of the quote replacements
# twice, which is a no-op. The repeats may once have been HTML-entity
# spellings of the same characters - confirm against the real articles.
replacements = (
    ('’', "'"),
    ('‘', "'"),
    ('“', '"'),
    ('”', '"'),
    ('–', '-'),
    ('/images/', '/static/images/'),
    ('/files/', '/static/files/'),
)

for art in articles:
    filename = art['slug'] + '.html'

    with open(join(path_old, filename)) as raw:
        lines = raw.readlines()

    # Drop the first three lines (presumably the header comments - confirm).
    # readlines() keeps each line's terminator, so the lines are joined with
    # an empty separator; joining with '\n' would double every line break.
    content = ''.join(lines[3:])
    for old, new in replacements:
        content = content.replace(old, new)

    # Human-readable publication date, e.g. "Monday 03 March 2014 (09:15AM)".
    date = datetime.strptime(art['date'], '%Y-%m-%d %H:%M:%S')
    art['published'] = date.strftime('%A %d %B %Y (%I:%M%p)')
    art['content'] = content

    # ``with`` guarantees the new template is flushed and closed even if
    # rendering or writing raises.
    with open(join(path_new, filename), 'w') as x:
        x.write(template % art)