In [149]:
import json
import os
import re

import markdownify

In [150]:
# Path of the JSON export to convert.
filename = "data.json"

# Only the first entry of the top-level "db" list is used; it carries the
# export metadata and the actual records side by side.
with open(filename, encoding="utf-8") as file:
    export_db = json.load(file)["db"][0]

meta = export_db['meta']
data = export_db['data']

In [151]:
# Post records stored under the export's "posts" key.
posts = data["posts"]

In [152]:
from collections import defaultdict

# tag id -> tag name
tags = {}
for tag_row in data["tags"]:
    tags[tag_row['id']] = tag_row['name']

# post id -> list of tag names, resolved through the posts_tags link records
posts_tags = data["posts_tags"]
tag_of_posts = defaultdict(list)
for link in posts_tags:
    tag_name = tags[link["tag_id"]]
    tag_of_posts[link["post_id"]].append(tag_name)


In [153]:


def clean_post(post):
    """Reduce a raw post record to the fields needed to build an MDX file.

    The markdown body is taken from the first mobiledoc card when that card
    carries a ``markdown`` payload; otherwise the post's rendered ``html`` is
    converted with markdownify.

    Args:
        post: dict with at least the keys ``id``, ``mobiledoc``, ``html``,
            ``title``, ``slug``, ``published_at`` and ``comment_id``.

    Returns:
        dict with the keys ``tags``, ``markdown``, ``title``, ``slug``,
        ``published_at`` and ``comment_id``.

    Raises:
        KeyError: if one of the copied fields is missing from ``post``.
    """
    cleaned_post = {}
    # .get() instead of indexing: tag_of_posts is a defaultdict, and indexing
    # it would insert an empty entry for every untagged post.
    cleaned_post['tags'] = tag_of_posts.get(post['id'], [])
    try:
        cleaned_post['markdown'] = json.loads(post['mobiledoc'])['cards'][0][1]['markdown']
    except (KeyError, IndexError, TypeError, ValueError):
        # KeyError/IndexError: no cards, or the first card has no markdown.
        # TypeError: mobiledoc is None (json.loads rejects non-string input)
        #            or the card payload is not a mapping.
        # ValueError: mobiledoc is not valid JSON (json.JSONDecodeError).
        # A missing html body is treated as an empty document.
        cleaned_post['markdown'] = markdownify.markdownify(post['html'] or "")

    # Fields copied through unchanged.
    contains = ['title', 'slug', 'published_at', 'comment_id']
    for key in contains:
        cleaned_post[key] = post[key]

    return cleaned_post

In [154]:
# Inspect which fields a raw post record carries.
posts[0].keys()

dict_keys(['id', 'uuid', 'title', 'slug', 'mobiledoc', 'html', 'comment_id', 'plaintext', 'feature_image', 'featured', 'type', 'status', 'locale', 'visibility', 'created_at', 'updated_at', 'published_at', 'custom_excerpt', 'codeinjection_head', 'codeinjection_foot', 'custom_template', 'canonical_url', 'email_recipient_filter', 'newsletter_id', 'lexical'])

In [155]:
# Check which fields remain after clean_post().
clean_post(posts[0]).keys()

dict_keys(['tags', 'markdown', 'title', 'slug', 'published_at', 'comment_id'])

In [156]:
def build_mdx(post):
    """Render a cleaned post as MDX text: YAML front matter plus markdown body.

    Args:
        post: dict with the keys ``title``, ``slug``, ``tags``,
            ``published_at`` and ``markdown``.

    Returns:
        str: the complete file content, ending with a newline.
    """
    # The title is emitted as a YAML single-quoted scalar, where a literal
    # single quote must be written as two single quotes; without this a title
    # such as "It's here" produces invalid front matter.
    title = str(post['title']).replace("'", "''")
    mdx = f"""---
title: '{title}'
slug: {post['slug']}
tags: {post['tags']}
date: '{post['published_at']}'
---

{post['markdown']}
"""
    return mdx

In [157]:
# Create the output directory if it is missing; exist_ok avoids the separate
# existence check.
os.makedirs("output", exist_ok=True)

# Write one <slug>.mdx file per post.
for post in posts:
    clean = clean_post(post)

    # The export is read as UTF-8, so write UTF-8 explicitly instead of
    # relying on the platform default encoding.
    with open(f"output/{clean['slug']}.mdx", "w", encoding="utf-8") as file:
        file.write(build_mdx(clean))


In [158]:
import re


def replace_cloudinary():
    """Rewrite Cloudinary image URLs in every file under ``output/``.

    Each URL of the form
    ``https://res-<digit>.cloudinary.com/hpoocidos/image/upload/q_auto/v1/ghost-blog-images/<name>``
    that is directly followed by ``)`` is replaced by
    ``/static/images/cloudinary/<name>``. Files without such a URL are left
    untouched on disk.
    """
    # Raw string so the regex escapes are not read as string escapes. The
    # image name stops at the first ")" or whitespace; a greedy ".+" would run
    # to the last ")" on the line and hide any further image on that line.
    regex = re.compile(
        r"(https://res-\d\.cloudinary\.com/hpoocidos/image/upload/q_auto/v1/ghost-blog-images/([^)\s]+))\)"
    )

    for file in os.listdir("output"):
        path = os.path.join("output", file)
        # Read and write as UTF-8 explicitly rather than the platform default.
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()

        groups = regex.findall(text)
        if not groups:
            continue

        for url, image in groups:
            text = text.replace(url, f"/static/images/cloudinary/{image}")

        with open(path, "w", encoding="utf-8") as f:
            f.write(text)
# Apply the Cloudinary URL rewrite to the files currently in output/.
replace_cloudinary()

In [None]:
def replace_ghost():
    """Rewrite ``__GHOST_URL__`` references in every file under ``output/``.

    Image URLs of the form ``__GHOST_URL__/content/images/YYYY/MM/<name>``
    that are directly followed by ``)`` become
    ``/static/images/ghost/YYYY/MM/<name>``. Every remaining ``__GHOST_URL__``
    placeholder is then replaced by ``https://digital-manual.xyz``. Each file
    is written back whether or not anything changed.
    """
    # Raw string so the regex escapes are not read as string escapes. The
    # image path stops at the first ")" or whitespace; a greedy ".+" would run
    # to the last ")" on the line and hide any further image on that line.
    regex = re.compile(r"(__GHOST_URL__/content/images/(\d\d\d\d/\d\d/[^)\s]+))\)")

    for file in os.listdir("output"):
        path = os.path.join("output", file)
        # Read and write as UTF-8 explicitly rather than the platform default.
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()

        for url, image in regex.findall(text):
            text = text.replace(url, f"/static/images/ghost/{image}")

        # str.replace returns a new string; the result must be assigned back
        # or the placeholder substitution is silently lost.
        text = text.replace("__GHOST_URL__", "https://digital-manual.xyz")

        with open(path, "w", encoding="utf-8") as f:
            f.write(text)
# Apply the __GHOST_URL__ rewrite to the files currently in output/.
replace_ghost()