### Note
Before running this script, install the "pathvalidate" module, along with the other packages listed under Install below.

see: https://pypi.org/project/pathvalidate/#installation-pip

Install

- pip install pathvalidate | https://pypi.org/project/pathvalidate/
- pip install Unidecode | https://pypi.org/project/Unidecode/
- pip install beautifulsoup4 | https://pypi.org/project/beautifulsoup4/

In [1]:
import os
import shutil
import gzip
import io
import json
import base64
from zipfile import ZipFile

# Path & file name sanitizer
from pathvalidate import sanitize_filename

# Unidecode
import unidecode

# Beautiful Soup for HTMLParsing
try: 
    from BeautifulSoup import BeautifulSoup
except ImportError:
    from bs4 import BeautifulSoup

In [2]:
# FUNCTIONS -------------------------------

# Function found on Gist 😸 --------------------
# https://gist.github.com/Garrett-R/dc6f08fc1eab63f94d2cbb89cb61c33d
# This takes the GZIPed bytes from the files and returns the unzipped JSON as a string
def gunzip_bytes_obj(bytes_obj):
    """Decompress gzip-compressed bytes and return the result as text.

    Args:
        bytes_obj: The raw gzip-compressed payload.

    Returns:
        The decompressed payload decoded with the default codec (UTF-8).
    """
    # GzipFile needs a file-like object, so wrap the payload in a buffer.
    compressed = io.BytesIO(bytes_obj)
    with gzip.GzipFile(fileobj=compressed, mode='rb') as gz_stream:
        return gz_stream.read().decode()

# Clobber a string into a filename -------------
def make_filename(string):
    """Turn an arbitrary title into a safe, ASCII-only file name.

    The text is transliterated to ASCII with ``unidecode``, spaces become
    hyphens, a fixed set of punctuation characters is dropped, and the result
    is finally passed through ``pathvalidate.sanitize_filename``.

    Args:
        string: The text to convert (e.g. a recipe name).

    Returns:
        The sanitized name, without any file extension.
    """
    string = unidecode.unidecode(string)
    string = string.replace(" ", "-")
    # The backslash is escaped explicitly: a bare "\|" is an invalid escape
    # sequence that newer Python versions warn about.
    invalid = '<>:"/\\|?* ,()“”‘’\''
    # Drop every listed character in a single pass.
    string = string.translate(str.maketrans('', '', invalid))
    return sanitize_filename(string)


In [3]:
# SET SCRIPT VARIABLES

# Central configuration: every path the notebook reads from or writes to.
# All entries are relative paths.
myVars = {
    # Output locations.
    'paths': {
        'paprika_export': './',
        # One Markdown file per recipe is written here.
        'paprika_jekyll': '../_recipes/',
        # One JSON file per recipe is written here.
        'paprika_jsondump': '../_data/recipes/',
        'jekyll_data': '../_data/',
        # Images are copied to "images/recipes/" below this directory.
        'jekyll_root': '../',
    },
    # Input locations.
    'sources': {
        # Archive of gzip-compressed JSON recipes.
        'paprika_export': 'data/My Recipes.paprikarecipes',
        # Zipped HTML export.
        'paprika_h_zip': 'data/My Recipes.zip',
        # Unzipped HTML export: per-recipe .html files plus an "Images/" folder.
        'paprika_html': 'data/My Recipes/Recipes/',
    },
}
# Learning to Pretty Print DICTs. I like to see the data I am sculpting with
#pp = pprint.PrettyPrinter(indent=4)
#pp.pprint(myVars)


In [4]:
# Make sure every output directory exists before anything is written to it.
# The Jekyll data directory is listed before the JSON dump directory because
# the latter is configured as a sub-directory of it.
for dirLabel, dirKey in (
    ("Recipes Markdown", 'paprika_jekyll'),
    ("Jekyll Data", 'jekyll_data'),
    ("Recipes JSON", 'paprika_jsondump'),
):
    dirPath = myVars['paths'][dirKey]
    if os.path.exists(dirPath):
        print(dirLabel + " Directory already exists\n")
    else:
        # makedirs also creates missing parent directories, and exist_ok
        # tolerates the directory appearing between the check and the call.
        os.makedirs(dirPath, exist_ok=True)

# UNZIP HTML Export --------------------------
# Unzipping an archive with Python is a PITA when filenames have UTF-8 chars in them
# Best we unzip manually with macOS.
# IF we haven't unzipped the "My Recipes.zip" source yet, do so
#if not os.path.exists(myVars['sources']['paprika_html']):
#    with ZipFile(myVars['sources']['paprika_h_zip'], 'r') as zip_ref:
#        zip_ref.extractall('./data/')


Recipes Markdown Directory already exists

Jekyll Data Directory already exists

Recipes JSON Directory already exists



In [5]:
# IMAGES -------------------------------------
# Copy the images out of the unzipped My Recipes dir to somewhere Jekyll can pick them up.

# Source is the "Images/" folder of the unzipped HTML export; destination is
# "images/recipes/" under the Jekyll root.
imagesSrcDir = "{}Images/".format(myVars['sources']['paprika_html'])
imagesDstPth = "{}images/".format(myVars['paths']['jekyll_root'])
imagesDstDir = "{}recipes/".format(imagesDstPth)

if not os.path.exists(imagesSrcDir):
    print("Images folder already moved to Jekyll Root")
else:
    # Remove any previous copy first: copytree, as called here, refuses to
    # write into a destination directory that already exists.
    if os.path.exists(imagesDstDir):
        try:
            shutil.rmtree(imagesDstDir)
        except OSError as err:
            print("Error: %s : %s" % (imagesDstDir, err.strerror))
    moveReturn = shutil.copytree(imagesSrcDir, imagesDstDir)
    print("Destination path:", moveReturn)

Destination path: ../images/recipes/


In [6]:
# Convert every recipe in the Paprika export archive into two files: a Jekyll
# Markdown stub (front matter only) and a pretty-printed JSON data file.
with ZipFile(myVars['sources']['paprika_export'], 'r') as zipObj:
    for filename in zipObj.namelist():
        # Each archive member is gunzipped and then parsed as JSON. Reading
        # the member in one call means no member handle stays open while the
        # recipe is being processed.
        data = zipObj.read(filename)
        data_json = gunzip_bytes_obj(data)
        # https://docs.python.org/3/library/json.html#basic-usage
        d = json.loads(data_json)

        # FILENAME ------------------------------
        # The "created on" date of the recipe in Paprika.
        created = d["created"]
        # Cleaner name for the Markdown/JSON output files (no spaces, etc.).
        fileName = make_filename(d["name"])
        mdFilePath = myVars['paths']['paprika_jekyll'] + fileName + ".md"

        # MARKDOWN --------------------------------
        # Only YAML front matter is written; no Markdown body is generated.
        # Free-text values are emitted as JSON string literals, which YAML
        # reads as double-quoted scalars, so a name containing ": ", quotes
        # or a leading indicator character cannot break the front matter.
        quotedName = json.dumps(d["name"], ensure_ascii=False)
        output = "---\n"
        output += "layout: recipe\n"
        output += "title: " + quotedName + "\n"
        output += "name: " + quotedName + "\n"
        # "created" is deliberately kept as a plain (unquoted) scalar.
        output += "created: " + created + "\n"
        output += "filename: " + json.dumps(fileName, ensure_ascii=False) + "\n"
        output += "---\n"

        # Explicit UTF-8 so the result does not depend on the locale's
        # default encoding; "with" guarantees the file is closed.
        with open(mdFilePath, 'w', encoding='utf-8') as mdFile:
            mdFile.write(output)

        # HTML -----------------------------------
        # Put selected sections of the matching HTML export into the JSON.
        htmlFilePath = myVars['sources']['paprika_html'] + sanitize_filename(d["name"]) + ".html"

        if os.path.exists(htmlFilePath):
            # Opened in binary mode so BeautifulSoup detects the document's
            # encoding itself instead of relying on the locale default.
            # NOTE(review): no parser is named, so bs4 picks whichever is
            # installed - consider pinning one (e.g. 'html.parser').
            with open(htmlFilePath, 'rb') as htmlfile:
                parsed_html = BeautifulSoup(htmlfile)
            # NOTE(review): a section whose <div> is absent is stored as the
            # string 'None', because str() is applied to find()'s result.
            recipeHTML = {
                section: str(parsed_html.body.find('div', attrs={'class': section}))
                for section in ('ingredients', 'directions', 'nutrition', 'notes')
            }
            d['html'] = recipeHTML
        else:
            print("FileName Error: " + htmlFilePath + "\n")

        # JSON ------------------------------------
        # Drop the embedded photo data and hashes; they are not needed in the
        # dump. pop() with a default tolerates recipes that lack a key.
        for unwantedKey in ('photo_data', 'hash', 'photo_hash'):
            d.pop(unwantedKey, None)

        if d.get("photos") is not None:
            for embphoto in d["photos"]:
                embphoto.pop('data', None)
                embphoto.pop('hash', None)
        # Need to explicitly set this to False if it is empty because
        # Jekyll Liquid's truthiness is a mystery.
        if d.get("photos") == []:
            d["photos"] = False

        # Prettify and dump the JSON to a file. ensure_ascii=False emits
        # non-ASCII characters verbatim, hence the explicit UTF-8 encoding.
        json_dump = json.dumps(d, ensure_ascii=False, sort_keys=True, indent=2)
        jsonFilePath = myVars['paths']['paprika_jsondump'] + fileName + ".json"
        with open(jsonFilePath, 'w', encoding='utf-8') as jsonFile:
            jsonFile.write(json_dump)


In [17]:
# comment out for now

#for filename in os.listdir('/Users/jito/GitHub/joi.github.io/_recipes'):
#    if filename.endswith(".asm") or filename.endswith(".py"): 
  #      print(os.path.join('/Users/jito/GitHub/joi.github.io/_recipes', filename))
  #      continue
 #   else:
 #       continue