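# NOTE: this file was captured from a GitHub web page. The page chrome that
# preceded the source ("Skip to content", "Permalink", "master",
# "Switch branches/tags", "Go to file", "Cannot retrieve contributors at this
# time", "executable file 112 lines (91 sloc) 3.05 KB") is not part of the
# module and is kept here only as a comment.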
import sys
import json
import base64
from io import BytesIO
import logging
# --- Logging -----------------------------------------------------------------
# Dedicated 'ocr' logger that writes DEBUG-and-above records to stdout.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)

log = logging.getLogger('ocr')
log.setLevel(logging.DEBUG)
log.addHandler(ch)

# --- Module metadata -----------------------------------------------------------
# Error dict that handler() fills in and returns on failure.
misperrors = {'error': 'Error'}
# Reported by introspection().
userConfig = {}
inputSource = ['file']
# Reported by version(); 'config' is attached there from moduleconfig.
moduleinfo = {
    'version': '0.2',
    'author': 'Alexandre Dulaunoy',
    'description': 'Optical Character Recognition (OCR) module for MISP',
    'module-type': ['import'],
}
moduleconfig = []
def handler(q=False):
    """Run OCR on a base64-encoded image or PDF and return the recognised text.

    Args:
        q: JSON string of the request; its ``data`` field holds the
            base64-encoded document. ``False`` (the default) means there is
            no request to process.

    Returns:
        * the module-level ``misperrors`` dict, with ``error`` set, when one
          of pillow / wand / pytesseract cannot be imported or when Pillow
          cannot open the image data;
        * ``False`` when ``q`` is ``False`` (checked after the dependency
          imports);
        * otherwise ``{'results': [{'values': <text>, 'types': ['freetext']}]}``.
    """
    # Import the third-party dependencies lazily so that a missing package is
    # reported through the error dict rather than as an import failure.
    try:
        from PIL import Image
    except ImportError:
        misperrors['error'] = "Please pip(3) install pillow"
        return misperrors
    try:
        # Official ImageMagick module
        from wand.image import Image as WImage
    except ImportError:
        misperrors['error'] = "Please pip(3) install wand"
        return misperrors
    try:
        from pytesseract import image_to_string
    except ImportError:
        misperrors['error'] = "Please pip(3) install pytesseract"
        return misperrors

    if q is False:
        return False

    request = json.loads(q)
    raw = base64.b64decode(request["data"])

    # wand is used to detect the format and to rasterise PDFs. The context
    # managers make sure the ImageMagick resources are released on every path.
    with WImage(blob=raw) as document:
        if document.format == 'PDF':
            # Get number of pages
            pages = len(document.sequence)
            log.debug("PDF with %s page(s) detected", pages)
            page_height = document.height
            total_height = page_height * pages
            # Create new image object where the height will be the number of
            # pages. With huge PDFs this will overflow, break, consume silly
            # memory etc…
            with WImage(width=document.width, height=total_height) as canvas:
                # Cycle through pages and stitch it together to one big file
                for p in range(pages):
                    log.debug("Stitching page %s", p + 1)
                    canvas.composite(document.sequence[p], top=page_height * p, left=0)
                # Create a png blob
                image = canvas.make_blob('png')
            log.debug("Final image size is %sx%s", document.width, total_height)
        else:
            # Not a PDF: give Pillow the original bytes. BytesIO() needs a
            # bytes-like object, not the wand Image itself.
            image = raw

    try:
        im = Image.open(BytesIO(image))
    except IOError:
        misperrors['error'] = "Corrupt or not an image file."
        return misperrors

    ocrized = image_to_string(im)
    return {'results': [{'values': ocrized, 'types': ['freetext']}]}
def introspection():
    """Return a dict describing the module's user config and input source.

    The ``userConfig`` and ``inputSource`` module globals are each included
    under their own name when they are defined; a name that is not defined is
    silently left out.
    """
    setup = {}

    try:
        setup['userConfig'] = userConfig
    except NameError:
        pass

    try:
        setup['inputSource'] = inputSource
    except NameError:
        pass

    return setup
def version():
    """Return ``moduleinfo`` after storing ``moduleconfig`` under its ``config`` key."""
    moduleinfo.update(config=moduleconfig)
    return moduleinfo
if __name__ == '__main__':
    # Manual run: feed the contents of test.json to handler() as the request.
    # The context manager closes the file even if reading fails.
    with open('test.json', 'r') as x:
        payload = x.read()
    handler(q=payload)