In [1]:
"""
Project: QA Automated DQC PPTX
Script Developer: Nhan Tran
Updated: 01/22/24 12:00pm
Version: 1

Data Source: Tableau
Purpose: generating ad-hoc pptx report for QA DQC
Dependency: QA_DQC_layout.xlsx, QA_MOR_DQC_template.pptx, Tableau serv acc
"""

'\nProject: QA Automated DQC PPTX\nScript Developer: Nhan Tran\nUpdated: 01/22/24 12:00pm\nVersion: 1\n\nData Source: Tableau\nPurpose: generating ad-hoc pptx report for QA DQC\nDependency: QA_DQC_layout.xlsx, QA_MOR_DQC_template.pptx, Tableau serv acc\n'

# 1. App Settings

In [2]:
import pandas as pd, datetime as dt, numpy as np
import os, requests, shutil, io, subprocess, sys, PIL
import tableauserverclient as TSC
# import win32com.client as wcc
pd.options.mode.chained_assignment = None  #disable pandas warning

from pptx import Presentation


In [3]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd, numpy as np

# set the default user for dataiku api lookup
# NOTE(review): hard-coded user id; consider reading it from project variables
eid = 'E1724299'  

# create dataiku api object
# https://developer.dataiku.com/latest/concepts-and-examples/authinfo.html
client = dataiku.api_client()
user = client.get_user(eid)
# client_usr is an API client acting as `eid`; used only to fetch auth info below
client_usr = user.get_client_as()
# `ai` is read later in the notebook via ai["secrets"] to look up the Tableau password
ai = client_usr.get_auth_info(with_secrets=True)

# get existing connection info
# NOTE(review): `dcon` is not referenced anywhere else in the visible notebook
dcon = client.list_connections()

# managed folder that holds the pptx template and the Excel layout file;
# compiling() and package() read this module-level name directly
folder = dataiku.Folder('PHI_DataScienceShare')  #input folder

In [4]:
import time

def pyclk(opt, *ls):  #job clocking
    """Start a job timer, or format the time elapsed since it was started.

    Parameters
    ----------
    opt : int
        ``1`` returns the current ``time.time_ns()`` reading (timer start).
        Any other value formats the elapsed time.
    *ls : int
        When ``opt != 1``, ``ls[0]`` must be the reading returned by an
        earlier ``pyclk(1)`` call.

    Returns
    -------
    int or str
        The nanosecond reading when ``opt == 1``; otherwise a string such as
        ``'500ns'``, ``'2.5sec'``, ``'37min 24.81sec'`` or ``'2hour 5min'``.

    Raises
    ------
    IndexError
        If ``opt != 1`` and no start reading is supplied.
    """
    if opt == 1:  #start timer
        return time.time_ns()

    #calculate job runtime
    elapsed_ns = abs(ls[0] - time.time_ns())
    if not elapsed_ns > 1e+9:
        return f'{elapsed_ns:.0f}ns'

    secs = elapsed_ns / 1e+9
    if not secs > 60:
        return f'{round(secs, 2)}sec'

    mins = secs / 60
    if not mins > 60:
        # round the total first, then split: rounding the remainder on its own
        # could print "1min 60.0sec" for e.g. 119.999 seconds
        whole_min, rem_sec = divmod(round(secs, 2), 60)
        return f'{whole_min:.0f}min {round(rem_sec, 2)}sec'

    # same idea for hours: round to whole minutes before splitting so the
    # minute part can never be displayed as 60
    whole_hour, rem_min = divmod(round(mins), 60)
    return f'{whole_hour}hour {rem_min}min'

stime0 = pyclk(1)  #start timer

In [5]:
# class loc:
#     d0 = os.getcwd() + '/'
#     lcl = 'C:\\'
#     p = '//clinisilonhh/ifs/PHI_Access/'
#     d1 = p + 'PHI-CO - Data Science Share/script_scheduled/QA_pptx_auto/'
#     pdr = p + 'PHI-CO - QPSIC - Quality Data/automated_tableau_report/QA_MOR_DQC_pptx/'

In [6]:
def wb_lookup(wbn):  #search Tableau server for workbook by name
    """Return the first workbook on the Tableau server whose name equals ``wbn``.

    Uses the module-level ``server`` object, which must already be signed in.

    Parameters
    ----------
    wbn : str
        Exact workbook name to search for.

    Returns
    -------
    The first workbook item returned by ``server.workbooks.get``.

    Raises
    ------
    IndexError
        If the server returns no workbook with that name. The exception type
        matches what the bare ``[0]`` lookup raised before; only the message
        is more specific.
    """
    request_option = TSC.RequestOptions()
    request_option.filter.add(TSC.Filter(TSC.RequestOptions.Field.Name,
                                         TSC.RequestOptions.Operator.Equals,
                                         wbn))
    all_workbooks, _pagination_item = server.workbooks.get(request_option)
    if not all_workbooks:
        raise IndexError(f'No Tableau workbook named {wbn!r} was found on the server')
    return all_workbooks[0]

def vw_lookup(wb, vnm):  #within a given Tableau workbook, search view by name
    """Return the first view of workbook ``wb`` whose name equals ``vnm``.

    Populates ``wb.views`` through the module-level ``server`` object before
    searching.

    Parameters
    ----------
    wb : workbook item
        Workbook to search, e.g. the value returned by ``wb_lookup``.
    vnm : str
        Exact view name to search for.

    Raises
    ------
    IndexError
        If the workbook has no view with that name. The exception type matches
        what the bare ``[0]`` lookup raised before; only the message is more
        specific.
    """
    server.workbooks.populate_views(wb)
    for view in wb.views:
        if view.name == vnm:
            return view
    raise IndexError(
        f'No view named {vnm!r} in Tableau workbook {getattr(wb, "name", wb)!r}'
    )

def imgExtract(df):  #Tableau png extraction; reference table as input
    """Render one Tableau image per row of ``df`` and store the bytes in ``df['tbimg']``.

    Signs in with the module-level ``server`` / ``tableau_auth`` objects, walks
    the reference table grouped by workbook and then by view, renders each row
    with that row's filters, and always signs out afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Reference table. Columns read: ``workbook``, ``view``, ``f1a``/``f1b``
        and ``f2a``/``f2b`` (filter name/value pairs; a value of ``'ALL'``
        means "do not apply this filter"), and ``idx`` (row key). The frame is
        modified in place: the rendered image is written to column ``tbimg``.

    Returns
    -------
    None
    """
    print('Extraction image from Tableau...')
    server.auth.sign_in(tableau_auth)

    try:
        for w in df['workbook'].unique():
            print(' '*5 + w)
            wb_rows = df[df['workbook'] == w]  #search by workbook name
            wb = wb_lookup(w)
            for v in wb_rows['view'].unique():
                print(' '*10 + v)
                view_rows = wb_rows[wb_rows['view'] == v]  #search by workbook's view name
                vw = vw_lookup(wb, v)

                for _, row in view_rows.iterrows():
                    # build fresh options for every row: reusing one object kept the
                    # filters added for earlier rows, so a row marked 'ALL' could
                    # still be rendered with a previous row's filter applied
                    image_req_option = TSC.ImageRequestOptions(
                        imageresolution=TSC.ImageRequestOptions.Resolution.High)
                    if row['f1b'] != 'ALL': image_req_option.vf(row['f1a'], row['f1b'])  #apply period filter
                    if row['f2b'] != 'ALL': image_req_option.vf(row['f2a'], row['f2b'])  #apply fac filter
                    server.views.populate_image(vw, image_req_option)  #render image for extraction

                    #modified reference table to include extracted image byte code
                    df.loc[df['idx'] == row['idx'], 'tbimg'] = vw.image
    finally:
        # release the server session even when a lookup or render fails
        server.auth.sign_out()

# imgExtract(tk)

In [42]:
def saveimg(df, dtmp):  #write image as temp files for pptx compiling
    """Write each row's image bytes to a file under ``dtmp`` and record its path.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have columns ``idx`` (row key), ``tbimg`` (image bytes) and
        ``filename``. Modified in place: the written file path is stored in
        column ``dir``.
    dtmp : str
        Directory prefix. It is concatenated directly with the file name, so
        it has to end with a path separator. Created if it does not exist.

    Returns
    -------
    None
    """
    # make sure the temp folder is there before writing into it
    if not os.path.isdir(dtmp):
        os.mkdir(dtmp)

    print('Writing image to temp folder...', end=' ')
    for row_id in df['idx'].unique():
        is_row = df['idx'] == row_id
        matches = df[is_row]
        payload = matches['tbimg'].max()
        target = dtmp + matches['filename'].max()
        with open(target, 'wb') as out:
            out.write(payload)

        # remember where the image was written
        df.loc[is_row, 'dir'] = target

    print('Done', end='\n', flush=True)
    return None

# saveimg(tk)
def write_pptx(folder, root):
    """Serialise presentation ``root`` and upload it through ``folder``.

    Parameters
    ----------
    folder : object with ``upload_stream(name, stream)``
        Destination that receives the file.
    root : object with ``save(stream)``
        Presentation to write.

    Returns
    -------
    None
    """
    # serialise the deck into memory first
    staging = io.BytesIO()
    root.save(staging)

    # NOTE(review): `path` is read from module scope but is not defined anywhere
    # in the visible notebook; calling this function as-is raises NameError
    target = path + "QA_MOR_DQC_test234.pptx"

    # hand the uploader a fresh stream positioned at the start of the data
    upload = io.BytesIO(staging.getvalue())
    try:
        folder.upload_stream(target, upload)
    finally:
        upload.close()


def compiling(df, dtmp):  #compile pptx file
    """Build one PowerPoint deck per campus and upload it to ``dtmp``.

    For every non-null value in ``df['campus']`` the template is loaded from
    the module-level ``folder`` / ``ptmp``. The image of every row that has a
    page number and belongs to that campus, or has ``ptype == 'overview'``,
    is placed on the slide with that index. The deck is uploaded as
    ``QA_DQC_<campus>.pptx``.

    Parameters
    ----------
    df : pandas.DataFrame
        Reference table. Columns read: ``campus``, ``page`` (slide index into
        the template), ``ptype`` and ``tbimg`` (image bytes).
    dtmp : object with ``upload_stream(name, stream)``
        Destination folder for the generated decks.

    Returns
    -------
    None
    """
    # python-pptx interprets bare integers as EMU (914400 per inch), so the
    # former left=0/top=10/width=720/height=475 produced a microscopic picture.
    # Those numbers look like point values inherited from the earlier
    # COM-based version of this function; Pt() converts points to EMU.
    from pptx.util import Pt

    print('Compiling pptx files...', end=' ')

    # the template is identical for every campus: download it once
    template_bytes = folder.get_download_stream(ptmp).read()

    for x in df[df['campus'].notna()]['campus'].unique():
        pt1 = df[(df['page'].notna()) & ((df['campus'] == x) | (df['ptype'] == 'overview'))]
        pt1 = pt1.sort_values(by=['page'])

        #load pptx template (fresh copy per campus)
        ppt = Presentation(io.BytesIO(template_bytes))

        # TODO: the title-slide placeholders ('campus', 'fy8', 'period') were
        # filled in by the earlier COM implementation and are not replaced here.

        for p in pt1['page'].unique():
            pt2 = pt1[pt1['page'] == p]

            # slide lookup needs a real integer; `page` becomes float when the
            # column contains blanks
            slid = ppt.slides[int(p)]

            img = io.BytesIO(pt2['tbimg'].max())
            slid.shapes.add_picture(img, left=Pt(0), top=Pt(10),
                                    width=Pt(720), height=Pt(475))

        #write pptx files
        buffer = io.BytesIO()
        ppt.save(buffer)
        buffer.seek(0)  #rewind so the upload reads from the first byte
        dtmp.upload_stream(f'QA_DQC_{x}.pptx', buffer)

    print('Done', end='\n', flush=True)

# compiling(tk)

In [8]:
def package():
    """Run the whole job: load the layout table, pull Tableau images, build decks.

    Reads the Excel layout file (module-level ``folder`` / ``xfile``), derives
    an output file name and a row key for every row of the ``filter`` sheet,
    then calls ``imgExtract`` and ``compiling``.

    Returns
    -------
    None
    """
    print('Starting python job...')
    #managed folder that receives the generated pptx files
    dtmp = dataiku.Folder('PHI_DataScienceShare')  #input folder

    # load reference table for png extraction
    # wrap the downloaded bytes in BytesIO: passing raw bytes to pandas' Excel
    # reader is deprecated, and the context manager closes the workbook
    layout_bytes = folder.get_download_stream(xfile).read()
    with pd.ExcelFile(io.BytesIO(layout_bytes)) as xlsx:
        tk = xlsx.parse(sheet_name='filter').fillna({f:'ALL' for f in ['f1b','f2b']})

    # rows without a campus are labelled 'overview' in the file name
    tk['campux'] = tk['campus'].fillna('overview').str.replace(' ','')
    tk['filename'] = tk['workbook'].str.replace(' ','') + '_' + tk['view'].str.replace(' ','') + '_'
    # drop every non-word character before appending campus and extension
    tk['filename'] = tk['filename'].str.replace(r'[\W]+','', regex=True) + tk['campux'] + '.png'
    tk['idx'] = tk.index  #row key used by imgExtract

    imgExtract(tk)
    compiling(tk, dtmp)
    print('Job completed!')

In [9]:
# folder2 = dataiku.Folder('PHI_DataScienceShare')  #input folder

# 2. User inputs

In [10]:
# Tableau token credential
# https://help.tableau.com/current/server/en-us/security_personal_access_tokens.htm
# User input token info
# NOTE(review): the comments above mention a token, but the code below signs in
# with a username/password pair (TSC.TableauAuth), not a personal access token
usr = 'svc.dscitableausx'  #username for password lookup; read-write access
# password comes from the Dataiku secrets in `ai`; the [0] raises IndexError
# when no secret has a key equal to `usr`
psw = [k['value'] for k in ai["secrets"] if k['key'] == usr][0]

# both objects are read as module-level names by imgExtract / wb_lookup / vw_lookup
tableau_auth = TSC.TableauAuth(usr, psw, site_id='QPSIC')
server = TSC.Server('https://tableau.memorialhermann.org', use_server_version=True)

In [11]:
# inspection only: show the files currently in the managed folder
folder.list_paths_in_partition()

['/QA_url_v5.xlsx',
 '/QA_tableau_auto_pptx.py',
 '/QA_MOR_DQC_template.pptx',
 '/QA_DQC_layout.xlsx',
 '/hospital_package_templ_v2.pptx',
 '/FY23 MOR-DQC Reporting-MHTW_202311.pptx',
 '/QA_MOR_DQC_template_v2.pptx']

In [43]:
# locate the pptx template and the Excel layout inside the managed folder
# (list the folder once and reuse the result for both lookups)
folder_paths = folder.list_paths_in_partition()

pfil = 'QA_MOR_DQC_template_v2.pptx'
ptmp = [x for x in folder_paths if pfil in x][0]

xfil = 'QA_DQC_layout.xlsx'
xfile = [x for x in folder_paths if xfil in x][0]
date = dt.datetime.now()

# establish reporting period
# the fiscal-year label switches in July: Jan-Jun use the current calendar
# year, Jul-Dec use the next one
now = dt.datetime.now()
if now.month < 7: fy = 'FY%s' %(now.year -2000)
# `now.year` is an int; the former `now.year[-2]` raised TypeError from July on
else: fy = 'FY%s' %((now.year + 1) -2000)

per = now.strftime('%B %Y')
rdt = dt.datetime(date.year, date.month, 1)  #first day of the current month
package()
print('Total job runtime:', pyclk(0, stime0))  #end timer

Starting python job...
Extraction image from Tableau...
     Balanced Scorecard
          Patient Safety Indicators
          Hospital Acquired Infections
     QPSIC SSER Dashboard
          MOR SSER
     MHHS Management Metrics
          Landing Page
     Hand Hygiene
          Acute Care
Compiling pptx files... Done
Job completed!
Total job runtime: 37min 24.81sec
