# Slides preparation


To begin with, `source_dir` and `dest_dir` should be set properly
- `source_dir`: The storage path where offer slides are existed
- `dest_dir`: The slide images, .json files with labels, and other related files are saved in this path

In [1]:
import os
import platform
from glob import glob
import ast


is_windows = platform.system().lower() == 'windows'
if is_windows:
    source_dir = "C:\\Users\\Kun\\Desktop" 
else:
    source_dir = "/Users/kun-lin/Desktop"
source_dir = os.path.join(source_dir, f"Angebot")
dest_dir = "output"

PPTtoImg_dir = os.path.join(dest_dir, f"PPTtoImg")
fWordImg_dir = os.path.join(dest_dir, f"FewWordImg")
ct_fWordImg_dir = os.path.join(fWordImg_dir, f"CT_FewWordImg")
topshapetype_dir = os.path.join(dest_dir, f"TopShapeTypeImg")
labels_dir = os.path.join(dest_dir, f"LABELS")
checkpoint_path = os.path.join(dest_dir, f"checkpoints")


if not os.path.exists(dest_dir):
    os.makedirs(dest_dir)
if not os.path.exists(PPTtoImg_dir):
    os.makedirs(PPTtoImg_dir)
if not os.path.exists(fWordImg_dir):
    os.makedirs(fWordImg_dir)
if not os.path.exists(ct_fWordImg_dir):
    os.makedirs(ct_fWordImg_dir)  
if not os.path.exists(topshapetype_dir):
    os.makedirs(topshapetype_dir)
if not os.path.exists(labels_dir):
    os.makedirs(labels_dir)
if not os.path.exists(checkpoint_path):
    os.makedirs(checkpoint_path)

print("Source Directory: {}".format(source_dir))
print("PPT to Image Directory: {}".format(PPTtoImg_dir))
print("Few Words Image Directory: {}".format(fWordImg_dir))
print("Common Type Few Words Image Directory: {}".format(ct_fWordImg_dir))
print("Labels: {}".format(labels_dir))
print("Check Point Path: {}".format(checkpoint_path))

Source Directory: C:\Users\Kun\Desktop\Angebot
PPT to Image Directory: output\PPTtoImg
Few Word Image Directory: output\FewWordImg
Labels: output\LABELS
Check Point Path: output\checkpoints


In [2]:
import re
class Extract_info_from_directory:
    def __init__(self, _folder_src) -> None:
        self._folder_src = _folder_src

    def ver_comparison(self, ppts_):
        # For V_1_3 and V_2_1 ====> V_2_1
        max_f_dig = float("-inf")
        comp_s_dig = float("-inf")
        tar_ = ""
        for ppt in ppts_:
            match = re.search(r"V_([0-30]+)_([0-30]+)", ppt)
            if match != None:
                m = match.group(0)
                f_dig = int(match.group(1))
                s_dig = int(match.group(2))

                if f_dig > max_f_dig:
                    max_f_dig = f_dig
                    comp_s_dig = s_dig
                    tar_ = ppt
                elif f_dig == max_f_dig:
                    if s_dig > comp_s_dig:
                        comp_s_dig = s_dig
                        tar_ = ppt

        # For case V1, V2, V10 ======> V10
        max_f_dig = float("-inf")
        tar_2 = ""
        for ppt in ppts_:
            match = re.search(r"[Vv]\d+[ _]", ppt)
            if match != None:
                m = match.group(0)
                f_dig = int(re.search(r"[Vv](\d+)[ _]", m).group(1))
                if f_dig > max_f_dig:
                    max_f_dig = f_dig
                    tar_2 = ppt

        if tar_ == tar_2:
            return tar_
        if tar_ != "" and tar_2 != "":
            return max([tar_, tar_2], key=os.path.getmtime)
        if tar_ != "":
            return tar_
        if tar_2 != "":
            return tar_2

        return ""

    def key_word_selection(self, ppts_):
        list_tmp = []
        for ppt in ppts_:
            if ppt.find("angebot") != -1 or ppt.find("Angebot") != -1:
                list_tmp.append(ppt)
            if ppt.find("anlage") != -1 or ppt.find("Anlage") != -1:
                list_tmp.append(ppt)
            if ppt.find("enclosure") != -1 or ppt.find("Enclosure") != -1:
                list_tmp.append(ppt)
        list_tmp = set(list_tmp)
        return list_tmp

    def order_condition(self, str):
        if str.find("angebot") != -1 or str.find("Angebot") != -1:
            return float(os.path.getmtime(str))
        if str.find("anlage") != -1 or str.find("Anlage") != -1:
            return float(os.path.getmtime(str))
        if str.find("enclosure") != -1 or str.find("Enclosure") != -1:
            return float(os.path.getmtime(str))

    def select_target_ppt(self, ppts_):
        tar_ppt = ""
        list_ppts_ = []
        # if there are no ppts, return empty string
        if len(ppts_) < 1:
            return tar_ppt
        tar_ppt = self.ver_comparison(ppts_)

        if tar_ppt != "":
            return tar_ppt

        list_ppts_ = self.key_word_selection(ppts_)
        list_ppts_ = set([*list_ppts_, tar_ppt])
        while "" in list_ppts_:
            list_ppts_.remove("")
        if len(list_ppts_) > 0:
            tar_ppt = max(list_ppts_, key=self.order_condition)
        # if no ppt is found, return the latest modified ppt
        if tar_ppt == "":
            tar_ppt = max(ppts_, key=os.path.getmtime)

        return tar_ppt

    def traversal_directory(self):
        directory = self._folder_src
        list_ppts = []
        list_with_v = []
        list_with_anlage = []
        list_with_angebot = []
        list_with_enclosure = []
        num_ppt = 0
        # print(f"Checking ppts inside the folder {folder}")
        # Convert each ppts into PNGs in a separate directory structure with same folder name
        for folder in os.listdir(directory):
            f = os.path.join(directory, folder)
            cache_ppt = glob(os.path.join(f, "*.pptx"))
            num_ppt += len(cache_ppt)
            # create ppt file reader object

            for ppt in cache_ppt:
                if ppt.find("_V") != -1 or ppt.find("V_") != -1:
                    list_with_v.append(ppt)
                if ppt.find("_v") != -1 or ppt.find("v_") != -1:
                    list_with_v.append(ppt)
                if ppt.find("angebot") != -1 or ppt.find("Angebot") != -1:
                    list_with_angebot.append(ppt)
                if ppt.find("anlage") != -1 or ppt.find("Anlage") != -1:
                    list_with_anlage.append(ppt)
                if ppt.find("enclosure") != -1 or ppt.find("Enclosure") != -1:
                    list_with_enclosure.append(ppt)

            ppt = self.select_target_ppt(cache_ppt)
            if ppt != "":
                # print(ppt)
                list_ppts.append(ppt)

        print("Original total: ", num_ppt)
        print("With V: ", len(list_with_v))
        print("With Angebot: ", len(list_with_angebot))
        print("With Anlage: ", len(list_with_anlage))
        print("With Enclosure: ", len(list_with_enclosure))
        list_chk = [
            *list_with_v,
            *list_with_anlage,
            *list_with_angebot,
            *list_with_enclosure,
        ]
        list_chk = set(list_chk)
        print("After removing duplicate, Total: ", len(list_chk))

        return list_ppts

In [3]:
# Travesal Source Directory
extract_info_dir = Extract_info_from_directory(source_dir)
extract_ppts = extract_info_dir.traversal_directory()
for ppt in extract_ppts:
    if ppt.find(" .pptx"):
        old = ppt
        new = ppt.replace(" .pptx", ".pptx")
        os.rename(old, new)

Original total:  1517
With V:  248
With Angebot:  1517
With Anlage:  788
With Enclosure:  220
After removing duplicate, Total:  1517


In [4]:
def shape_type_collect(_shape, _dict, idx):
    if _shape.shape_type == MSO_SHAPE_TYPE.AUTO_SHAPE:
        _dict[idx]['shape_type'].append('AUTO_SHAPE')
    elif _shape.shape_type == MSO_SHAPE_TYPE.CALLOUT:
        _dict[idx]['shape_type'].append('CALLOUT')
    elif _shape.shape_type == MSO_SHAPE_TYPE.CANVAS:
        _dict[idx]['shape_type'].append('CANVAS')
    elif _shape.shape_type == MSO_SHAPE_TYPE.CHART:
        _dict[idx]['shape_type'].append('CHART')
    elif _shape.shape_type == MSO_SHAPE_TYPE.COMMENT:
        _dict[idx]['shape_type'].append('COMMENT')
    elif _shape.shape_type == MSO_SHAPE_TYPE.DIAGRAM:
        _dict[idx]['shape_type'].append('DIAGRAM')
    elif _shape.shape_type == MSO_SHAPE_TYPE.EMBEDDED_OLE_OBJECT:
        _dict[idx]['shape_type'].append('EMBEDDED_OLE_OBJECT')
    elif _shape.shape_type == MSO_SHAPE_TYPE.FORM_CONTROL:
        _dict[idx]['shape_type'].append('FORM_CONTROL')
    elif _shape.shape_type == MSO_SHAPE_TYPE.FREEFORM:
        _dict[idx]['shape_type'].append('FREEFORM')
    elif _shape.shape_type == MSO_SHAPE_TYPE.GROUP:
        _dict[idx]['shape_type'].append('GROUP')
    elif _shape.shape_type == MSO_SHAPE_TYPE.IGX_GRAPHIC:
        _dict[idx]['shape_type'].append('IGX_GRAPHIC')
    elif _shape.shape_type == MSO_SHAPE_TYPE.INK:
        _dict[idx]['shape_type'].append('INK')
    elif _shape.shape_type == MSO_SHAPE_TYPE.INK_COMMENT:
        _dict[idx]['shape_type'].append('INK_COMMENT')
    elif _shape.shape_type == MSO_SHAPE_TYPE.LINE:
        _dict[idx]['shape_type'].append('LINE')
    elif _shape.shape_type == MSO_SHAPE_TYPE.LINKED_OLE_OBJECT:
        _dict[idx]['shape_type'].append('LINKED_OLE_OBJECT')
    elif _shape.shape_type == MSO_SHAPE_TYPE.LINKED_PICTURE:
        _dict[idx]['shape_type'].append('LINKED_PICTURE')
    elif _shape.shape_type == MSO_SHAPE_TYPE.MEDIA:
        _dict[idx]['shape_type'].append('MEDIA')
    elif _shape.shape_type == MSO_SHAPE_TYPE.OLE_CONTROL_OBJECT:
        _dict[idx]['shape_type'].append('OLE_CONTROL_OBJECT')
    elif _shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
        _dict[idx]['shape_type'].append('PICTURE')
    elif _shape.shape_type == MSO_SHAPE_TYPE.PLACEHOLDER:
        _dict[idx]['shape_type'].append('PLACEHOLDER')
    elif _shape.shape_type == MSO_SHAPE_TYPE.SCRIPT_ANCHOR:
        _dict[idx]['shape_type'].append('SCRIPT_ANCHOR')
    elif _shape.shape_type == MSO_SHAPE_TYPE.TABLE:
        _dict[idx]['shape_type'].append('TABLE')
    elif _shape.shape_type == MSO_SHAPE_TYPE.TEXT_BOX:
        _dict[idx]['shape_type'].append('TEXT_BOX')
    elif _shape.shape_type == MSO_SHAPE_TYPE.TEXT_EFFECT:
        _dict[idx]['shape_type'].append('TEXT_EFFECT')
    elif _shape.shape_type == MSO_SHAPE_TYPE.WEB_VIDEO:
        _dict[idx]['shape_type'].append('WEB_VIDEO')
    elif _shape.shape_type == MSO_SHAPE_TYPE.MIXED:
        _dict[idx]['shape_type'].append('MIXED')

    return _dict

def text_extract(_shape, _dict, idx):
    # print(shape.text)
    text = _shape.text.replace('\x0b', ' ') # remove vertical tab 
    text = text.replace('\uf0a0', ',')
    text_splitted = [x for x in re.split("\\n", text) if x]
    if len(text_splitted) > 0:
        _dict[idx]['contents'].append(text_splitted)
        _dict[idx]['shape_text_id'].append(_shape.shape_id)
        _dict[idx]['shape_text_name'].append(_shape.name)
        try:
            pos = {
                'left': round(_shape.left.cm,2), 'top':round(_shape.top.cm,2), 
                'width':round(_shape.width.cm,2), 'height':round(_shape.height.cm,2)
            }
        except AttributeError:
            pos = {'left': 0.0, 'top': 0.0, 
                'width': 0.0, 'height': 0.0
            }
        _dict[idx]['shape_pos'].append(pos)
        
    return _dict


def group_shape_traversal(_shape, _dict, idx, in_group = False):
    group_shapes = [
            shp for shp in _shape.shapes
            if shp.shape_type == MSO_SHAPE_TYPE.GROUP
        ]
    
    if len(group_shapes) > 0:
        for group_shape in group_shapes:     
            for shape in group_shape.shapes:
                _dict= shape_type_collect(shape, _dict, idx)
                if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
                    _dict = group_shape_traversal(shape, _dict, idx, in_group = True)
                
                if shape.has_text_frame:
                    _dict = text_extract(shape, _dict, idx)
    elif in_group:
        for shape in _shape.shapes:
            _dict= shape_type_collect(shape, _dict, idx)
            if shape.has_text_frame:
                _dict = text_extract(shape, _dict, idx)
    
    return _dict

In [5]:
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
import re

list_pptfiles = []
for n, file in enumerate(extract_ppts):
    try:
        print(file)
        prs = Presentation(file)
    except AttributeError:
        continue
    
    content_dict = {}
    for i, slide in enumerate(prs.slides):
        # print(i)
        
        content_dict[i] = {'file_name': file, 
                           'titles': [], 
                           'contents': [], 
                           'page_num': i+1,
                           'shape_type': [], 
                           'shape_text_id': [], 
                           'shape_text_name':[], 
                           'shape_pos': []
                    }
        has_title = False
            
        if slide.shapes.title:
            text_title = slide.shapes.title.text.replace('\x0b', ' ')
            content_dict[i]['titles'].append(text_title)
            has_title = True
            
        group_shape_traversal(slide, content_dict, i)
        
        for j, shape in enumerate(slide.shapes):
            #print(shape.name)
            shape_type_collect(shape, content_dict, i)
            content_dict[i]['shape_type'] = list(set(content_dict[i]['shape_type']))

            if shape.has_text_frame:
                
                if shape.name.find('Datumsplatzhalter')!=-1:
                        continue
                if shape.name.find('Fußzeilenplatzhalter')!=-1:
                        continue
                if shape.name.find('Foliennummern')!=-1:
                        continue
                if shape.text.find('Confidential')!=-1:
                        continue
                if shape.name.find('© UNITY')!=-1:
                        continue
                # content_is_text = False                
                for paragraph in shape.text_frame.paragraphs:
                    
                    for run in paragraph.runs:
                        try:
                            f_size = run.font.size.pt

                        except AttributeError:
                            f_size = 0

                        if f_size >= 18 and not has_title: # main cover title in general 
                            content_dict[i]['titles'].append(run.text)
                        # else:
                        #     content_is_text = True
                        #     break

                # if content_is_text:
                text = shape.text.replace('\x0b', ' ') # remove vertical tab 
                text = text.replace('\uf0a0', ',')
                text = text.replace('\uf0e8', '') # \uf0e8 is the hexadecimal representation of the Unicode character \uFFFD
                text = text.replace('\t', ' ') # remove tab
                text = text.replace('\uf0b7', '') # remove bullet
                text = text.replace('©', '')
                
                if len(content_dict[i]['titles']) > 1: # if there are more than one title
                    has_title = True
                    tile_by_piece = content_dict[i]['titles']
                    title = ''
                    for str in tile_by_piece:
                        if str != '' and not str.isspace():
                            if str[-1] == ' ':
                                title += ''.join(str)
                            elif str[-1] != ' ':
                                title += ' '+ str
                        
                    content_dict[i]['titles'] = [title]
                        
                text_splitted = [x for x in re.split("\\n", text) if x]
                
                # if has_title and content_dict[i]['titles'][0] in text_splitted:
                #     text_splitted.remove(content_dict[i]['titles'][0])
                if len(text_splitted) > 0:
                    content_dict[i]['contents'].append(text_splitted)
                    content_dict[i]['shape_text_id'].append(shape.shape_id)
                    content_dict[i]['shape_text_name'].append(shape.name)
                    try:
                        pos = {'left': round(shape.left.cm,2), 'top':round(shape.top.cm,2), 
                            'width':round(shape.width.cm,2), 'height':round(shape.height.cm,2)
                        }
                    except AttributeError:
                        pos = {'left': 0.0, 'top': 0.0, 
                            'width': 0.0, 'height': 0.0
                        }
                    
                    content_dict[i]['shape_pos'].append(pos)
                                              
    list_pptfiles.append(content_dict)

C:\Users\Kun\Desktop\Angebot\10831\Enclosure 1 – Service Description to offer No. 10831.pptx
C:\Users\Kun\Desktop\Angebot\11137\Enclosure 1 - Service Description to offer No. 11137.pptx
C:\Users\Kun\Desktop\Angebot\11149\Anlage1_Dialog@Cloud.pptx
C:\Users\Kun\Desktop\Angebot\11255\220204_UNITY_Impuls_Blanc_&_Fischer.pptx
C:\Users\Kun\Desktop\Angebot\11371\11371_Anlage_1.pptx
C:\Users\Kun\Desktop\Angebot\11453\Anlage_1_UC44 2021_V_2_0.pptx
C:\Users\Kun\Desktop\Angebot\11580\Franz_Haniel_Akquisition_Process_V_0_1.pptx
C:\Users\Kun\Desktop\Angebot\11594\Angebot_11594_eng.pptx
C:\Users\Kun\Desktop\Angebot\11610\Angebot_Sumitomo-Demag_Getting Agile and Lean_V_1_0.pptx
C:\Users\Kun\Desktop\Angebot\11637\Anlage_1_Leistungsbeschreibung_11637.pptx
C:\Users\Kun\Desktop\Angebot\11815\Anlage 1 Leistungsbeschreibung zu Angebot Nr. 11815.pptx
C:\Users\Kun\Desktop\Angebot\11863\Anlage_1_Leistungsbeschreibung_11863.pptx
C:\Users\Kun\Desktop\Angebot\11981\Anlage1_EED_11981_V_2_0.pptx
C:\Users\Kun\Deskt

In [6]:
len(list_pptfiles)

1091

In [7]:
import pandas as pd
df = pd.DataFrame(columns=['main_title', 'file_name', 'page_num', 'shape_type', 'contents', 'shape_text_id', 'shape_text_name','left', 'top', 'width', 'height'])
for n in range(len(list_pptfiles)):
    df_pptfiles = pd.DataFrame(list_pptfiles[n]).T
    df_expend = pd.DataFrame(df_pptfiles.shape_pos.tolist()).stack().reset_index(level=1, drop=True).to_frame('shape_pos')
    # shape position and other information decompose
    df_shape_pos_expend = pd.DataFrame([j for i, j in df_expend['shape_pos'].items()], index=df_expend.index)
    df_shape_pos_expend['page_num'] = df_shape_pos_expend.index
    df_shape_pos_expend['file_name'] = df_shape_pos_expend['page_num'].apply(lambda x: df_pptfiles.loc[x]['file_name'])
    df_shape_pos_expend['main_title']= df_shape_pos_expend['page_num'].apply(lambda x: df_pptfiles.loc[0]['titles'][0] if len(df_pptfiles.loc[0]['titles'])>0 else "No Title" )
    df_shape_pos_expend['shape_type'] = df_shape_pos_expend['page_num'].apply(lambda x: df_pptfiles.loc[x]['shape_type'])
    df_shape_pos_expend['page_num'] = df_shape_pos_expend['page_num'].apply(lambda x: df_pptfiles.loc[x]['page_num'])
   
    
    df_shape_text_id = pd.DataFrame(data = df_pptfiles.shape_text_id.tolist()).stack().reset_index(level=1, drop=True).to_frame('shape_text_id')
    df_shape_text_name = pd.DataFrame(data = df_pptfiles.shape_text_name.tolist()).stack().reset_index(level=1, drop=True).to_frame('shape_text_name')
    df_contents = pd.DataFrame(data = df_pptfiles.contents.tolist()).stack().reset_index(level=1, drop=True).to_frame('contents')
    df_pptfiles = pd.concat([df_shape_pos_expend, df_shape_text_id, df_shape_text_name, df_contents], axis=1)
    df = pd.concat([df, df_pptfiles], axis=0)
    df = df.reindex(columns=['main_title', 'file_name', 'page_num', 'shape_type', 'contents', 'shape_text_id', 'shape_text_name', 'left', 'top', 'width', 'height'])


pd.set_option('display.max_colwidth', None)
df.reset_index(drop=True, inplace=True)
df = df.explode('contents', ignore_index=True)
df['shape_text_id'] = df['shape_text_id'].astype('int64')
df['page_num'] = df['page_num'].astype('int64')

In [8]:
from unstructured.cleaners.core import clean, group_broken_paragraphs, replace_unicode_quotes, remove_punctuation, clean_non_ascii_chars

def clean_data_from_unstructured(text):
    
    text = re.sub(r'(https?://\S+|www\.\S+)', 'urlurl', text)
    text = clean(text, 
          extra_whitespace = True,    
          bullets= True,
    )
    text = group_broken_paragraphs(text)
    text = replace_unicode_quotes(text)
    text = remove_punctuation(text)
    # text = clean_non_ascii_chars(text)
    text = re.sub(r'(XXX|XxX|XXx|xXX|xxX|xxx|xxxx|xx|Xx|xX|Xxxxxx)', '', text)  # add new conditions
    # text = re.sub(r'[^ \nA-Za-z0-9À-ÖØ-öø-ÿ/äÄöÖüÜß,()/€+-&.?]+', '', text)
    text = re.sub(r'[^ \nA-Za-z0-9À-ÖØ-öø-ÿ/äÄöÖüÜß,()/€+&.?-]+', '', text)

    return text

def clean_data(text):
    text = re.sub(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?', 'urlurl', text)
    text = re.sub(r'(XXX|XxX|XXx|xXX|xxX|xxx|xxxx|xx|Xx|xX|Xxxxxx)', '', text)  # add new conditions
    # text = re.sub(r'[^ \nA-Za-z0-9À-ÖØ-öø-ÿ/äÄöÖüÜß,()/€+-&.?]+', '', text)
    # text = re.sub(r'[\\\\×\^\]\[÷]', '', text)  # remove backslashes and other characters
    return text

In [9]:
df = df.drop(columns={'left','top', 'width', 'height'})
df['contents'] = df['contents'].str.strip()
df['contents'] = df['contents'].apply(lambda x: clean_data_from_unstructured(x))
df = df[df['contents'] != '']
df = df[df['contents'].str.len()>1]
df.head()

Unnamed: 0,main_title,file_name,page_num,shape_type,contents,shape_text_id,shape_text_name
0,"iCert 2.0 Change Management - Design Sprint & SupportEnclosure 1 – Service Description to offer No. 10831 Munich, 20 th June 2022, UNITY AG",C:\Users\Kun\Desktop\Angebot\10831\Enclosure 1 – Service Description to offer No. 10831.pptx,1,"[PICTURE, FREEFORM, TEXT_BOX]",iCert 20 Change Management \n\n\n\n Design Sprint Support,5,Textfeld 4
1,"iCert 2.0 Change Management - Design Sprint & SupportEnclosure 1 – Service Description to offer No. 10831 Munich, 20 th June 2022, UNITY AG",C:\Users\Kun\Desktop\Angebot\10831\Enclosure 1 – Service Description to offer No. 10831.pptx,1,"[PICTURE, FREEFORM, TEXT_BOX]",Enclosure 1 Service Description to offer No 10831,5,Textfeld 4
2,"iCert 2.0 Change Management - Design Sprint & SupportEnclosure 1 – Service Description to offer No. 10831 Munich, 20 th June 2022, UNITY AG",C:\Users\Kun\Desktop\Angebot\10831\Enclosure 1 – Service Description to offer No. 10831.pptx,1,"[PICTURE, FREEFORM, TEXT_BOX]",Munich 20th June 2022 UNITY AG,5,Textfeld 4
4,"iCert 2.0 Change Management - Design Sprint & SupportEnclosure 1 – Service Description to offer No. 10831 Munich, 20 th June 2022, UNITY AG",C:\Users\Kun\Desktop\Angebot\10831\Enclosure 1 – Service Description to offer No. 10831.pptx,2,"[PLACEHOLDER, AUTO_SHAPE, FREEFORM, TABLE, GROUP, TEXT_BOX, EMBEDDED_OLE_OBJECT]",B Intranet,90,Rectangle: Rounded Corners 89
5,"iCert 2.0 Change Management - Design Sprint & SupportEnclosure 1 – Service Description to offer No. 10831 Munich, 20 th June 2022, UNITY AG",C:\Users\Kun\Desktop\Angebot\10831\Enclosure 1 – Service Description to offer No. 10831.pptx,2,"[PLACEHOLDER, AUTO_SHAPE, FREEFORM, TABLE, GROUP, TEXT_BOX, EMBEDDED_OLE_OBJECT]",Create content and set\n\n\n\nup concept structures,90,Rectangle: Rounded Corners 89


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 477612 entries, 0 to 502934
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   main_title       477612 non-null  object
 1   file_name        477612 non-null  object
 2   page_num         477612 non-null  int64 
 3   shape_type       477612 non-null  object
 4   contents         477612 non-null  object
 5   shape_text_id    477612 non-null  int64 
 6   shape_text_name  477612 non-null  object
dtypes: int64(2), object(5)
memory usage: 29.2+ MB


In [11]:
df.to_csv('pptx_list_by_line.csv', sep = ',', index=False, header=True, encoding='utf-8-sig')
print('Sales record successfully exported into csv File')

Sales record successfully exported into csv File


# Load PPT list 

In [12]:
import pandas as pd
# Since it take too much time on traversal ppt files, Easily to import the dataframe from the excel file
df_csv = pd.read_csv('./pptx_list_by_line.csv')

In [13]:
df_csv.head(5)

Unnamed: 0,main_title,file_name,page_num,shape_type,contents,shape_text_id,shape_text_name
0,"iCert 2.0 Change Management - Design Sprint & SupportEnclosure 1 – Service Description to offer No. 10831 Munich, 20 th June 2022, UNITY AG",C:\Users\Kun\Desktop\Angebot\10831\Enclosure 1 – Service Description to offer No. 10831.pptx,1,"['PICTURE', 'FREEFORM', 'TEXT_BOX']",iCert 20 Change Management \n\n\n\n Design Sprint Support,5,Textfeld 4
1,"iCert 2.0 Change Management - Design Sprint & SupportEnclosure 1 – Service Description to offer No. 10831 Munich, 20 th June 2022, UNITY AG",C:\Users\Kun\Desktop\Angebot\10831\Enclosure 1 – Service Description to offer No. 10831.pptx,1,"['PICTURE', 'FREEFORM', 'TEXT_BOX']",Enclosure 1 Service Description to offer No 10831,5,Textfeld 4
2,"iCert 2.0 Change Management - Design Sprint & SupportEnclosure 1 – Service Description to offer No. 10831 Munich, 20 th June 2022, UNITY AG",C:\Users\Kun\Desktop\Angebot\10831\Enclosure 1 – Service Description to offer No. 10831.pptx,1,"['PICTURE', 'FREEFORM', 'TEXT_BOX']",Munich 20th June 2022 UNITY AG,5,Textfeld 4
3,"iCert 2.0 Change Management - Design Sprint & SupportEnclosure 1 – Service Description to offer No. 10831 Munich, 20 th June 2022, UNITY AG",C:\Users\Kun\Desktop\Angebot\10831\Enclosure 1 – Service Description to offer No. 10831.pptx,2,"['PLACEHOLDER', 'AUTO_SHAPE', 'FREEFORM', 'TABLE', 'GROUP', 'TEXT_BOX', 'EMBEDDED_OLE_OBJECT']",B Intranet,90,Rectangle: Rounded Corners 89
4,"iCert 2.0 Change Management - Design Sprint & SupportEnclosure 1 – Service Description to offer No. 10831 Munich, 20 th June 2022, UNITY AG",C:\Users\Kun\Desktop\Angebot\10831\Enclosure 1 – Service Description to offer No. 10831.pptx,2,"['PLACEHOLDER', 'AUTO_SHAPE', 'FREEFORM', 'TABLE', 'GROUP', 'TEXT_BOX', 'EMBEDDED_OLE_OBJECT']",Create content and set\n\n\n\nup concept structures,90,Rectangle: Rounded Corners 89


In [14]:
df_csv[df_csv['contents'].isnull()]

Unnamed: 0,main_title,file_name,page_num,shape_type,contents,shape_text_id,shape_text_name
214068,"Analysis and Concept for data-driven business models Project approach March 17, 2022 | UNITY AG",C:\Users\Kun\Desktop\Angebot\21189\Swisslog_Angebotsskizze_V2.pptx,35,"['LINE', 'PLACEHOLDER', 'AUTO_SHAPE', 'FREEFORM', 'GROUP']",,287,Rechteck 286
214070,"Analysis and Concept for data-driven business models Project approach March 17, 2022 | UNITY AG",C:\Users\Kun\Desktop\Angebot\21189\Swisslog_Angebotsskizze_V2.pptx,35,"['LINE', 'PLACEHOLDER', 'AUTO_SHAPE', 'FREEFORM', 'GROUP']",,290,Rechteck 289
214072,"Analysis and Concept for data-driven business models Project approach March 17, 2022 | UNITY AG",C:\Users\Kun\Desktop\Angebot\21189\Swisslog_Angebotsskizze_V2.pptx,35,"['LINE', 'PLACEHOLDER', 'AUTO_SHAPE', 'FREEFORM', 'GROUP']",,297,Rechteck 296


In [15]:
df_csv = df_csv[~df_csv['contents'].isnull()]
df_csv['contents'] = df_csv['contents'].apply(lambda x: x.replace('\n', ''))
df_csv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 477609 entries, 0 to 477611
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   main_title       477609 non-null  object
 1   file_name        477609 non-null  object
 2   page_num         477609 non-null  int64 
 3   shape_type       477609 non-null  object
 4   contents         477609 non-null  object
 5   shape_text_id    477609 non-null  int64 
 6   shape_text_name  477609 non-null  object
dtypes: int64(2), object(5)
memory usage: 29.2+ MB


In [16]:
df_csv.head()

Unnamed: 0,main_title,file_name,page_num,shape_type,contents,shape_text_id,shape_text_name
0,"iCert 2.0 Change Management - Design Sprint & SupportEnclosure 1 – Service Description to offer No. 10831 Munich, 20 th June 2022, UNITY AG",C:\Users\Kun\Desktop\Angebot\10831\Enclosure 1 – Service Description to offer No. 10831.pptx,1,"['PICTURE', 'FREEFORM', 'TEXT_BOX']",iCert 20 Change Management Design Sprint Support,5,Textfeld 4
1,"iCert 2.0 Change Management - Design Sprint & SupportEnclosure 1 – Service Description to offer No. 10831 Munich, 20 th June 2022, UNITY AG",C:\Users\Kun\Desktop\Angebot\10831\Enclosure 1 – Service Description to offer No. 10831.pptx,1,"['PICTURE', 'FREEFORM', 'TEXT_BOX']",Enclosure 1 Service Description to offer No 10831,5,Textfeld 4
2,"iCert 2.0 Change Management - Design Sprint & SupportEnclosure 1 – Service Description to offer No. 10831 Munich, 20 th June 2022, UNITY AG",C:\Users\Kun\Desktop\Angebot\10831\Enclosure 1 – Service Description to offer No. 10831.pptx,1,"['PICTURE', 'FREEFORM', 'TEXT_BOX']",Munich 20th June 2022 UNITY AG,5,Textfeld 4
3,"iCert 2.0 Change Management - Design Sprint & SupportEnclosure 1 – Service Description to offer No. 10831 Munich, 20 th June 2022, UNITY AG",C:\Users\Kun\Desktop\Angebot\10831\Enclosure 1 – Service Description to offer No. 10831.pptx,2,"['PLACEHOLDER', 'AUTO_SHAPE', 'FREEFORM', 'TABLE', 'GROUP', 'TEXT_BOX', 'EMBEDDED_OLE_OBJECT']",B Intranet,90,Rectangle: Rounded Corners 89
4,"iCert 2.0 Change Management - Design Sprint & SupportEnclosure 1 – Service Description to offer No. 10831 Munich, 20 th June 2022, UNITY AG",C:\Users\Kun\Desktop\Angebot\10831\Enclosure 1 – Service Description to offer No. 10831.pptx,2,"['PLACEHOLDER', 'AUTO_SHAPE', 'FREEFORM', 'TABLE', 'GROUP', 'TEXT_BOX', 'EMBEDDED_OLE_OBJECT']",Create content and setup concept structures,90,Rectangle: Rounded Corners 89


In [17]:
df_csv=df_csv.groupby(['file_name', 'page_num', 'shape_type'])['contents'].apply(lambda x: ' '.join(x)).reset_index()

# PPT To Image

In [18]:
import win32com
import win32com.client
from PIL import Image

def output_file(ppt_path, img_path):
    
    file_name = os.path.basename(ppt_path)  
    if file_name.endswith(('ppt', 'pptx')):
        exec_path = os.path.abspath(img_path)  # image output path
        ktr =  ppt_path.split('\\')[-2]
        exec_path = os.path.join(exec_path, ktr)
        name = file_name.split('.')[0]
        prefix_file_name = '_'.join([ktr, name])
        image_dir_path = os.path.join(exec_path, name)  # image dir path
        if not os.path.exists(image_dir_path):
            os.makedirs(image_dir_path)  

        return image_dir_path, prefix_file_name
    else:
        raise Exception('please check the file extension is ppt/pptx! \n')

def ppt2png(ppt_path, img_path, size=(960, 540)):
    """
    PPT convert to PNG
    :param ppt_path: ppt file path
    :param img_path: output image file path
    """
    if os.path.exists(ppt_path):
        output_path, prefix_file_name = output_file(ppt_path, img_path)  # file exist?
        print(output_path)

        ppt_app = win32com.client.Dispatch('PowerPoint.Application')
        ppt = ppt_app.Presentations.Open(ppt_path, WithWindow=False)
        
        for slide in ppt.Slides:
            if not slide.SlideShowTransition.Hidden:
                openPath = os.path.join(output_path, '_'.join([prefix_file_name, f'{slide.SlideIndex}.PNG']))
                # openPath = os.path.join(output_path, f'{slide.SlideIndex}.PNG')
                slide.Export(openPath, 
                             'PNG', 
                             size[0],
                             size[1]
                )
        # ppt.SaveAs(output_path, 18)  
        ppt_app.Quit()  # close resource
        
    else:
        raise Exception('please check the path \n')


for idx, ppt in enumerate(df_csv['file_name'].unique()):
    ppt2png(ppt, PPTtoImg_dir)


c:\Users\Kun\Desktop\transformer_imp\output\PPTtoImg\10831\Enclosure 1 – Service Description to offer No
c:\Users\Kun\Desktop\transformer_imp\output\PPTtoImg\11137\Enclosure 1 - Service Description to offer No
c:\Users\Kun\Desktop\transformer_imp\output\PPTtoImg\11149\Anlage1_Dialog@Cloud
c:\Users\Kun\Desktop\transformer_imp\output\PPTtoImg\11255\220204_UNITY_Impuls_Blanc_&_Fischer
c:\Users\Kun\Desktop\transformer_imp\output\PPTtoImg\11371\11371_Anlage_1
c:\Users\Kun\Desktop\transformer_imp\output\PPTtoImg\11453\Anlage_1_UC44 2021_V_2_0
c:\Users\Kun\Desktop\transformer_imp\output\PPTtoImg\11580\Franz_Haniel_Akquisition_Process_V_0_1
c:\Users\Kun\Desktop\transformer_imp\output\PPTtoImg\11594\Angebot_11594_eng
c:\Users\Kun\Desktop\transformer_imp\output\PPTtoImg\11610\Angebot_Sumitomo-Demag_Getting Agile and Lean_V_1_0
c:\Users\Kun\Desktop\transformer_imp\output\PPTtoImg\11637\Anlage_1_Leistungsbeschreibung_11637
c:\Users\Kun\Desktop\transformer_imp\output\PPTtoImg\11815\Anlage 1 Leistun

## alignment the number of pages from dataframe 

In [19]:
df_num = 0
img_num = 0

for file_path in df_csv['file_name'].unique():
    # from dataframe to get the all page number of the file
    df_spec = df_csv[df_csv['file_name'] == file_path]
    page_from_df = df_spec['page_num'].unique()
    page_from_df = ["%d.PNG"%page for page in page_from_df]
    df_num += len(page_from_df)

    # from PPTtoImg to get the all page number of the file
    file_name = os.path.basename(file_path)
    file_name = file_name.split('.')[0]
    ktr = file_path.split('\\')[-2]
    ktr_path = os.path.join(os.path.abspath(PPTtoImg_dir), ktr)
    exact_path = os.path.join(ktr_path, file_name)
    imgs = glob(os.path.join(exact_path, "*.PNG"))
    page_from_output = [os.path.basename(img).split('_')[-1] for img in imgs]
    img_num += len(page_from_output)

    # get removed file path
    diff = list(set(page_from_df).difference(set(page_from_output)))
    diff = [int(i.split('.')[0]) for i in diff]
    df_csv = df_csv[~((df_csv['file_name'] == file_path) & (df_csv['page_num'].isin(diff)))]
    # print(diff)
print(df_num, img_num)

31847 26521


In [20]:
df_csv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26468 entries, 0 to 31846
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   file_name   26468 non-null  object
 1   page_num    26468 non-null  int64 
 2   shape_type  26468 non-null  object
 3   contents    26468 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.0+ MB


In [21]:

df_csv.to_csv('alignment_pptx_list.csv', sep = ',', index=False, header=True, encoding='utf-8-sig')
print('Sales record successfully exported into csv File')

Sales record successfully exported into csv File


# Build DocArray structure

In [4]:
import pandas as pd
def convert_to_list(data):
    return ast.literal_eval(data)
# Since it take too much time on traversal ppt files, Easily to import the dataframe from the csv file
df_da = pd.read_csv('alignment_pptx_list.csv', converters={'shape_type': convert_to_list})
# for Mac OS
if not is_windows:
    df_da['file_name'] = df_da['file_name'].apply(lambda x: x.replace('C\\Users\\Kun\\Desktop\\Angebot', source_dir))
    df_da['file_name'] = df_da['file_name'].apply(lambda x: x.replace('\\', '/'))

df_da = df_da[~df_da['contents'].isnull()]
df_da.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26468 entries, 0 to 26467
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   file_name   26468 non-null  object
 1   page_num    26468 non-null  int64 
 2   shape_type  26468 non-null  object
 3   contents    26468 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.0+ MB


## Docarray implementation

In [2]:
from typing import List
from docarray import dataclass, Document, DocumentArray
from docarray.typing import Image, Text, JSON, URI

In [8]:
@dataclass
class MySlide:
    Content: Text #chunk
    ImgPath: Image #chunk
    ShapeType: List[str] #tag
    PageNum: int #tag

@dataclass
class MyPPT:
    FileName: Text
    Slide: List[MySlide]

group_slide = DocumentArray()
uni_path = df_da['file_name'].unique()

for path in uni_path:
    for idx, row in df_da[df_da['file_name'] == path].iterrows():
        # print(row['file_name'])
        exec_path = os.path.abspath(PPTtoImg_dir)  # image output path
        if is_windows:
            # for windows OS
            ktr = row['file_name'].split('\\')[-2]
        else:
            # for Mac OS
            ktr = row['file_name'].split('/')[-2]
        name = os.path.basename(row['file_name'])
        name = name.split('.')[0]
        name = '_'.join([ktr, name, "%d.PNG" % row['page_num']])
        imgpath = os.path.join(exec_path, name)
        # imgpath = os.path.join(image_dir_path, )
        m = MySlide(
                    Content = row['contents'],
                    ImgPath = imgpath,
                    ShapeType = row['shape_type'],
                    PageNum = row['page_num'],
                )
        
        group_slide.append(Document(m))
            
docPPT = Document(MyPPT(FileName = path, Slide = group_slide))

In [7]:
# embedding visualization
# import torchvision
# def preproc(I: Image):
#     return (I.set_image_tensor_shape((200, 200))  
#              .set_image_tensor_normalization() 
#              .set_image_tensor_channel_axis(-1, 0))
# docPPT.Slide['@.[ImgPath]'].apply(preproc)
# model = torchvision.models.resnet50(pretrained=True)  
# docPPT.Slide['@.[ImgPath]'].embed(model)
# docPPT.Slide['@.[ImgPath]'].plot_embeddings(image_sprites=True, channel_axis = 0, image_source='uri' )

In [7]:
# print(docPPT.FileName)
# print(docPPT.Slide[3].Content.text)
# print(docPPT.Slide[55].ImgPath.tensor)
# for i in docPPT.Slide['@.[Content][:1]']:
#     print(i.text)

# for i in docPPT.Slide['@c:2']:
#     print(i)

# Move slide images to destinated path

In [6]:
# import shutil
# def dst(path):
#     # all files and directories
#     file_list = os.listdir(path)
#     # is empty under the directory?
#     if file_list:
#         # traverse all files
#         for file in file_list:
#             path0 = path+os.sep+file
#             # is directory?
#             if os.path.isdir(path0):
#                 dst(path0)
#             else:
#                 print(f"{path0}")
#                 global n
#                 n+=1
#                 if file.find('.PNG') != -1:
#                     if is_windows:
#                         shutil.move(path0, 'C:\\Users\\Kun\\Desktop\\transformer_imp\\output\\PPTtoImg')
#                     else:
#                         shutil.move(path0, '/Users/kun-lin/Documents/slide-classification/output/PPTtoImg/')
#     else:
#         pass
# # what we would like to traverse
# if is_windows:
#     path = 'C:\\Users\\Kun\\Desktop\\transformer_imp\\output\\PPTtoImg'
# else:
#     path = '/Users/kun-lin/Documents/slide-classification/output/PPTtoImg'
# # path = PPTtoImg_dir
# n = 0
# dst(path)
# print(f"Total files: {n}")