In [1]:
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Emu
import zipfile
import os

#### Clearing functions : 

In [127]:
def clear_header(doc):
    for section in doc.sections:
        header = section.header
        for paragraph in header.paragraphs:
            paragraph.clear()

            
def clear_footer(doc):
    for section in doc.sections:
        footer = section.footer
        for element in list(footer._element):
            footer._element.remove(element)

#### Unzipping and Reading : 

In [4]:
def read_xml(docx_path,file):
    with zipfile.ZipFile(docx_path, 'r') as zip_ref:
        all_files = zip_ref.namelist()
        header_file = next((s for s in all_files if file in s), None)
        if header_file:
            with zip_ref.open(header_file) as file:
                return file.read().decode('utf-8')
        else:
            print(f"Pas de {file} trouvé")
            return None

In [17]:
import zipfile
import re

def read_xmls(docx_path, file_type):
    if file_type not in ["header", "footer"]:
        print("Invalid file type, 'header' or 'footer'.")
        return None

    file_pattern = re.compile(rf'word/{file_type}\d*\.xml')
    contents = {}

    with zipfile.ZipFile(docx_path, 'r') as zip_ref:
        all_files = zip_ref.namelist()
        relevant_files = filter(file_pattern.match, all_files)

        for file_name in relevant_files:
            with zip_ref.open(file_name) as file:
                content = file.read().decode('utf-8')
                contents[file_name] = content

    return contents

In [21]:
docx_path = "test.docx"
headers = read_xmls(docx_path, "header")
footers = read_xmls(docx_path, "footer")

In [28]:
headers

{'word/header1.xml': '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\r\n<w:hdr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.

#### Reading information related to picture/text : 

 ##### Picture info : 

In [50]:
import xml.etree.ElementTree as ET

def extract_img(xml_text):
    namespaces = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
        'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
        'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
        'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
        'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
    }

    tree = ET.ElementTree(ET.fromstring(xml_text))
    root = tree.getroot()
    
    images_info = []
    drawings = root.findall('.//w:drawing', namespaces)
    
    for drawing in drawings:
        anchors = drawing.findall('.//wp:anchor', namespaces)
        inlines = drawing.findall('.//wp:inline', namespaces)
        
        for anchor in anchors + inlines:
            positionH = anchor.find('.//wp:positionH', namespaces)
            posH_offset = positionH.find('.//wp:posOffset', namespa=ces).text if positionH is not None else None
            positionV = anchor.find('.//wp:positionV', namespaces)
            posV_offset = positionV.find('.//wp:posOffset', namespaces).text if positionV is not None else None

            image_info = {
                'name': anchor.find('.//pic:cNvPr', namespaces).get('name'),
                'cx': anchor.find('.//wp:extent', namespaces).get('cx'),
                'cy': anchor.find('.//wp:extent', namespaces).get('cy'),
                'position_horizontal': posH_offset,
                'position_vertical': posV_offset
            }
            images_info.append(image_info)
    
    return images_info

In [53]:
xml_t = read_xml("test.docx","header")
h=extract_img(xml_t['word/header1.xml'])
h

[{'name': 'Image 1',
  'cx': '1371600',
  'cy': '660400',
  'position_horizontal': None,
  'position_vertical': None}]

In [64]:
import xml.etree.ElementTree as ET

def extract_img(xml_text):
    namespaces = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
        'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
        'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
        'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
        'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
    }

    tree = ET.ElementTree(ET.fromstring(xml_text))
    root = tree.getroot()
    
    images_info = []
    drawings = root.findall('.//w:drawing', namespaces)
    
    for drawing in drawings:
        anchors = drawing.findall('.//wp:anchor', namespaces)
        inlines = drawing.findall('.//wp:inline', namespaces)
        
        for item in anchors + inlines:
            positionH = item.find('.//wp:positionH', namespaces)
            posH_offset = positionH.find('.//wp:posOffset', namespaces).text if positionH is not None else None
            positionV = item.find('.//wp:positionV', namespaces)
            posV_offset = positionV.find('.//wp:posOffset', namespaces).text if positionV is not None else None

            is_inline = item.tag.endswith('inline')
            if is_inline:
                default_horizontal_position = 0
                default_vertical_position = 0

                image_info = {
                    'name': item.find('.//pic:cNvPr', namespaces).get('name'),
                    'cx': item.find('.//wp:extent', namespaces).get('cx'),
                    'cy': item.find('.//wp:extent', namespaces).get('cy'),
                    'position_horizontal': default_horizontal_position,
                    'position_vertical': default_vertical_position,
                    'is_inline': is_inline
                }
            else:
                image_info = {
                    'name': item.find('.//pic:cNvPr', namespaces).get('name'),
                    'cx': item.find('.//wp:extent', namespaces).get('cx'),
                    'cy': item.find('.//wp:extent', namespaces).get('cy'),
                    'position_horizontal': posH_offset,
                    'position_vertical': posV_offset,
                    'is_inline': is_inline
                }
            images_info.append(image_info)
    
    return images_info


xml_t = read_xml("template.docx","header")
h=extract_img(xml_t['word/header1.xml'])
h

[{'name': 'Image 4',
  'cx': '607695',
  'cy': '609600',
  'position_horizontal': '2707005',
  'position_vertical': '-271780',
  'is_inline': False},
 {'name': 'Image 2',
  'cx': '571500',
  'cy': '571500',
  'position_horizontal': '5780405',
  'position_vertical': '-233045',
  'is_inline': False},
 {'name': 'Image 1',
  'cx': '1371600',
  'cy': '660400',
  'position_horizontal': '-569595',
  'position_vertical': '-321945',
  'is_inline': False}]

In [202]:
xml_t=read_xml("new_template.docx","header")
print(extract_img(xml_t['word/header2.xml']), "\n\n", doc_width("new_template.docx"))

[{'name': 'Image 3', 'cx': '863600', 'cy': '415290', 'position_horizontal': '5653405', 'position_vertical': '-230505', 'is_inline': False}, {'name': 'Image 2', 'cx': '1371600', 'cy': '660400', 'position_horizontal': '1843405', 'position_vertical': '-334645', 'is_inline': False}, {'name': 'Image 1', 'cx': '657451', 'cy': '659408', 'position_horizontal': '-798195', 'position_vertical': '-334645', 'is_inline': False}] 

 7560310


In [None]:
images_info = [{'name': 'image4',
  'cx': '607695',
  'cy': '609600',
  'position_horizontal': '2707005',
  'position_vertical': '-271780',
  'is_inline': False},
 {'name': 'image2',
  'cx': '571500',
  'cy': '571500',
  'position_horizontal': '5780405',
  'position_vertical': '-233045',
  'is_inline': False},
 {'name': 'image3',
  'cx': '1371600',
  'cy': '660400',
  'position_horizontal': '-569595',
  'position_vertical': '-321945',
  'is_inline': False}] 

# Picture Insertion (Table + Works) #

### 1 Inserter ##

In [203]:
from docx import Document
from docx.shared import Inches

def clear_header(doc):
    for section in doc.sections:
        header = section.header
        for paragraph in header.paragraphs:
            paragraph.clear()

def add_image_table(source_doc_path, image_path, image_cell_index, width=None, height=None):
    doc = Document(source_doc_path)
    header = doc.sections[0].header
    clear_header(doc)

    table = header.add_table(1, 6, Inches(6))
    cell = table.cell(0, image_cell_index)
    paragraph = cell.paragraphs[0] if cell.paragraphs else cell.add_paragraph()
    run = paragraph.add_run()
    run.add_picture(image_path, width=width, height=height)

    return doc

source_doc = "test.docx"
doc = Document(source_doc)
doc = add_image_table(source_doc, "infi.jpeg", 1, width=Inches(1))
doc.save("z.docx")

In [181]:
# Positioner

In [None]:
image_info=[{'name': 'Image 4',
  'cx': '607695',
  'cy': '609600',
  'position_horizontal': '2707005',
  'position_vertical': '-271780',
  'is_inline': False},
 {'name': 'Image 2',
  'cx': '571500',
  'cy': '571500',
  'position_horizontal': '5780405',
  'position_ver  a tical': '-233045',
  'is_inline': False},
 {'name': 'Image 1',
  'cx': '1371600',
  'cy': '660400',
  'position_horizontal': '-569595',
  'position_vertical': '-321945',
  'is_inline': False}]

In [190]:
from docx import Document
def dimension(source_path):
    doc = Document(source_path)
    section = doc.sections[0]
    L = section.page_width
    width

7560310

In [239]:
image_info=[{'name': 'Image 4',
  'cx': '607695',
  'cy': '609600',
  'position_horizontal': '2707005',
  'position_vertical': '-271780',
  'is_inline': False},
 {'name': 'Image 2',
  'cx': '571500',
  'cy': '571500',
  'position_horizontal': '5780405',
  'position_ver  a tical': '-233045',
  'is_inline': False},
 {'name': 'Image 1',
  'cx': '1371600',
  'cy': '660400',
  'position_horizontal': '-569595',
  'position_vertical': '-321945',
  'is_inline': False}]
document_width = 7560310
cell_divider(image_info, document_width)

[{'name': 'Image 4',
  'cx': '607695',
  'cy': '609600',
  'position_horizontal': '2707005',
  'position_vertical': '-271780',
  'is_inline': False,
  'cell': 3},
 {'name': 'Image 2',
  'cx': '571500',
  'cy': '571500',
  'position_horizontal': '5780405',
  'position_ver  a tical': '-233045',
  'is_inline': False,
  'cell': 6},
 {'name': 'Image 1',
  'cx': '1371600',
  'cy': '660400',
  'position_horizontal': '-569595',
  'position_vertical': '-321945',
  'is_inline': False,
  'cell': 1}]

In [194]:
def doc_width(source_doc):
    doc=Document(source_doc)
    section=doc.sections[0]
    L=section.page_width
    return L

In [208]:
images_info

[{'name': 'image4',
  'cx': '607695',
  'cy': '609600',
  'position_horizontal': '2707005',
  'position_vertical': '-271780'},
 {'name': 'image2',
  'cx': '571500',
  'cy': '571500',
  'position_horizontal': '5780405',
  'position_vertical': '-233045'},
 {'name': 'image3',
  'cx': '1371600',
  'cy': '660400',
  'position_horizontal': '-569595',
  'position_vertical': '-321945'}]

In [211]:
k=read_xmls("new.docx","header")
k

{'word/header3.xml': '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\r\n<w:hdr xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.

In [224]:
extract_img(k["word/header2.xml"])

[{'name': 'infi.jpeg',
  'cx': '584200',
  'cy': '584200',
  'position_horizontal': '6286500',
  'position_vertical': '194945',
  'is_inline': False}]

In [218]:
doc_width("new.docx")

7560310