This tutorial will teach you how to transform a docx into a pandas dataframe and then save it as a CSV file.

Prerequisites:
* you will need Python installed on your computer
* make sure that your file is a .docx and not a .doc

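## Import the required Python packages

The cell below only imports the packages. `lxml` and `pandas` are third-party libraries and must already be installed (for example with `pip install lxml pandas`).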

In [1]:
#specific to extracting information from word documents
import os
import zipfile

#other tools useful in extracting the information from our document
import copy
import re

#to read XML and JSON
from lxml import etree
import json

#to use dataframes
import pandas as pd

## Anatomy of a .docx file

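What is a docx file? It is in fact a zip archive of XML files!
If you want to look inside it yourself, you can do the following (this is optional: the code below reads the archive directly):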

- Rename the file extension from .docx to .zip 
- Unzip the newly renamed file

In [2]:
# Alternative sample documents (uncomment exactly one `file` line).
#file = "burmese/burmese_selection"
#file = "turkish/turkish_sample"
file = "india/india_sample"

# `file` is the path without extension; the same stem is reused for the CSV export.
docxFileName = file+".docx"

# A .docx is a zip archive. Read the XML parts we need inside a `with` block so
# the archive's file handle is closed as soon as the bytes are in memory.
with zipfile.ZipFile(docxFileName) as docxZip:
    documentXML = docxZip.read('word/document.xml')
    # Read for completeness; no visible cell uses the styles part.
    stylesXML = docxZip.read('word/styles.xml')

# Parsed document body: the XPath queries in this notebook run against this tree.
et = etree.XML(documentXML)
# Prefix -> namespace URI map so XPath expressions can use the `w:` prefix.
ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

If you want to see which XML files are inside your docx file, you can call `docxZip.namelist()`:

In [3]:
#docxZip.namelist()

In [4]:
# XPath expressions used to pull each field out of a paragraph (w:p).
# They are all relative ('./...'), i.e. meant to be evaluated on one paragraph.
#
# Fields of interest:
#   Title, ReferenceOld, ReferenceNew, Item date,
#   Content description, Physical description, Creator, Language
# (no expression is defined here for Creator or Language)

# Paragraph-level styles: the style sits in w:pPr/w:pStyle, and the text is
# taken from the runs that follow the paragraph properties.
title_xpath = './w:pPr[w:pStyle[@w:val="Title"]]/following-sibling::w:r/w:t'
ContentDescription_xpath = './w:pPr[w:pStyle[@w:val="ContentDescription"]]/following-sibling::w:r/w:t'

# Run-level (character) styles: the style sits in w:rPr/w:rStyle of the run
# that holds the text.
ReferenceOld_xpath = './w:r[w:rPr[w:rStyle[@w:val="ReferenceOld"]]]/w:t'
ReferenceNew_xpath = './w:r[w:rPr[w:rStyle[@w:val="ReferenceNew"]]]/w:t'
date_xpath = './w:r[w:rPr[w:rStyle[@w:val="ItemDate"]]][w:t]/w:t'
PhysicalDescription_xpath = './w:r[w:rPr[w:rStyle[@w:val="PhysicalDescription"]]][w:t]/w:t'

# Every text node below any run of the paragraph, whatever its style.
p = './w:r//w:t'

In [5]:
# Start from an empty frame; it is replaced once the dictionaries are filled.
table = pd.DataFrame()

# One accumulator per output column. Each one maps a record key to the text
# collected for that column.
ReferenceOld_dic, ReferenceNew_dic = {}, {}
title_dic, date_dic = {}, {}
ContentDescription_dic, PhysicalDescription_dic = {}, {}

# Running values carried from one paragraph to the next while parsing.
# The two references start as NaN, the text fields as empty strings.
ReferenceOld, ReferenceNew = float('NaN'), float('NaN')
title = PhysicalDescription = ContentDescription = date = ''

def populate_series(key, value, dictionary, mode):
    """Record ``value`` under ``key`` in ``dictionary``.

    Parameters
    ----------
    key : hashable
        Record identifier used as the dictionary key.
    value : str or float
        Text extracted for the current paragraph (may be '' or NaN).
    dictionary : dict
        Accumulator for one column; it is modified in place.
    mode : str
        'content' accumulates text: a value not already contained in the
        stored text is appended on a new line. Any other mode (the notebook
        uses "ref") only stores the first value seen for ``key``.

    Returns
    -------
    tuple
        ``(dictionary, value)``. In 'content' mode the returned value is
        always '' because the text has been consumed. In other modes the
        value is returned unchanged.
    """
    if key not in dictionary:
        dictionary[key] = value
    if mode == 'content':
        # NOTE(review): this is a substring test, so a new fragment that happens
        # to be contained in the text already stored is skipped as well.
        if value not in dictionary[key]:
            dictionary[key] += "\n" + value
        # Always clear consumed text. Previously it was only cleared on the
        # append path, so a value stored by the first-insert branch above was
        # carried over and written into the next record when the key changed.
        value = ''
    return dictionary, value
    

def get_info(para, xpath, field, namespaces=None):
    """Return the text matched by ``xpath`` in ``para``, or ``field`` if nothing matches.

    Parameters
    ----------
    para : element
        Object exposing ``xpath(expression, namespaces=...)``; the notebook
        passes paragraph elements.
    xpath : str
        XPath expression selecting the text nodes of interest.
    field : str or float
        Current value of the field; returned unchanged when there is no match.
    namespaces : dict, optional
        Prefix -> URI map for the query. Defaults to the notebook-level ``ns``.

    Returns
    -------
    str or float
        The stripped texts of all matched nodes joined by single spaces, or
        ``field`` when the expression matches nothing.
    """
    if namespaces is None:
        namespaces = ns
    # Evaluate the expression once and reuse the result.
    nodes = para.xpath(xpath, namespaces=namespaces)
    if nodes:
        # A matched node can have no text at all (``text`` is None); treat it as ''.
        field = " ".join((node.text or '').strip() for node in nodes).strip('\n')
    return field



# Walk through the paragraphs of the document in order. Every field keeps its
# previous value when the current paragraph does not match its XPath, so
# ReferenceOld stays the key of the current record until a paragraph with a
# new old reference is met.
##TODO: here we could add some constraints in case there is just an old or a new reference (or we add it to the guidelines)
for para in et.xpath('//w:p', namespaces=ns):

    # The old reference goes first: it is the key under which every other
    # field of the record is stored.
    ReferenceOld = get_info(para, ReferenceOld_xpath, ReferenceOld)
    ReferenceOld_dic, ReferenceOld = populate_series(ReferenceOld, ReferenceOld, ReferenceOld_dic, "ref")

    # New reference: only the first value per record is kept ("ref" mode).
    ReferenceNew = get_info(para, ReferenceNew_xpath, ReferenceNew)
    ReferenceNew_dic, ReferenceNew = populate_series(ReferenceOld, ReferenceNew, ReferenceNew_dic, "ref")

    # Text fields: successive fragments are accumulated ("content" mode).
    title = get_info(para, title_xpath, title)
    title_dic, title = populate_series(ReferenceOld, title, title_dic, "content")

    date = get_info(para, date_xpath, date)
    date_dic, date = populate_series(ReferenceOld, date, date_dic, "content")

    ContentDescription = get_info(para, ContentDescription_xpath, ContentDescription)
    ContentDescription_dic, ContentDescription = populate_series(ReferenceOld, ContentDescription, ContentDescription_dic, "content")

    PhysicalDescription = get_info(para, PhysicalDescription_xpath, PhysicalDescription)
    PhysicalDescription_dic, PhysicalDescription = populate_series(ReferenceOld, PhysicalDescription, PhysicalDescription_dic, "content")

            

# Assemble the per-column dictionaries into one frame. The dictionary keys
# become the row index, so the columns are aligned on the record key.
table = pd.DataFrame.from_dict({
    'ReferenceOld': pd.Series(ReferenceOld_dic),
    'ReferenceNew': pd.Series(ReferenceNew_dic),
    'Title': pd.Series(title_dic),
    'PhysicalDescription': pd.Series(PhysicalDescription_dic),
    'ContentDescription': pd.Series(ContentDescription_dic),
    'Item date': pd.Series(date_dic),
})


def _strip_newlines(cell):
    """Remove leading/trailing newlines from string cells; leave other values alone."""
    return cell.strip('\n') if isinstance(cell, str) else cell


# DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1).
# Use the new name when the installed pandas has it, the old one otherwise.
if hasattr(pd.DataFrame, 'map'):
    table = table.map(_strip_newlines)
else:
    table = table.applymap(_strip_newlines)

# Fix the column order. 'Language' and 'Creator' are not extracted above, so
# reindexing adds them as empty columns.
header_list = ['ReferenceOld','ReferenceNew', 'Title', 'PhysicalDescription','ContentDescription', 'Item date','Language','Creator']
table = table.reindex(columns = header_list)
# Drop rows that have neither an old nor a new reference.
table = table.dropna(how='all', subset=['ReferenceOld', 'ReferenceNew'])

# Written next to the source document, with the same stem as the .docx.
table.to_csv(file+".csv", encoding='utf-8-sig')

table

Unnamed: 0,ReferenceOld,ReferenceNew,Title,PhysicalDescription,ContentDescription,Item date,Language,Creator
1,1,Mss. Eur. G. 1 .,"[“ Rec d from Exam rs Office"" 5 Oct. 1814.]\...","50 x 30 cm, pp. 264.","There is no general title, but the manuscript ...",1914 and 1919.,,
2,2,Mss. Eur. D. 2,A Decree of the Holy Congregation Generall for...,"30 x 18.5 cm, ' Foll . 3.","The watermarks are (a) Arms, Quarterly: 1st an...",,,
3,3,MSS. Eur. G. 2 .,Peticion of ye East India Company.,"44 x 32' cm. One sheet, framed and hung in the...",This document is reproduced (actual size) in R...,,,
4,4,MSS.Eur . D.3 .,"[Purchased 14 July 1916.]\n[ Batavia's , Statu...","33 x '21 cm. pp., vi, 200","This volume is lettered "" Batasia's Statut B...",,,
5,5,MSS Eur F.1,[JOSIAH WEBBE?],39 x 25 cm. pp. 126.,"Verbael , uijt afgesonden en aengekomen brie...",,,


In [None]:
# Inspect the accumulator that maps each record key to its new reference.
print(ReferenceNew_dic)

In [None]:
# Debug aid: print the old reference of every paragraph that carries one.
# Note that this rebinds the notebook-level name ReferenceOld.
for para in et.xpath('//w:p', namespaces=ns):
    # Evaluate the XPath once and reuse the result for both the test and the text.
    reference_nodes = para.xpath(ReferenceOld_xpath, namespaces=ns)
    if reference_nodes:
        # A matched node may have no text (``text`` is None); treat it as ''.
        ReferenceOld = " ".join((node.text or '').strip() for node in reference_nodes).strip('\n')
        print(ReferenceOld)

In [None]:
# Find where each record starts: the position (within the document-wide list
# of paragraphs) of every paragraph that contains an old reference.
cat_idx_pos = [
    position
    for position, paragraph in enumerate(et.xpath('//w:p', namespaces=ns))
    if paragraph.xpath(ReferenceOld_xpath, namespaces=ns)
]
print(cat_idx_pos)


In [None]:
# Create, for every record, the list of paragraph positions that belong to it.
# A record runs from its own start position up to (not including) the start of
# the next record. The last record runs to the end of the document; it was
# previously left out because it has no following start position.
paragraph_count = len(et.xpath('//w:p', namespaces=ns))
record_ends = list(cat_idx_pos[1:]) + [paragraph_count]

records = [
    list(range(start, end))
    for start, end in zip(cat_idx_pos, record_ends)
]
print(records)



In [None]:
# Create the catalogue: one <record> element per entry of `records`, holding a
# copy of every paragraph that belongs to that record.

# Query the document once; `records` holds positions in this list.
paragraphs = et.xpath('//w:p', namespaces=ns)

catalogue_record_list = []
for paragraph_positions in records:
    # One element per record. Creating it per paragraph, as before, kept only
    # the last paragraph of each record.
    catalogue_record = etree.Element("record")
    for ix in paragraph_positions:
        # Append a deep copy: appending the original node would move it out of
        # the document tree.
        catalogue_record.append(copy.deepcopy(paragraphs[ix]))
    catalogue_record_list.append(catalogue_record)

len(catalogue_record_list)

In [None]:

# Not read by any visible cell; kept in case other code expects the name.
table_turkish = pd.DataFrame()

# Replacement for line breaks inside a description: a newline followed by 16 spaces.
pilcrow = "\n" + " " * 16

# Build one row per paragraph of every catalogue record.
# catalogue_record_list is iterated as-is: rebinding it to its first element
# (as was done before) destroyed the list, made the cell impossible to re-run
# and left nothing for './w:p' to match.
rows = []
for cat_rec in catalogue_record_list:
    for para in cat_rec.xpath('./w:p', namespaces=ns):

        # Single-valued fields: stripped text of the first match, '' if none.
        # `node.text or ''` guards against matched nodes without text.
        nodes = para.xpath(ReferenceOld_xpath, namespaces=ns)
        ReferenceOld = (nodes[0].text or '').strip() if nodes else ''

        nodes = para.xpath(title_xpath, namespaces=ns)
        title = (nodes[0].text or '').strip() if nodes else ''

        nodes = para.xpath(date_xpath, namespaces=ns)
        date = (nodes[0].text or '').strip() if nodes else ''

        # Multi-run fields: stripped text of every match, joined by spaces.
        nodes = para.xpath(PhysicalDescription_xpath, namespaces=ns)
        PhysicalDescription = " ".join((node.text or '').strip() for node in nodes).strip(' \n')

        nodes = para.xpath(ContentDescription_xpath, namespaces=ns)
        description = " ".join((node.text or '').strip() for node in nodes).strip(' \n')
        description = re.sub(r"[\r\n]+", pilcrow, description)

        rows.append({'ReferenceOld': ReferenceOld, 'title': title, 'PhysicalDescription': PhysicalDescription, 'description': description, 'date': date})

# Create the frame once from the collected rows: DataFrame.append was removed
# in pandas 2.0, and growing a frame row by row is quadratic anyway.
table = pd.DataFrame(rows, columns=['ReferenceOld', 'title', 'PhysicalDescription', 'description', 'date'])
# Write the file once, after all rows are known, instead of once per row.
table.to_csv('test.csv', encoding="utf-8")

In [None]:
# A different approach: collect the fields of each paragraph in a dictionary.
# NOTE(review): the original created an empty `catalogue` per paragraph and
# never filled it; filling it and collecting the dictionaries in `catalogues`
# is an assumption about the intent - confirm.

# Replacement for line breaks inside a description: a newline followed by 16 spaces.
pilcrow = "\n" + " " * 16

# catalogue_record_list is iterated as-is. Rebinding it to its first element,
# as was done before, destroyed the list of records and broke re-runs.
catalogues = []
for cat_rec in catalogue_record_list:
    for para in cat_rec.xpath('./w:p', namespaces=ns):

        # Single-valued fields: stripped text of the first match, '' if none.
        # `node.text or ''` guards against matched nodes without text.
        nodes = para.xpath(ReferenceOld_xpath, namespaces=ns)
        ReferenceOld = (nodes[0].text or '').strip() if nodes else ''

        nodes = para.xpath(title_xpath, namespaces=ns)
        title = (nodes[0].text or '').strip() if nodes else ''

        nodes = para.xpath(date_xpath, namespaces=ns)
        date = (nodes[0].text or '').strip() if nodes else ''

        # Multi-run fields: stripped text of every match, joined by spaces.
        nodes = para.xpath(PhysicalDescription_xpath, namespaces=ns)
        PhysicalDescription = " ".join((node.text or '').strip() for node in nodes).strip(' \n')

        nodes = para.xpath(ContentDescription_xpath, namespaces=ns)
        description = " ".join((node.text or '').strip() for node in nodes).strip(' \n')
        description = re.sub(r"[\r\n]+", pilcrow, description)

        catalogue = {
            'ReferenceOld': ReferenceOld,
            'title': title,
            'PhysicalDescription': PhysicalDescription,
            'description': description,
            'date': date,
        }
        catalogues.append(catalogue)

In [None]:
# Print the text of every w:t node found under w:p/w:r in the first entry of
# catalogue_record_list, one node per line.
first_entry = catalogue_record_list[0]
text_nodes = first_entry.xpath('w:p/w:r/w:t', namespaces=ns)
for text_node in text_nodes:
    print(text_node.text)

In [None]:
# Show the run elements (w:r) found under w:p in the first entry of catalogue_record_list.
catalogue_record_list[0].xpath('w:p/w:r', namespaces=ns)

In [None]:
et.xpath('//w:p', namespaces=ns