# CSV EXTRACTION VIA ZIPFILE + ELEMENTTREE (NO PYTHON-DOCX)

In [30]:
import zipfile
import xml.etree.ElementTree as ET
import csv

# Fully-qualified WordprocessingML tag names (ElementTree's "{namespace}tag" form).
WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
PARA = WORD_NAMESPACE + 'p'
TEXT = WORD_NAMESPACE + 't'
TABLE = WORD_NAMESPACE + 'tbl'
ROW = WORD_NAMESPACE + 'tr'
CELL = WORD_NAMESPACE + 'tc'

# A .docx file is a zip archive; the main document body lives in word/document.xml.
# The handle is deliberately NOT named `docx`: that name is the python-docx module
# used by a later cell, and rebinding it here would break that cell on re-run.
with zipfile.ZipFile('reg3.docx') as archive:
    tree = ET.XML(archive.read('word/document.xml'))

# Write every table that contains text to its own CSV file (table1.csv, table2.csv, ...).
# NOTE(review): Element.iter() walks all descendants, so a table nested inside a cell
# is visited both as part of its parent and again as a table of its own.
table_count = 1  # Numbering only advances for tables that are actually written
for table in tree.iter(TABLE):
    table_data = []  # Rows (lists of cell strings) of the current table

    for row in table.iter(ROW):
        # Keep one entry per cell, including empty ones, so that values stay
        # in their original column instead of shifting left.
        row_data = []
        for cell in row.iter(CELL):
            # Join the runs of each paragraph, then separate paragraphs with a
            # newline so multi-paragraph cells are not fused into one word.
            paragraphs = (
                ''.join(node.text or '' for node in para.iter(TEXT))
                for para in cell.iter(PARA)
            )
            row_data.append('\n'.join(paragraphs).strip())
        if any(row_data):  # Skip rows in which every cell is empty
            table_data.append(row_data)

    # Save each table to a separate CSV file
    if table_data:  # Tables without any text are skipped entirely
        csv_filename = f'table{table_count}.csv'
        # Explicit UTF-8: the text may contain characters such as α, ± or °
        # that a platform-default code page cannot encode.
        with open(csv_filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerows(table_data)

        print(f"Table {table_count} data has been saved to {csv_filename}")
        table_count += 1  # Increment to get the next table number


Table 1 data has been saved to table1.csv
Table 2 data has been saved to table2.csv
Table 3 data has been saved to table3.csv
Table 4 data has been saved to table4.csv
Table 5 data has been saved to table5.csv


# CSV TABULAR FORMAT

In [31]:
import pandas as pd
from docx import Document

# Load the Word document
document = Document('reg3.docx')

# Get all the tables in the document
tables = document.tables

# Convert each table to a DataFrame (first row = header) and save it as CSV.
for table_index, table in enumerate(tables):
    data = []  # One dict per data row of the current table
    keys = []  # Unique column names derived from the first row

    for i, row in enumerate(table.rows):
        text = [cell.text.strip() for cell in row.cells]  # Strip extra spaces

        if i == 0:
            # First row is the header. Header texts can repeat (e.g. blank cells
            # or a heading spanning several columns); repeated dict keys would
            # silently overwrite each other, so later occurrences get a numeric
            # suffix: "X", "X.1", "X.2", ...
            seen = {}
            for name in text:
                occurrence = seen.get(name, 0)
                seen[name] = occurrence + 1
                keys.append(name if occurrence == 0 else f"{name}.{occurrence}")
            continue

        # Map column names to this row's values; zip() stops at the shorter
        # of the two, so extra cells beyond the header are ignored.
        row_data = dict(zip(keys, text))
        data.append(row_data)

    # Passing the header explicitly keeps the column order and preserves the
    # header even when the table has no data rows.
    df = pd.DataFrame(data, columns=keys)

    # Create a dynamic filename for each table (e.g., table1.csv, table2.csv, etc.)
    csv_filename = f'table{table_index + 1}.csv'

    # Save the DataFrame as a CSV file
    df.to_csv(csv_filename, index=False)

    # Print the DataFrame for this table
    print(f"DataFrame for Table {table_index + 1} saved as {csv_filename}")
    print(df)
    print("\n" + "-"*40 + "\n")


DataFrame for Table 1 saved as table1.csv
    E/ECE/324/Add.2/Rev.5−E/ECE/TRANS/505/Add.2/Rev.5
0                                         9 June 2020

----------------------------------------

DataFrame for Table 2 saved as table2.csv
        Class Angle of divergence\nα Illumination angles (in degrees)
0                                                           ±5°\n±20°
1      IA, IB             20'\n1°30'                         100\n2.5
2  IIIA, IIIB             20'\n1°30'                           150\n8

----------------------------------------

DataFrame for Table 3 saved as table3.csv
  Colour Angle of divergence\nα Illumination angles (in degrees)
0                                                         0\n±50
1  White             20'\n1°30'                          400\n15
2  Amber             20'\n1°30'                          250\n10
3    Red             20'\n1°30'                           100\n4

----------------------------------------

DataFrame for Table 4 saved as t

# JSON TABULAR FORMAT (JSON LINES: ONE RECORD PER LINE)

In [33]:
import pandas as pd
from docx import Document

# Load the Word document
document = Document('reg3.docx')

# Get all the tables in the document
tables = document.tables

# Convert each table to a DataFrame (first row = header) and save it as JSON Lines.
for table_index, table in enumerate(tables):
    data = []  # One dict per data row of the current table
    keys = []  # Unique column names derived from the first row

    for i, row in enumerate(table.rows):
        text = [cell.text.strip() for cell in row.cells]  # Strip extra spaces

        if i == 0:
            # First row is the header. Header texts can repeat (e.g. blank cells
            # or a heading spanning several columns); repeated dict keys would
            # silently overwrite each other, so later occurrences get a numeric
            # suffix: "X", "X.1", "X.2", ...
            seen = {}
            for name in text:
                occurrence = seen.get(name, 0)
                seen[name] = occurrence + 1
                keys.append(name if occurrence == 0 else f"{name}.{occurrence}")
            continue

        # Map column names to this row's values; zip() stops at the shorter
        # of the two, so extra cells beyond the header are ignored.
        row_data = dict(zip(keys, text))
        data.append(row_data)

    # Passing the header explicitly keeps the column order and preserves the
    # header even when the table has no data rows.
    df = pd.DataFrame(data, columns=keys)

    # Create a dynamic filename for each table (e.g., table1.json, table2.json, etc.)
    json_filename = f'table{table_index + 1}.json'

    # orient='records' with lines=True writes JSON Lines: one JSON object per
    # row, one per line (not a single JSON array).
    df.to_json(json_filename, orient='records', lines=True)

    # Print the DataFrame for this table
    print(f"DataFrame for Table {table_index + 1} saved as {json_filename}")
    print(df)
    print("\n" + "-"*40 + "\n")


DataFrame for Table 1 saved as table1.json
    E/ECE/324/Add.2/Rev.5−E/ECE/TRANS/505/Add.2/Rev.5
0                                         9 June 2020

----------------------------------------

DataFrame for Table 2 saved as table2.json
        Class Angle of divergence\nα Illumination angles (in degrees)
0                                                           ±5°\n±20°
1      IA, IB             20'\n1°30'                         100\n2.5
2  IIIA, IIIB             20'\n1°30'                           150\n8

----------------------------------------

DataFrame for Table 3 saved as table3.json
  Colour Angle of divergence\nα Illumination angles (in degrees)
0                                                         0\n±50
1  White             20'\n1°30'                          400\n15
2  Amber             20'\n1°30'                          250\n10
3    Red             20'\n1°30'                           100\n4

----------------------------------------

DataFrame for Table 4 saved a

# STRINGIFIED TABULAR DATA

In [32]:
import pandas as pd
from docx import Document

def df_to_string(df):
    """Render a DataFrame as plain text, leaving out the index column.

    Args:
        df: The pandas DataFrame to render.

    Returns:
        The text produced by ``DataFrame.to_string(index=False)``.
    """
    rendered = df.to_string(index=False)
    return rendered

# Load the Word document
document = Document('reg3.docx')

# Get all the tables in the document
tables = document.tables

# Convert each table to a DataFrame (first row = header) and print it as text.
for table_index, table in enumerate(tables):
    data = []  # One dict per data row of the current table
    keys = []  # Unique column names derived from the first row

    for i, row in enumerate(table.rows):
        text = [cell.text.strip() for cell in row.cells]  # Strip extra spaces

        if i == 0:
            # First row is the header. Header texts can repeat (e.g. blank cells
            # or a heading spanning several columns); repeated dict keys would
            # silently overwrite each other, so later occurrences get a numeric
            # suffix: "X", "X.1", "X.2", ...
            seen = {}
            for name in text:
                occurrence = seen.get(name, 0)
                seen[name] = occurrence + 1
                keys.append(name if occurrence == 0 else f"{name}.{occurrence}")
            continue

        # Map column names to this row's values; zip() stops at the shorter
        # of the two, so extra cells beyond the header are ignored.
        row_data = dict(zip(keys, text))
        data.append(row_data)

    # Passing the header explicitly keeps the column order and preserves the
    # header even when the table has no data rows.
    df = pd.DataFrame(data, columns=keys)

    # Convert the DataFrame to a string
    df_string = df_to_string(df)

    # Print the stringified DataFrame for the current table
    print(f"Stringified DataFrame for Table {table_index + 1}:")
    print(df_string)
    print("\n" + "-"*40 + "\n")


Stringified DataFrame for Table 1:
 E/ECE/324/Add.2/Rev.5−E/ECE/TRANS/505/Add.2/Rev.5
                                       9 June 2020

----------------------------------------

Stringified DataFrame for Table 2:
     Class Angle of divergence\nα Illumination angles (in degrees)
                                                         ±5°\n±20°
    IA, IB             20'\n1°30'                         100\n2.5
IIIA, IIIB             20'\n1°30'                           150\n8

----------------------------------------

Stringified DataFrame for Table 3:
Colour Angle of divergence\nα Illumination angles (in degrees)
                                                        0\n±50
 White             20'\n1°30'                          400\n15
 Amber             20'\n1°30'                          250\n10
   Red             20'\n1°30'                           100\n4

----------------------------------------

Stringified DataFrame for Table 4:
      Number of annex   Number of paragraph   

# TEXT ONLY EXTRACTION

In [38]:
import docx

def saveText(filename, output_txt):
    """Write the text of every body paragraph of a Word document to a text file.

    Args:
        filename: Path of the .docx file to read.
        output_txt: Path of the UTF-8 text file to create or overwrite.

    Only ``doc.paragraphs`` is iterated; each paragraph's text is written
    followed by a newline.
    """
    # Parse the document before opening the output, so a failed load does not
    # leave an empty output file behind.
    source_doc = docx.Document(filename)
    with open(output_txt, 'w', encoding='utf-8') as out:
        out.writelines(paragraph.text + '\n' for paragraph in source_doc.paragraphs)


In [None]:
# Example usage: write the paragraph text of 'reg6.docx' to 'output.txt'.
# NOTE(review): the other cells in this notebook read 'reg3.docx' — confirm
# that 'reg6.docx' is the intended input here.
saveText('reg6.docx', 'output.txt')