## Qualitative data analysis with Microsoft Word comments & Python

#####  Instructions based on article - https://carstenknoch.com/2020/02/qualitative-data-analysis-with-microsoft-word-comments-python-updated/
###### Given a .docx file, extract a CSV list of all tagged (commented) text




In [None]:
# Import the necessary libraries.
# Install the Beautiful Soup library.
!pip install beautifulsoup4

# Install lxml
!pip install lxml

# Import Beautiful Soup plus the standard-library and Jupyter helpers used below.
from bs4 import BeautifulSoup
import zipfile
import csv
import re
from io import BytesIO
from IPython.display import display, FileLink
from ipywidgets import FileUpload
import os

# Optional - Ignore warnings.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Goal: given one or more .docx files, extract a CSV list of all tagged
# (commented) text.
# (This code was revised with ChatGPT's help while troubleshooting.)

# Step 1: show Jupyter's file-upload widget, restricted to .docx files and
# allowing several files to be picked at once.
# The files must be closed (e.g. not open in Word) before they can be selected.
upload = FileUpload(
    accept=".docx",
    multiple=True,
)
display(upload)

In [None]:
# Create dictionaries outside the loop to store data from all files
all_start_loc = {}
all_end_loc = {}

if upload.value:
    for filename, file_info in upload.value.items():
        print(f"Processing file: {filename}")
        content = BytesIO(file_info['content'])

        try:
            unzip = zipfile.ZipFile(content)

            # Extract comments.xml
            comments = BeautifulSoup(unzip.read('word/comments.xml'), 'lxml')

            # Extract document.xml
            doc = unzip.read('word/document.xml').decode()

            # Find comment start and end locations
            start_loc = {x.group(1): x.start() for x in re.finditer(r'<w:commentRangeStart.*?w:id="(.*?)"', doc)}
            end_loc = {x.group(1): x.end() for x in re.finditer(r'<w:commentRangeEnd.*?w:id="(.*?)".*?>', doc)}

            # Update the dictionaries for all files
            all_start_loc.update(start_loc)
            all_end_loc.update(end_loc)

        except zipfile.BadZipFile:
            print(f"Error: {filename} is not a valid zip file.")

In [None]:
# Get the absolute path of the directory
directory_path = os.path.abspath('/'.join(filename.split('/')[:-1]))

In [None]:
print (directory_path)

In [None]:
# Write to CSV using the dictionaries
csv_path = os.path.join(directory_path, 'responses.csv')

with open(csv_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
    csvw = csv.writer(csvfile)

    # Loop through comments using all_start_loc and all_end_loc
    for c_id, start in all_start_loc.items():
        end = all_end_loc[c_id]

        xml = re.sub(r'(<w:p .*?>)', r'\1 ', doc[start:end + 1])
        label_text = ''.join(BeautifulSoup(xml, 'lxml').findAll(text=True))
        comment_text = ''.join(comments.find(attrs={'w:id': c_id}).findAll(text=True))

        # Write to CSV
        csvw.writerow([comment_text, label_text])