In [1]:
import csv
import os
import re
import zipfile
from dataclasses import dataclass

import ipywidgets as widgets
from bs4 import BeautifulSoup as Soup
from IPython.display import display

In [2]:
@dataclass
class WordDocScraper:
    """Extract comments, and the document text they annotate, from a .docx file.

    Attributes:
        path: Filesystem path of the .docx (zip) archive to read.
        comments: ``None`` until ``extract_comments`` has run; afterwards a
            list of ``(comment_text, label_text)`` tuples.
    """

    path: str
    comments: list = None

    def extract_comments(self):
        """Populate ``self.comments`` with ``(comment_text, label_text)`` pairs.

        ``comment_text`` is the concatenated text of one ``w:comment`` element
        in ``word/comments.xml``. ``label_text`` is the text found in
        ``word/document.xml`` between the ``w:commentRangeStart`` and
        ``w:commentRangeEnd`` markers carrying the same ``w:id``; it is an
        empty string when either marker is absent.

        If the archive has no ``word/comments.xml`` part, ``self.comments`` is
        set to an empty list.

        Raises:
            zipfile.BadZipFile: If ``self.path`` is not a valid zip archive.
            KeyError: If the archive has comments but no ``word/document.xml``.
        """
        # The context manager closes the archive even when reading fails.
        with zipfile.ZipFile(self.path) as archive:
            if 'word/comments.xml' not in archive.namelist():
                # No comments part means there is nothing to extract.
                self.comments = []
                return
            comments_xml = archive.read('word/comments.xml')
            doc = archive.read('word/document.xml').decode()

        comments_soup = Soup(comments_xml, 'lxml')

        # Map each comment id to the character offsets of its range markers
        # inside the raw document XML.
        start_loc = {x.group(1): x.start() for x in re.finditer(r'<w:commentRangeStart.*?w:id="(.*?)"', doc)}
        end_loc = {x.group(1): x.end() for x in re.finditer(r'<w:commentRangeEnd.*?w:id="(.*?)".*?>', doc)}

        comments_list = []
        for c in comments_soup.find_all('w:comment'):
            c_id = c.attrs['w:id']
            comment_text = ''.join(c.find_all(string=True))

            start = start_loc.get(c_id)
            end = end_loc.get(c_id)
            if start is None or end is None:
                # The comment has no complete range in the document body.
                label_text = ''
            else:
                # match.end() is already exclusive, so slicing to `end` stops
                # right after the closing '>' of the range-end tag.
                # A space is inserted after every paragraph-opening tag so the
                # text of adjacent paragraphs does not run together.
                xml = re.sub(r'(<w:p .*?>)', r'\1 ', doc[start:end])
                label_text = ''.join(Soup(xml, 'lxml').find_all(string=True))

            comments_list.append((comment_text, label_text))

        self.comments = comments_list

def main(file_paths, output_path='output.csv'):
    """Write the comments of every document in ``file_paths`` to a CSV file.

    The file is opened with the ``utf-8-sig`` encoding and starts with the
    header row ``Filename, Comment Text, Label Text``; one row follows for
    each ``(comment, label)`` pair produced by ``WordDocScraper``.

    Args:
        file_paths: Iterable of paths to .docx files.
        output_path: Destination CSV path. Defaults to ``'output.csv'`` in the
            current working directory.
    """
    with open(output_path, 'w', newline='', encoding='utf-8-sig') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(['Filename', 'Comment Text', 'Label Text'])

        for path in file_paths:
            scraper = WordDocScraper(path)
            scraper.extract_comments()
            # os.path.basename follows the platform's path rules, so Windows
            # paths using backslashes are reduced to the bare filename too.
            filename = os.path.basename(path)
            csv_writer.writerows(
                (filename, comment, label) for comment, label in scraper.comments
            )

# Upload control: restricted to .docx, allows selecting several files at once.
uploader = widgets.FileUpload(description='Upload', accept='.docx', multiple=True)

# Callback function for upload event
def on_upload(change):
    """Save every uploaded file locally, then export their comments to CSV.

    Each upload held by the ``uploader`` widget is written to the current
    working directory and the resulting paths are passed to ``main``.

    Args:
        change: Change notification passed by ``observe``; not used, the
            uploads are read from ``uploader.value``.
    """
    uploads = uploader.value
    if isinstance(uploads, dict):
        # ipywidgets 7 layout: {filename: {'content': bytes, ...}}
        entries = list(uploads.items())
    else:
        # ipywidgets 8 layout: a sequence of per-file records that carry the
        # filename under 'name' and the payload under 'content'.
        entries = [(file_info['name'], file_info) for file_info in uploads]

    file_paths = []
    for filename, file_info in entries:
        # The filename is supplied by the client; keep only its last path
        # component so the file cannot be written outside the working directory.
        local_name = os.path.basename(filename)
        with open(local_name, 'wb') as f:
            f.write(file_info['content'])
        file_paths.append(local_name)
    main(file_paths)
    print("Comments extracted and saved to output.csv")

# Call on_upload whenever the widget's `value` trait changes.
uploader.observe(on_upload, names='value')
# Render the upload button in the cell output.
display(uploader)


FileUpload(value={}, accept='.docx', description='Upload', multiple=True)

Comments extracted and saved to output.csv


