In [1]:
import pandas as pd
import os
import sys
from pathlib import Path
from datetime import datetime
import subprocess

from docx import Document
from pptx import Presentation

# pandas display options. Full 'display.*' keys are used throughout: pandas
# resolves abbreviated keys by pattern matching, which can break if a similarly
# named option is added in a later release.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.float_format', '{:.1f}'.format)


# Relative path (resolved against the current working directory) of the folder
# whose .docx / .pptx files are listed and parsed by the cells below.
meeting_folder = "sept_24_coordinators_meeting"

In [2]:
def extract_docx_content(file_path):
    """Read a .docx file and return its text as a list of element records.

    Each record is a dict with the keys 'text', 'element_type' and 'style'.

    Paragraph records come first, in the order of ``doc.paragraphs``;
    paragraphs whose text is empty or whitespace-only are skipped. A paragraph
    whose style name starts with 'Heading' is tagged 'heading', any other
    'paragraph', and 'style' carries the style name.

    Table records follow, one per table row: the cell texts are joined with
    ' | ', 'element_type' is 'table_row', and 'style' is the synthetic label
    'table_<table index>_row_<row index>' (both indices zero-based).
    """
    document = Document(file_path)
    records = []

    for paragraph in document.paragraphs:
        if not paragraph.text.strip():
            continue  # nothing but whitespace: not worth a record

        style_name = paragraph.style.name
        if style_name.startswith('Heading'):
            kind = 'heading'
        else:
            kind = 'paragraph'

        records.append({
            'text': paragraph.text,
            'element_type': kind,
            'style': style_name,
        })

    # Tables are appended after all paragraphs, so their position relative to
    # the surrounding paragraphs is not preserved in the output.
    records.extend(
        {
            'text': ' | '.join(cell.text for cell in row.cells),
            'element_type': 'table_row',
            'style': f'table_{table_no}_row_{row_no}',
        }
        for table_no, table in enumerate(document.tables)
        for row_no, row in enumerate(table.rows)
    )

    return records

def extract_pptx_content(file_path):
    """Extract text from a .pptx file, labelled with its slide position.

    Args:
        file_path: Path of the presentation, passed straight to
            ``Presentation``.

    Returns:
        A list of dicts with the keys 'text', 'element_type' and 'style',
        emitted slide by slide:

        * 'slide_text'  - one record per shape that exposes a non-blank
          ``text`` attribute; style is 'slide_<slide>_shape_<shape>'.
        * 'table_row'   - one record per row of a shape whose ``has_table``
          is true, cell texts joined with ' | '; style is
          'slide_<slide>_table_row_<row>'.
        * 'slide_notes' - the slide's notes text when present and non-blank;
          style is 'slide_<slide>_notes'.

        All indices are zero-based. Only the shapes directly in
        ``slide.shapes`` are visited.
    """
    prs = Presentation(file_path)
    content = []

    for slide_idx, slide in enumerate(prs.slides):
        for shape_idx, shape in enumerate(slide.shapes):
            # Not every shape type carries text, hence the hasattr probe.
            if hasattr(shape, "text") and shape.text.strip():
                content.append({
                    'text': shape.text,
                    'element_type': 'slide_text',
                    'style': f'slide_{slide_idx}_shape_{shape_idx}'
                })

            if shape.has_table:
                for row_idx, row in enumerate(shape.table.rows):
                    row_data = [cell.text for cell in row.cells]
                    content.append({
                        'text': ' | '.join(row_data),
                        'element_type': 'table_row',
                        'style': f'slide_{slide_idx}_table_row_{row_idx}'
                    })

        if slide.has_notes_slide:
            # notes_text_frame is None when the notes slide has no notes
            # placeholder; treat that as "no notes" instead of crashing on
            # None.text.
            notes_frame = slide.notes_slide.notes_text_frame
            notes_text = notes_frame.text if notes_frame is not None else ''
            if notes_text.strip():
                content.append({
                    'text': notes_text,
                    'element_type': 'slide_notes',
                    'style': f'slide_{slide_idx}_notes'
                })

    return content

def process_meeting_folder(folder_path):
    """Extract every .docx / .pptx file in a folder into one DataFrame.

    Directory entries are visited in sorted name order. Sub-directories,
    files with any other suffix, and Office lock files (names starting with
    '~$') are skipped. The suffix comparison is case-insensitive.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A DataFrame with one row per extracted element and the columns
        'source_file', 'source_type' (lower-cased suffix), 'text',
        'element_type', 'style' and 'extraction_date'. The columns are present
        even when nothing was extracted, and 'extraction_date' holds a single
        ISO-8601 timestamp taken once at the start of the call.
    """
    columns = ['source_file', 'source_type', 'text',
               'element_type', 'style', 'extraction_date']

    # Taken once so all rows of a run share the same timestamp.
    extraction_date = datetime.now().isoformat()
    all_data = []

    for file_name in sorted(os.listdir(folder_path)):
        # '~$name.docx' owner/lock files appear while a document is open in
        # Office; they have the right suffix but are not readable documents.
        if file_name.startswith('~$'):
            continue

        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            continue

        file_ext = Path(file_path).suffix.lower()
        if file_ext == '.docx':
            content = extract_docx_content(file_path)
        elif file_ext == '.pptx':
            content = extract_pptx_content(file_path)
        else:
            continue

        all_data.extend(
            {
                'source_file': file_name,
                'source_type': file_ext,
                'text': item['text'],
                'element_type': item['element_type'],
                'style': item['style'],
                'extraction_date': extraction_date,
            }
            for item in content
        )

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(all_data, columns=columns)

# Phase 2: Data Parsing

In [3]:
# Install the document-parsing packages into the interpreter running this
# kernel (sys.executable). python-docx and python-pptx are requested with
# --upgrade; langdetect is requested without it.
# NOTE(review): cell [1] already imports docx and pptx, so on a fresh
# environment that cell would fail before this one can run - confirm whether
# this cell is meant to be executed first.
print("Ensuring correct versions of dependencies...")

_pip_install = [sys.executable, "-m", "pip", "install"]
for _package_args in (
    ["--upgrade", "python-docx"],
    ["--upgrade", "python-pptx"],
    ["langdetect"],
):
    # check_call raises CalledProcessError if pip exits with a non-zero status.
    subprocess.check_call(_pip_install + _package_args)

print("\nDependencies installed/upgraded successfully!")

Ensuring correct versions of dependencies...

Dependencies installed/upgraded successfully!


## Step 1: Extract text from Word documents

In [4]:
# List the Word documents in the meeting folder.
# - sorted(): os.listdir returns entries in arbitrary order, and the row order
#   produced here determines the row_id assigned when the frames are combined.
# - lower(): match '.DOCX' as well as '.docx'.
# - '~$' prefix: skip the lock files Office creates next to an open document.
docx_files = sorted(
    f for f in os.listdir(meeting_folder)
    if f.lower().endswith('.docx') and not f.startswith('~$')
)
print(f"Found {len(docx_files)} Word documents:")
for f in docx_files:
    print(f"  - {f}")

# One record per extracted element, tagged with the file it came from.
docx_data = []
for file_name in docx_files:
    file_path = os.path.join(meeting_folder, file_name)
    print(f"\nProcessing: {file_name}")
    content = extract_docx_content(file_path)
    print(f"  Extracted {len(content)} elements")
    docx_data.extend(
        {
            'source_file': file_name,
            'source_type': '.docx',
            'text': item['text'],
            'element_type': item['element_type'],
            'style': item['style'],
        }
        for item in content
    )

# Explicit columns keep the schema intact even if no document yielded content.
df_docx = pd.DataFrame(
    docx_data,
    columns=['source_file', 'source_type', 'text', 'element_type', 'style'],
)
print(f"\nTotal DOCX rows: {len(df_docx)}")
df_docx.head(10)

Found 7 Word documents:
  - Centralization of web and publication.docx
  - Coordinators F2F Agenda.docx
  - F2F Action Items.docx
  - F2F Meeting Notes (draft).docx
  - F2F Meeting Report (near final).docx
  - F2F Meeting Report (near final)_TG_FR_LS_Final.docx
  - Options and best practices for timely publication v2.docx

Processing: Centralization of web and publication.docx
  Extracted 25 elements

Processing: Coordinators F2F Agenda.docx
  Extracted 60 elements

Processing: F2F Action Items.docx
  Extracted 18 elements

Processing: F2F Meeting Notes (draft).docx
  Extracted 174 elements

Processing: F2F Meeting Report (near final).docx
  Extracted 173 elements

Processing: F2F Meeting Report (near final)_TG_FR_LS_Final.docx
  Extracted 173 elements

Processing: Options and best practices for timely publication v2.docx
  Extracted 47 elements

Total DOCX rows: 670


Unnamed: 0,source_file,source_type,text,element_type,style
0,Centralization of web and publication.docx,.docx,Centralization of web and publication,heading,Heading 1
1,Centralization of web and publication.docx,.docx,Goals: to find efficiencies (time and capacity) in the publication process of the CSAS wheel.,paragraph,Normal
2,Centralization of web and publication.docx,.docx,Challenges: As per the 2018 evaluation to reduce publications timelines.,paragraph,Normal
3,Centralization of web and publication.docx,.docx,Task of the NCR web and pub team (adapted from CSAS Roles and Responsibilities 2015 document),heading,Heading 2
4,Centralization of web and publication.docx,.docx,Information management and technology,paragraph,List Paragraph
5,Centralization of web and publication.docx,.docx,Liaise with CDOS on issues related to CSAS database,paragraph,List Paragraph
6,Centralization of web and publication.docx,.docx,Contribute to modernization of IT Tools,paragraph,List Paragraph
7,Centralization of web and publication.docx,.docx,Management of national shared drive (content and access),paragraph,List Paragraph
8,Centralization of web and publication.docx,.docx,Knowledge dissemination,paragraph,List Paragraph
9,Centralization of web and publication.docx,.docx,Publish CSAS documents,paragraph,List Paragraph


## Step 2: Extract text from PowerPoint presentations

In [5]:
# List the PowerPoint presentations in the meeting folder.
# - sorted(): os.listdir returns entries in arbitrary order, and the row order
#   produced here determines the row_id assigned when the frames are combined.
# - lower(): match '.PPTX' as well as '.pptx'.
# - '~$' prefix: skip the lock files Office creates next to an open document.
pptx_files = sorted(
    f for f in os.listdir(meeting_folder)
    if f.lower().endswith('.pptx') and not f.startswith('~$')
)
print(f"Found {len(pptx_files)} PowerPoint presentations:")
for f in pptx_files:
    print(f"  - {f}")

# One record per extracted element, tagged with the file it came from.
pptx_data = []
for file_name in pptx_files:
    file_path = os.path.join(meeting_folder, file_name)
    print(f"\nProcessing: {file_name}")
    content = extract_pptx_content(file_path)
    print(f"  Extracted {len(content)} elements")
    pptx_data.extend(
        {
            'source_file': file_name,
            'source_type': '.pptx',
            'text': item['text'],
            'element_type': item['element_type'],
            'style': item['style'],
        }
        for item in content
    )

# Explicit columns keep the schema intact even if no presentation yielded content.
df_pptx = pd.DataFrame(
    pptx_data,
    columns=['source_file', 'source_type', 'text', 'element_type', 'style'],
)
print(f"\nTotal PPTX rows: {len(df_pptx)}")
df_pptx.head(10)

Found 5 PowerPoint presentations:
  - CSAS Publications.pptx
  - CSAS Transformation update-FR.pptx
  - CSAS Transformation update.pptx
  - Process vs Product.pptx
  - Survival exericise.pptx

Processing: CSAS Publications.pptx
  Extracted 23 elements

Processing: CSAS Transformation update-FR.pptx
  Extracted 61 elements

Processing: CSAS Transformation update.pptx
  Extracted 61 elements

Processing: Process vs Product.pptx
  Extracted 15 elements

Processing: Survival exericise.pptx
  Extracted 32 elements

Total PPTX rows: 192


Unnamed: 0,source_file,source_type,text,element_type,style
0,CSAS Publications.pptx,.pptx,CSAS Publications,slide_text,slide_0_shape_0
1,CSAS Publications.pptx,.pptx,CSAS Coordinators F2F Meeting\nSeptember 2024,slide_text,slide_0_shape_1
2,CSAS Publications.pptx,.pptx,Objective of Discussion,slide_text,slide_1_shape_0
3,CSAS Publications.pptx,.pptx,To further explore options for facilitating timely publications\nTo develop recommendations for the Science Executive Committee (as part of reporting on status of overdue publications)\n\n“overdue...,slide_text,slide_1_shape_1
4,CSAS Publications.pptx,.pptx,Review of Status,slide_text,slide_2_shape_0
5,CSAS Publications.pptx,.pptx,Date​ | “Overdue publications”​\n(from meetings in 2020 and earlier)​,table_row,slide_2_table_row_0
6,CSAS Publications.pptx,.pptx,December 2021​ | 455​,table_row,slide_2_table_row_1
7,CSAS Publications.pptx,.pptx,February 2022​ | 381​,table_row,slide_2_table_row_2
8,CSAS Publications.pptx,.pptx,March 2022​ | 377​,table_row,slide_2_table_row_3
9,CSAS Publications.pptx,.pptx,May 2022​ | 342​,table_row,slide_2_table_row_4


## Step 3: Combine into master DataFrame

In [6]:
# Stack the Word rows on top of the PowerPoint rows; ignore_index gives the
# combined frame a fresh 0..n-1 index.
df_raw = pd.concat([df_docx, df_pptx], ignore_index=True)
total_rows = len(df_raw)

# Summaries are computed before row_id is added; they do not depend on it.
source_type_counts = df_raw['source_type'].value_counts()
element_type_counts = df_raw['element_type'].value_counts()

print(f"Combined raw extraction: {total_rows} total rows")
print("\nBreakdown by source type:")
print(source_type_counts)
print("\nBreakdown by element type:")
print(element_type_counts)

# 1-based tracking id, placed as the first column.
df_raw.insert(loc=0, column='row_id', value=range(1, total_rows + 1))

print("\nFirst 10 rows:")
df_raw.head(10)

Combined raw extraction: 862 total rows

Breakdown by source type:
source_type
.docx    670
.pptx    192
Name: count, dtype: int64

Breakdown by element type:
element_type
paragraph      421
table_row      295
slide_text     120
heading         23
slide_notes      3
Name: count, dtype: int64

First 10 rows:


Unnamed: 0,row_id,source_file,source_type,text,element_type,style
0,1,Centralization of web and publication.docx,.docx,Centralization of web and publication,heading,Heading 1
1,2,Centralization of web and publication.docx,.docx,Goals: to find efficiencies (time and capacity) in the publication process of the CSAS wheel.,paragraph,Normal
2,3,Centralization of web and publication.docx,.docx,Challenges: As per the 2018 evaluation to reduce publications timelines.,paragraph,Normal
3,4,Centralization of web and publication.docx,.docx,Task of the NCR web and pub team (adapted from CSAS Roles and Responsibilities 2015 document),heading,Heading 2
4,5,Centralization of web and publication.docx,.docx,Information management and technology,paragraph,List Paragraph
5,6,Centralization of web and publication.docx,.docx,Liaise with CDOS on issues related to CSAS database,paragraph,List Paragraph
6,7,Centralization of web and publication.docx,.docx,Contribute to modernization of IT Tools,paragraph,List Paragraph
7,8,Centralization of web and publication.docx,.docx,Management of national shared drive (content and access),paragraph,List Paragraph
8,9,Centralization of web and publication.docx,.docx,Knowledge dissemination,paragraph,List Paragraph
9,10,Centralization of web and publication.docx,.docx,Publish CSAS documents,paragraph,List Paragraph


## Step 4: Validate extraction quality

In [7]:
# Text report on the combined extraction: overall size, null counts, text
# length statistics, and per-file / per-element-type row counts.
print("=== EXTRACTION QUALITY ASSESSMENT ===\n")

# Character count of every row's text, computed once and reused below.
text_lengths = df_raw['text'].str.len()

print(f"Total rows extracted: {len(df_raw)}")
print(f"Total characters: {text_lengths.sum():,}")
print(f"Average text length per row: {text_lengths.mean():.1f} chars")

print("\n--- Null values ---")
print(df_raw.isnull().sum())

print("\n--- Text length distribution ---")
print(text_lengths.describe())

print("\n--- Files processed ---")
for source_file in sorted(df_raw['source_file'].unique()):
    from_this_file = df_raw['source_file'] == source_file
    count = int(from_this_file.sum())  # number of rows matching the mask
    total_chars = text_lengths[from_this_file].sum()
    print(f"  {source_file:<60} {count:>4} rows  {total_chars:>8,} chars")

print("\n--- Element types extracted ---")
for elem_type in sorted(df_raw['element_type'].unique()):
    count = int((df_raw['element_type'] == elem_type).sum())
    print(f"  {elem_type:<30} {count:>4} rows")

# Printed unconditionally: nothing above validates the extraction.
print("\nExtraction completed successfully!")

=== EXTRACTION QUALITY ASSESSMENT ===

Total rows extracted: 862
Total characters: 103,971
Average text length per row: 120.6 chars

--- Null values ---
row_id          0
source_file     0
source_type     0
text            0
element_type    0
style           0
dtype: int64

--- Text length distribution ---
count   862.0
mean    120.6
std     135.0
min       1.0
25%      28.0
50%      72.0
75%     161.5
max     865.0
Name: text, dtype: float64

--- Files processed ---
  CSAS Publications.pptx                                         23 rows     1,665 chars
  CSAS Transformation update-FR.pptx                             61 rows     7,141 chars
  CSAS Transformation update.pptx                                61 rows     5,561 chars
  Centralization of web and publication.docx                     25 rows     2,662 chars
  Coordinators F2F Agenda.docx                                   60 rows     5,478 chars
  F2F Action Items.docx                                          18 rows       470 