# Sept 24 Coordinators Meeting - Text Extraction & Analysis
Processing meeting documents through extraction, cleaning, categorization, and summarization.

In [20]:
import os
import pandas as pd
from pathlib import Path
import sys
import json

# Setup
meeting_folder = "sept_24_coordinators_meeting"
data_dir = os.path.join(meeting_folder, 'data')
os.makedirs(data_dir, exist_ok=True)

# Import helper functions
sys.path.insert(0, os.getcwd())
from extract_helpers import extract_docx_content, extract_pptx_content
from categorize_helpers import detect_language, normalize_text, categorize_content, detect_themes, THEMES
from summary_helpers import export_summaries, export_theme_analysis
from analysis_helpers import (analyze_patterns, identify_conflicts, identify_high_priority_items,
                               generate_stakeholder_summary, generate_follow_up_actions, export_analysis)

print("Setup complete. Ready to process meeting documents.")

Setup complete. Ready to process meeting documents.


## Phase 2-3: Extract and Clean Data

In [21]:
# Extract all documents
all_data = []
for file_name in sorted(os.listdir(meeting_folder)):
    file_path = os.path.join(meeting_folder, file_name)
    if not os.path.isfile(file_path):
        continue

    file_ext = Path(file_path).suffix.lower()
    content = []

    if file_ext == '.docx':
        content = extract_docx_content(file_path)
    elif file_ext == '.pptx':
        content = extract_pptx_content(file_path)

    for item in content:
        all_data.append({
            'source_file': file_name,
            'source_type': file_ext,
            'text': item['text'],
            'element_type': item['element_type'],
            'style': item['style']
        })

df_raw = pd.DataFrame(all_data)
df_raw.insert(0, 'row_id', range(1, len(df_raw) + 1))

print(f"Extracted {len(df_raw)} rows from {df_raw['source_file'].nunique()} documents")

Extracted 862 rows from 12 documents


## Phase 3-4: Categorize and Refine

In [22]:
# Add language detection and categorization
df_clean = df_raw.copy()
df_clean['language'] = df_clean['text'].apply(detect_language)
df_clean['text_normalized'] = df_clean['text'].apply(normalize_text)

# Categorize content
categorization = df_clean['text'].apply(categorize_content)
df_clean['action_categories'] = categorization.apply(lambda x: x['action_categories'])
df_clean['recommendation_categories'] = categorization.apply(lambda x: x['recommendation_categories'])
df_clean['contention_categories'] = categorization.apply(lambda x: x['contention_categories'])

# Add strength scores
df_clean['action_strength'] = df_clean['action_categories'].apply(len)
df_clean['recommendation_strength'] = df_clean['recommendation_categories'].apply(len)
df_clean['contention_strength'] = df_clean['contention_categories'].apply(len)

# Mark document status
df_clean['document_status'] = 'primary'
df_clean.loc[df_clean['source_file'] == 'F2F Meeting Report (near final).docx', 'document_status'] = 'superseded'
df_clean.loc[df_clean['source_file'].isin([
    'CSAS Transformation update.pptx', 
    'CSAS Transformation update-FR.pptx'
]), 'document_status'] = 'primary_translated'

# Create primary dataset (exclude superseded)
df_primary = df_clean[df_clean['document_status'] != 'superseded'].copy()

# Export processed data
df_clean.to_pickle(os.path.join(data_dir, 'meeting_data_refined.pkl'))
df_primary.to_pickle(os.path.join(data_dir, 'meeting_data_primary.pkl'))

print(f"Processed: {len(df_clean)} total rows, {len(df_primary)} primary rows")

Processed: 862 total rows, 689 primary rows


## Phase 5: Quality Assessment

In [23]:
# Quick quality check
qa_metrics = {
    'total_rows': len(df_primary),
    'documents': df_primary['source_file'].nunique(),
    'languages': df_primary['language'].nunique(),
    'action_items': (df_primary['action_strength'] > 0).sum(),
    'recommendations': (df_primary['recommendation_strength'] > 0).sum(),
    'contentions': (df_primary['contention_strength'] > 0).sum(),
    'null_values': df_primary.isnull().sum().sum(),
    'language_detection_rate': f"{(df_primary['language'] != 'unknown').sum() / len(df_primary) * 100:.1f}%"
}

print("Quality Assessment:")
for key, value in qa_metrics.items():
    print(f"  {key}: {value}")

Quality Assessment:
  total_rows: 689
  documents: 11
  languages: 20
  action_items: 86
  recommendations: 130
  contentions: 91
  null_values: 0
  language_detection_rate: 95.6%


## Phase 6: Final Summarization

In [24]:
# Load primary dataset and add themes
df_summary = pd.read_pickle(os.path.join(data_dir, 'meeting_data_primary.pkl'))
df_summary['themes'] = df_summary['text'].apply(detect_themes)

# Export summaries
action_summary, recommendations_summary, contentions_summary = export_summaries(df_summary, data_dir)
theme_counts = export_theme_analysis(df_summary, THEMES, data_dir)

print(f"Summary exported:")
print(f"  Action items: {action_summary['total_action_items']}")
print(f"  Recommendations: {recommendations_summary['total_recommendations']}")
print(f"  Contentions: {contentions_summary['total_contentions']}")
print(f"  Themes: {len(theme_counts)}")

Summary exported:
  Action items: 17
  Recommendations: 19
  Contentions: 2
  Themes: 6


## Phase 7: Analysis & Recommendations

### Finding 1: High Priority Action Items

In [52]:
# Load high priority action items
with open(os.path.join(data_dir, 'phase7_high_priority.json'), 'r') as f:
    high_priority_data = json.load(f)

# Convert to DataFrame for display
if high_priority_data['action_items']:
    action_items_df = pd.DataFrame(high_priority_data['action_items'])
    action_items_df = action_items_df[['source_file', 'text', 'action_strength', 'action_categories']].rename(columns={
        'source_file': 'Source File',
        'text': 'Action Item',
        'action_strength': 'Strength',
        'action_categories': 'Categories'
    })
    
    print(f'HIGH PRIORITY ACTION ITEMS ({len(action_items_df)} identified)\n')
    for _, row in action_items_df[['Action Item']].iterrows():
        print(row.values[0])
        print()
else:
    print('HIGH PRIORITY ACTION ITEMS\nNone identified')

HIGH PRIORITY ACTION ITEMS (4 identified)

ACTION: CSAS Coordinators to identify regional rep for Phase 1 Task Team; Task Team to identify process to ensure efficiencies are realized; once Phase 1 is implemented, Task Team will consider options for Phase 2.

Issue: Timelines from meeting to publication of document can be lengthy. Some steps can be compressed to reduce timelines while many cannot.  The readiness of the document can influence timelines at many steps.  For example, poorly prepared materials can result in delays including challenges reaching consensus (e.g. not all documentation/analyses presented), and to follow-up meetings or emails to conclude the meeting and/or to complete the SAR, as well as more time and effort to complete the documents. A working paper should not require major revisions, apart from changes requested during the peer-review meeting.  

Allow time for the Steering Committee (SC) to meet their responsibility (CSAS Guideline on the Role and Responsibilit

### Finding 2: Top Recommendations

In [54]:
# Load high priority recommendations
if high_priority_data['recommendations']:
    recommendations_df = pd.DataFrame(high_priority_data['recommendations'])
    recommendations_df = recommendations_df[['source_file', 'text', 'recommendation_strength', 'recommendation_categories']].rename(columns={
        'source_file': 'Source File',
        'text': 'Recommendation',
        'recommendation_strength': 'Strength',
        'recommendation_categories': 'Categories'
    })
    print(f'TOP RECOMMENDATIONS ({len(recommendations_df)} identified)\n')
    for _, row in recommendations_df[['Recommendation']].iterrows():
        print(row.values[0])
        print()
else:
    print('TOP RECOMMENDATIONS\nNone identified')

TOP RECOMMENDATIONS (1 identified)

ACTION: CSAS Coordinators to identify regional rep for Phase 1 Task Team; Task Team to identify process to ensure efficiencies are realized; once Phase 1 is implemented, Task Team will consider options for Phase 2.



### Finding 3: Critical Issues/Contentions

In [33]:
# Load high priority contentions
if high_priority_data['contentions']:
    contentions_df = pd.DataFrame(high_priority_data['contentions'])
    contentions_df = contentions_df[['source_file', 'text', 'contention_strength', 'contention_categories']].rename(columns={
        'source_file': 'Source File',
        'text': 'Issue/Contention',
        'contention_strength': 'Strength',
        'contention_categories': 'Categories'
    })
    print(f'CRITICAL ISSUES/CONTENTIONS ({len(contentions_df)} identified)\n')
    print(contentions_df.to_string(index=False))
else:
    print('CRITICAL ISSUES/CONTENTIONS')
    print(f'\nGood news: No critical contentions identified')

CRITICAL ISSUES/CONTENTIONS

Good news: No critical contentions identified


### Finding 4: Theme-Based Stakeholder Summary

In [55]:
# Load stakeholder summary
with open(os.path.join(data_dir, 'phase7_stakeholder_summary.json'), 'r') as f:
    stakeholder_data = json.load(f)

# Convert to DataFrame for display
theme_summary_list = []
for theme, data in stakeholder_data.items():
    theme_summary_list.append({
        'Theme': theme,
        'Total Items': data['total_items'],
        'Actions': data['action_items'],
        'Recommendations': data['recommendations'],
        'Issues': data['issues']
    })

theme_summary_df = pd.DataFrame(theme_summary_list).sort_values('Total Items', ascending=False)
print(f'THEME-BASED SUMMARY ({len(theme_summary_df)} themes)\n')
display(theme_summary_df)

THEME-BASED SUMMARY (6 themes)



Unnamed: 0,Theme,Total Items,Actions,Recommendations,Issues
0,General,403,23,6,33
2,Publications,216,57,83,45
1,Process/Best Practices,102,22,98,31
4,Transformation,51,7,13,9
5,Web/Centralization,21,9,10,5
3,Survival/Exercise,9,2,1,0


### Finding 5: Action Items Distribution

In [57]:
# Load patterns
with open(os.path.join(data_dir, 'phase7_patterns.json'), 'r') as f:
    patterns_data = json.load(f)

# Action items by source file
action_by_source = patterns_data['action_item_distribution']['by_source_file']
action_source_df = pd.DataFrame(list(action_by_source.items()), columns=['Source File', 'Action Items Count'])
action_source_df = action_source_df.sort_values('Action Items Count', ascending=False)

print('ACTION ITEMS DISTRIBUTION BY SOURCE FILE\n')
display(action_source_df)

# Action items by theme
print('\n\nACTION ITEMS DISTRIBUTION BY THEME\n')
action_by_theme = patterns_data['action_item_distribution']['by_theme']
action_theme_df = pd.DataFrame(list(action_by_theme.items()), columns=['Theme', 'Action Items Count'])
action_theme_df = action_theme_df.sort_values('Action Items Count', ascending=False)
display(action_theme_df)

ACTION ITEMS DISTRIBUTION BY SOURCE FILE



Unnamed: 0,Source File,Action Items Count
6,F2F Meeting Notes (draft).docx,41
8,Options and best practices for timely publication v2.docx,28
2,CSAS Transformation update.pptx,14
3,Centralization of web and publication.docx,7
9,Process vs Product.pptx,6
0,CSAS Publications.pptx,4
1,CSAS Transformation update-FR.pptx,3
4,Coordinators F2F Agenda.docx,2
5,F2F Action Items.docx,1
7,F2F Meeting Report (near final)_TG_FR_LS_Final.docx,1




ACTION ITEMS DISTRIBUTION BY THEME



Unnamed: 0,Theme,Action Items Count
2,Publications,74
1,Process/Best Practices,31
0,General,25
4,Transformation,11
5,Web/Centralization,10
3,Survival/Exercise,3


### Finding 6: Follow-Up Actions & Recommendations

In [62]:
# Load follow-up actions
with open(os.path.join(data_dir, 'phase7_follow_up_actions.json'), 'r') as f:
    follow_up_data = json.load(f)

# Convert to DataFrame
follow_up_df = pd.DataFrame(follow_up_data)
follow_up_df = follow_up_df[['type', 'count', 'description']].rename(columns={
    'type': 'Follow-Up Category',
    'count': 'Item Count',
    'description': 'Description'
})

print(f'RECOMMENDED FOLLOW-UP ACTIONS ({len(follow_up_df)} identified)\n')
pd.set_option('display.max_rows', 120)
display(follow_up_df)

RECOMMENDED FOLLOW-UP ACTIONS (103 identified)



Unnamed: 0,Follow-Up Category,Item Count,Description
0,High Priority Actions,4,Multiple strong action signals requiring immediate attention
1,Process/Best Practices - High Contention,31,30% of Process/Best Practices content flagged with concerns
2,Process/Best Practices - High Contention,31,30% of Process/Best Practices content flagged with concerns
3,Process/Best Practices - High Contention,31,30% of Process/Best Practices content flagged with concerns
4,Process/Best Practices - High Contention,31,30% of Process/Best Practices content flagged with concerns
5,Process/Best Practices - High Contention,31,30% of Process/Best Practices content flagged with concerns
6,Process/Best Practices - High Contention,31,30% of Process/Best Practices content flagged with concerns
7,Process/Best Practices - High Contention,31,30% of Process/Best Practices content flagged with concerns
8,Process/Best Practices - High Contention,31,30% of Process/Best Practices content flagged with concerns
9,Process/Best Practices - High Contention,31,30% of Process/Best Practices content flagged with concerns


### Finding 7: Conflict Analysis

In [37]:
# Load conflicts
with open(os.path.join(data_dir, 'phase7_conflicts.json'), 'r') as f:
    conflicts_data = json.load(f)

conflicts_count = conflicts_data['total_conflicts']

if conflicts_count > 0:
    conflicts_df = pd.DataFrame(conflicts_data['conflicts'])
    conflicts_df = conflicts_df[['source_file', 'text', 'action_categories', 'contention_categories']].rename(columns={
        'source_file': 'Source File',
        'text': 'Conflicting Item',
        'action_categories': 'Action Signals',
        'contention_categories': 'Contention Signals'
    })
    print(f'CONFLICT ANALYSIS ({conflicts_count} items with conflicting signals)\n')
    print(conflicts_df.to_string(index=False))
else:
    print('CONFLICT ANALYSIS')
    print(f'\nGood news: No significant conflicts detected ({conflicts_count} items with both action and contention signals)')

CONFLICT ANALYSIS

Good news: No significant conflicts detected (0 items with both action and contention signals)
