# Sept 24 Coordinators Meeting - Text Extraction & Analysis
This notebook processes the meeting documents through extraction, cleaning, categorization, quality assessment, summarization, and analysis.

In [20]:
import os
import pandas as pd
from pathlib import Path
import sys

# Setup: source documents are read from the meeting folder; generated
# artifacts are written to its data/ sub-folder, created here if missing.
meeting_folder = "sept_24_coordinators_meeting"
data_dir = os.path.join(meeting_folder, 'data')  # kept as a str: it is passed to the helper functions
Path(data_dir).mkdir(parents=True, exist_ok=True)

# Import helper functions
sys.path.insert(0, os.getcwd())
from extract_helpers import extract_docx_content, extract_pptx_content
from categorize_helpers import detect_language, normalize_text, categorize_content, detect_themes, THEMES
from summary_helpers import export_summaries, export_theme_analysis
from analysis_helpers import (analyze_patterns, identify_conflicts, identify_high_priority_items,
                               generate_stakeholder_summary, generate_follow_up_actions, export_analysis)

print("Setup complete. Ready to process meeting documents.")

Setup complete. Ready to process meeting documents.


## Phase 2-3: Extract and Clean Data

In [21]:
# Extract all documents
#
# Walks the top level of `meeting_folder` in sorted order, sends every
# .docx / .pptx file through the matching extractor and flattens the returned
# items into one row per item. Sub-folders (including the data/ output folder
# created during setup) and files with any other extension contribute no rows.

# Column order of the raw frame. Passing it to the DataFrame constructor keeps
# these columns present even when nothing was extracted; without it an empty
# result has no columns and the summary print below raises KeyError on
# 'source_file'.
raw_columns = ['source_file', 'source_type', 'text', 'element_type', 'style']

# Lower-cased file extension -> extractor function.
extractors = {
    '.docx': extract_docx_content,
    '.pptx': extract_pptx_content,
}

all_data = []
for file_name in sorted(os.listdir(meeting_folder)):
    file_path = os.path.join(meeting_folder, file_name)
    if not os.path.isfile(file_path):
        continue

    file_ext = Path(file_path).suffix.lower()
    extractor = extractors.get(file_ext)
    if extractor is None:
        # Unsupported file type: nothing to extract.
        continue

    # Each extracted item is indexed by 'text', 'element_type' and 'style'.
    for item in extractor(file_path):
        all_data.append({
            'source_file': file_name,
            'source_type': file_ext,
            'text': item['text'],
            'element_type': item['element_type'],
            'style': item['style']
        })

df_raw = pd.DataFrame(all_data, columns=raw_columns)
# 1-based sequential identifier, placed as the first column.
df_raw.insert(0, 'row_id', range(1, len(df_raw) + 1))

print(f"Extracted {len(df_raw)} rows from {df_raw['source_file'].nunique()} documents")

Extracted 862 rows from 12 documents


## Phase 3-4: Categorize and Refine

In [22]:
# Enrich a copy of the raw extraction; df_raw itself is left untouched
df_clean = df_raw.copy()
raw_text = df_clean['text']
df_clean['language'] = raw_text.apply(detect_language)
df_clean['text_normalized'] = raw_text.apply(normalize_text)

# Categorize each row once, then spread the result over one column per key
categorization = raw_text.apply(categorize_content)
for category_column in (
    'action_categories',
    'recommendation_categories',
    'contention_categories',
):
    df_clean[category_column] = categorization.apply(
        lambda result, key=category_column: result[key]
    )

# Strength score = number of entries in the matching category column
for strength_column, category_column in (
    ('action_strength', 'action_categories'),
    ('recommendation_strength', 'recommendation_categories'),
    ('contention_strength', 'contention_categories'),
):
    df_clean[strength_column] = df_clean[category_column].apply(len)

# Document status: everything is 'primary' unless its file name is listed below
source_names = df_clean['source_file']
superseded_rows = source_names == 'F2F Meeting Report (near final).docx'
translated_rows = source_names.isin([
    'CSAS Transformation update.pptx',
    'CSAS Transformation update-FR.pptx',
])
df_clean['document_status'] = 'primary'
df_clean.loc[superseded_rows, 'document_status'] = 'superseded'
df_clean.loc[translated_rows, 'document_status'] = 'primary_translated'

# Primary dataset: every row whose document is not superseded
not_superseded = df_clean['document_status'] != 'superseded'
df_primary = df_clean[not_superseded].copy()

# Persist both frames in the data folder
for frame, pickle_name in (
    (df_clean, 'meeting_data_refined.pkl'),
    (df_primary, 'meeting_data_primary.pkl'),
):
    frame.to_pickle(os.path.join(data_dir, pickle_name))

print(f"Processed: {len(df_clean)} total rows, {len(df_primary)} primary rows")

Processed: 862 total rows, 689 primary rows


## Phase 5: Quality Assessment

In [23]:
# Quick quality check
#
# Collects headline counts for the primary dataset and prints them one per line.
total_rows = len(df_primary)
detected_rows = (df_primary['language'] != 'unknown').sum()

# Guard the percentage: with an empty primary dataset the rate would be a
# division by zero, so report "n/a" instead of a meaningless value.
if total_rows:
    detection_rate = f"{detected_rows / total_rows * 100:.1f}%"
else:
    detection_rate = "n/a"

qa_metrics = {
    'total_rows': total_rows,
    'documents': df_primary['source_file'].nunique(),
    # Distinct values of the language column; this count includes the
    # 'unknown' label whenever some rows could not be detected.
    'languages': df_primary['language'].nunique(),
    # A row counts as an item when its strength score is above zero.
    'action_items': (df_primary['action_strength'] > 0).sum(),
    'recommendations': (df_primary['recommendation_strength'] > 0).sum(),
    'contentions': (df_primary['contention_strength'] > 0).sum(),
    # Total number of null cells across all columns.
    'null_values': df_primary.isnull().sum().sum(),
    # Share of rows whose language is anything other than 'unknown'.
    'language_detection_rate': detection_rate
}

print("Quality Assessment:")
for key, value in qa_metrics.items():
    print(f"  {key}: {value}")

Quality Assessment:
  total_rows: 689
  documents: 11
  languages: 20
  action_items: 86
  recommendations: 130
  contentions: 91
  null_values: 0
  language_detection_rate: 95.6%


## Phase 6: Final Summarization

In [24]:
# Reload the primary dataset from disk and tag every row with its themes
primary_pickle = os.path.join(data_dir, 'meeting_data_primary.pkl')
df_summary = pd.read_pickle(primary_pickle)
df_summary['themes'] = df_summary['text'].apply(detect_themes)

# Export the three summaries and the theme analysis into the data folder
summaries = export_summaries(df_summary, data_dir)
action_summary, recommendations_summary, contentions_summary = summaries
theme_counts = export_theme_analysis(df_summary, THEMES, data_dir)

# Report the exported totals: (label, summary, key holding the total)
print("Summary exported:")
for label, summary, total_key in (
    ("Action items", action_summary, 'total_action_items'),
    ("Recommendations", recommendations_summary, 'total_recommendations'),
    ("Contentions", contentions_summary, 'total_contentions'),
):
    print(f"  {label}: {summary[total_key]}")
print(f"  Themes: {len(theme_counts)}")

Summary exported:
  Action items: 17
  Recommendations: 19
  Contentions: 2
  Themes: 6


## Phase 7: Analysis & Recommendations

In [26]:
# Load the summary dataset from the data folder.
# NOTE(review): no cell in this notebook visibly writes
# 'meeting_data_summary.pkl', and the execution counter jumps from 24 to 26.
# Presumably export_summaries (or a since-removed cell) produces the file;
# confirm that this cell survives Restart Kernel -> Run All.
summary_pickle = os.path.join(data_dir, 'meeting_data_summary.pkl')
df_analysis = pd.read_pickle(summary_pickle)

# Run each analysis helper on the same frame
patterns = analyze_patterns(df_analysis)
conflicts = identify_conflicts(df_analysis)
high_priority = identify_high_priority_items(df_analysis)
stakeholder_summary = generate_stakeholder_summary(df_analysis)
follow_ups = generate_follow_up_actions(df_analysis)

# Write every analysis result to the data folder
exported_files = export_analysis(
    patterns,
    conflicts,
    high_priority,
    stakeholder_summary,
    follow_ups,
    data_dir,
)

print("Phase 7 Analysis Complete:")
print(f"  Conflicts identified: {len(conflicts)}")
# High-priority total = sizes of the three per-kind collections added together
high_priority_total = sum(
    len(high_priority[kind])
    for kind in ('action_items', 'recommendations', 'contentions')
)
print(f"  High priority items: {high_priority_total}")
print(f"  Themes with summaries: {len(stakeholder_summary)}")
print(f"  Follow-up actions: {len(follow_ups)}")
print("\nExported files:")
for exported_name in exported_files.values():
    print(f"  {exported_name}")

Phase 7 Analysis Complete:
  Conflicts identified: 0
  High priority items: 5
  Themes with summaries: 6
  Follow-up actions: 103

Exported files:
  phase7_patterns.json
  phase7_conflicts.json
  phase7_high_priority.json
  phase7_stakeholder_summary.json
  phase7_follow_up_actions.json
