In [None]:
import os
import sys
import math
import statistics
import numpy as np
from collections import Counter
import re # For tokenization
from langdetect import detect, DetectorFactory # For language detection
from langdetect.lang_detect_exception import LangDetectException
# import pdfplumber # Commented out: No longer used for MD file processing
from io import StringIO # To capture print output
import string # For character entropy
import warnings # Added: To manage warnings
# Removed: from pdfminer.pdfparser import PDFSyntaxWarning # Removed: Specific import causing error
from scipy.stats import entropy # Added: For KL divergence calculation

import nltk
# Ensure necessary NLTK resources are downloaded
nltk.download('punkt') # Corrected from 'punkt_tab'
from nltk.tokenize import word_tokenize # For word tokenization

import pandas as pd # For DataFrame handling (if needed later)

# Seed langdetect so repeated runs give the same language for the same text.
DetectorFactory.seed = 0

# --- Configuration ---
# Directory containing the MD files. The commented-out assignments below are
# alternative datasets; swap them in (together with the matching output
# names) to analyse a different corpus.
########### source_directory = r'D:\Dataset\Lagerugpijn\LR_EPDs' # Use a raw string for the path, renamed from pdf_directory
source_directory = r'D:\Dataset\Lagerugpijn\synthetic_epds_ORG'
output_markdown_file = 'synth_md_analysis_report.md'
numerical_data_csv_file = 'synth_MD_numerical_stats.csv'
#source_directory = r'D:\Dataset\Lagerugpijn\pseudonymized-epds' # Updated to point to the correct directory for MD files
#output_markdown_file = 'pseudo_md_analysis_report.md' # Name of the output Markdown file, changed from pdf_analysis_report.md
#numerical_data_csv_file = 'pseudo_MD_numerical_stats.csv'

PMI_BIGRAM_FREQ_THRESHOLD = 3 # Minimum frequency for a bigram to be included in Average PMI calculation

# --- Suppress Warnings ---
# Suppress warnings originating from any pdfminer module (still potentially relevant if other PDF tools are used indirectly or in future).
# The pattern is a regular expression, so it is written as a raw string:
# in a normal string literal '\.' is an invalid escape sequence.
warnings.filterwarnings("ignore", module=r'pdfminer\..*')
# You could add other filters here if other specific warnings are bothersome, e.g.,
# warnings.filterwarnings("ignore", message="some specific message")

# --- Data Storage ---
file_data = [] # List to store dictionaries, each containing data for one file
all_extracted_text_for_vocab = "" # String to accumulate all text for vocabulary analysis (for overall vocab and PMI)
all_extracted_chars = "" # String to accumulate all characters (for overall char entropy)
all_char_counts = [] # List to store character counts for each file (for overall average)

# Buffer that collects everything print()-ed while stdout is redirected;
# its contents become the Markdown report.
report_output = StringIO()

# Redirect stdout into the buffer. The real stream is kept in
# original_stdout so progress/warning messages can still reach the console
# and so stdout can be restored after the analysis.
original_stdout = sys.stdout
sys.stdout = report_output

# --- Helper Function for MD Text Extraction and Basic Structural Element Detection ---
def _is_table_separator_line(line):
    """Return True if *line* looks like a Markdown table delimiter row.

    A delimiter row (e.g. ``|---|---|`` or ``| :--- | ---: |``) must contain
    at least one ``|`` and one ``---`` run, and consist only of dashes,
    pipes, colons, spaces and tabs.
    """
    stripped = line.strip()
    # Requiring a pipe already rules out plain horizontal rules such as
    # "---", so no further exclusion is needed.
    if '---' not in stripped or '|' not in stripped:
        return False
    return all(ch in '-|: \t' for ch in stripped)


def analyze_md_content(md_path):
    """
    Read a Markdown file and count its figures and tables heuristically.

    Figures are counted as occurrences of the image syntax
    ``![alt text](image_url)``. Tables are estimated by counting delimiter
    rows (one per table), in both the compact ``|---|---|`` and the spaced
    ``| --- | --- |`` form.

    Args:
        md_path: Path of the Markdown file to read (decoded as UTF-8,
            undecodable bytes are ignored).

    Returns:
        A tuple ``(extracted_text, table_count, figure_count,
        annotation_count)``. ``annotation_count`` is always 0 because
        Markdown has no direct equivalent of PDF annotations. If the file
        cannot be read, a warning is written to ``original_stdout`` and
        ``("", 0, 0, 0)`` is returned.
    """
    annotation_count = 0 # Markdown files don't have a direct equivalent to PDF annotations

    try:
        with open(md_path, 'r', encoding='utf-8', errors='ignore') as f:
            extracted_text = f.read()
    except FileNotFoundError:
        original_stdout.write(f"Warning: File not found: {os.path.basename(md_path)}\n")
        return "", 0, 0, 0 # Return empty data for this file
    except Exception as e:
        # Best-effort: one unreadable file must not abort the whole run.
        original_stdout.write(f"Warning: Error reading or processing MD file {os.path.basename(md_path)}: {e}\n")
        return "", 0, 0, 0 # Return empty data for this file

    # Count figures (Markdown image syntax: ![alt text](image_url))
    figure_count = len(re.findall(r'!\[.*?\]\(.*?\)', extracted_text))

    # Rough estimate of the number of tables: one delimiter row per table.
    table_count = sum(
        1 for line in extracted_text.splitlines() if _is_table_separator_line(line)
    )

    return extracted_text, table_count, figure_count, annotation_count


# --- Helper Function for Language Detection ---
def detect_language(text):
    """
    Return the language code langdetect reports for *text*.

    Only the first 5000 characters are passed to the detector. Special
    return values: "N/A (No text)" for blank input, "N/A (No text in sample)"
    when the sampled prefix is blank, "Undetected" when langdetect raises
    LangDetectException, and "Error" for any other failure (a warning is
    written to original_stdout in that case).
    """
    if text.strip() == "":
        return "N/A (No text)"

    try:
        # Slicing a shorter string simply yields the whole string.
        prefix = text[:5000]
        if prefix.strip() == "":
            return "N/A (No text in sample)"

        with warnings.catch_warnings():
            # Silence UserWarnings raised while detecting.
            warnings.filterwarnings("ignore", category=UserWarning)
            language_code = detect(prefix)
        return language_code
    except LangDetectException:
        return "Undetected"
    except Exception as e:
        original_stdout.write(f"Warning: Error during language detection: {e}\n")
        return "Error"

# --- Helper Functions for Information Theory Metrics ---
def calculate_shannon_entropy(items):
    """
    Return the Shannon entropy, in bits, of the empirical distribution of
    *items* (a sequence of characters or words).

    An empty sequence has entropy 0.0.
    """
    if not items:
        return 0.0

    sample_size = len(items)
    bits = 0.0
    # Every count produced by Counter(items) is at least 1, so each relative
    # frequency is strictly positive and log2 is always defined.
    for occurrences in Counter(items).values():
        rel_freq = occurrences / sample_size
        bits -= rel_freq * math.log2(rel_freq)
    return bits

# --- Helper Function for Jensen-Shannon Divergence (JSD) Calculation ---
def calculate_ca_divergence(text1, text2, unit='word', smoothing=1e-9, base=2):
    """
    Calculates the canonical Jensen-Shannon Divergence (JSD) between two texts.

    Each text is turned into a smoothed, normalised token-frequency
    distribution over the union vocabulary, and
    ``JSD = 0.5 * KL(P1 || M) + 0.5 * KL(P2 || M)`` with ``M = 0.5 * (P1 + P2)``
    is returned. This is the divergence itself, not its square root.

    Args:
        text1: First text.
        text2: Second text.
        unit: 'char' to compare character distributions, or 'word' to
            compare tokens produced by NLTK's word_tokenize.
        smoothing: Constant added to every count before normalisation.
        base: Logarithm base passed to scipy.stats.entropy (2 gives bits).

    Returns:
        The JSD as a float, or NaN if either text is empty or yields no
        tokens.

    Raises:
        ValueError: If both texts are non-empty and *unit* is neither
            'char' nor 'word'.
    """
    if not text1 or not text2:
        return np.nan

    if unit == 'char':
        tokens1 = list(text1)
        tokens2 = list(text2)
    elif unit == 'word':
        tokens1 = word_tokenize(text1)
        tokens2 = word_tokenize(text2)
    else:
        raise ValueError("unit must be 'char' or 'word'")

    if not tokens1 or not tokens2:
        return np.nan

    counts1 = Counter(tokens1)
    counts2 = Counter(tokens2)

    # Sort the union vocabulary so the array layout (and therefore the
    # floating-point summation order) does not depend on set iteration
    # order, which varies between interpreter runs for strings.
    vocab = sorted(counts1.keys() | counts2.keys())

    # Counter returns 0 for tokens it has not seen.
    p1 = np.array([counts1[token] + smoothing for token in vocab], dtype=np.float64)
    p2 = np.array([counts2[token] + smoothing for token in vocab], dtype=np.float64)
    p1 /= p1.sum()
    p2 /= p2.sum()

    M = 0.5 * (p1 + p2)
    # scipy.stats.entropy(p, q) computes the KL divergence KL(p || q).
    jsd = 0.5 * entropy(p1, M, base=base) + 0.5 * entropy(p2, M, base=base)
    return jsd

# --- Helper Function for Average Bigram PMI Calculation ---
def calculate_avg_bigram_pmi(text, min_freq=3):
    """
    Return the mean Pointwise Mutual Information (PMI, log base 2) over all
    adjacent word pairs in *text* that occur at least *min_freq* times.

    Words are the lower-cased ``\\b\\w+\\b`` matches of the text. Unigram and
    bigram probabilities are both estimated as count / number of words.
    Returns 0.0 when the text is empty, has fewer than two words, or no
    bigram reaches the frequency threshold.
    """
    if not text:
        return 0.0

    tokens = re.findall(r'\b\w+\b', text.lower())
    n_tokens = len(tokens)
    if n_tokens < 2:
        return 0.0

    unigram_freq = Counter(tokens)
    # Pair every token with its successor.
    bigram_freq = Counter(zip(tokens, tokens[1:]))

    scores = []
    for (left, right), pair_freq in bigram_freq.items():
        if pair_freq < min_freq:
            continue
        # All three probabilities are strictly positive here: the pair was
        # observed, so both of its words were observed too.
        p_left = unigram_freq[left] / n_tokens
        p_right = unigram_freq[right] / n_tokens
        p_pair = pair_freq / n_tokens
        scores.append(math.log2(p_pair / (p_left * p_right)))

    return np.mean(scores) if scores else 0.0


# --- Analysis ---
def _short_display_name(filename):
    """Return the last five digits found in *filename*, prefixed with "..."
    when the name (ignoring '.', '_', '-' and spaces) is longer than that."""
    digit_runs = re.findall(r'\d+', filename)
    last_five_digits = "".join(digit_runs)[-5:] if digit_runs else ""
    compact_name = filename.replace('.', '').replace('_', '').replace('-', '').replace(' ', '')
    if len(compact_name) > len(last_five_digits):
        return "..." + last_five_digits
    return last_five_digits


original_stdout.write(f"Analyzing Markdown files in directory: {source_directory}\n")

# While this section runs, print() writes into the report buffer and
# console messages go through original_stdout. The try/finally guarantees
# stdout is restored even if the analysis raises part-way through.
try:
    if not os.path.isdir(source_directory):
        original_stdout.write(f"Error: Directory not found at {source_directory}\n")
    else:
        # ---- Pass 1: per-file metrics ----
        for entry_name in os.listdir(source_directory):
            entry_path = os.path.join(source_directory, entry_name)

            # Only regular files ending in .md (case-insensitive) are analysed.
            if not (os.path.isfile(entry_path) and entry_name.lower().endswith('.md')):
                continue

            file_info = {}
            file_info['filename'] = entry_name
            file_info['filepath'] = entry_path

            try:
                file_size_bytes = os.path.getsize(entry_path)
                file_info['storage_size_bytes'] = file_size_bytes
                file_info['storage_size_mb'] = file_size_bytes / (1024 * 1024)
            except OSError as e:
                original_stdout.write(f"Warning: Could not get size for {entry_name}: {e}\n")
                file_info['storage_size_bytes'] = 0
                file_info['storage_size_mb'] = 0

            text, table_count, figure_count, annotation_count = analyze_md_content(entry_path)

            file_info['extracted_text'] = text
            file_info['char_count'] = len(text)
            all_char_counts.append(file_info['char_count'])

            # Lower-cased word tokens; the same pattern is used for the
            # dataset-wide statistics below.
            tokens = re.findall(r'\b\w+\b', text.lower())
            word_count = len(tokens)
            file_info['word_count'] = word_count
            file_info['tokens'] = tokens

            unique_words_in_file = set(tokens)
            file_info['unique_word_count'] = len(unique_words_in_file)

            file_info['char_entropy'] = calculate_shannon_entropy(list(text))
            file_info['word_entropy'] = calculate_shannon_entropy(tokens)
            file_info['average_pmi'] = calculate_avg_bigram_pmi(text, min_freq=PMI_BIGRAM_FREQ_THRESHOLD)

            # The vocabulary text gets a separating space so words of
            # adjacent files do not fuse; the character text does not.
            all_extracted_text_for_vocab += text + " "
            all_extracted_chars += text

            file_info['table_count'] = table_count
            file_info['figure_count'] = figure_count
            file_info['annotation_count'] = annotation_count # Will be 0 for MD files from analyze_md_content
            file_info['has_tables'] = table_count > 0
            file_info['has_figures'] = figure_count > 0
            file_info['has_annotations'] = annotation_count > 0

            file_info['language'] = detect_language(text)
            file_info['estimated_info_content_bits_filesize'] = file_info.get('storage_size_bytes', 0) * 8

            file_data.append(file_info)

        # ---- Dataset-wide statistics ----
        overall_char_entropy = calculate_shannon_entropy(list(all_extracted_chars))
        overall_tokens = re.findall(r'\b\w+\b', all_extracted_text_for_vocab.lower())
        overall_word_counts = Counter(overall_tokens)
        total_words_overall = len(overall_tokens)
        overall_word_distribution = {word: count / total_words_overall for word, count in overall_word_counts.items()} if total_words_overall > 0 else {}
        overall_vocabulary = set(overall_tokens)
        overall_word_entropy = calculate_shannon_entropy(overall_tokens)
        overall_average_pmi = calculate_avg_bigram_pmi(all_extracted_text_for_vocab, min_freq=PMI_BIGRAM_FREQ_THRESHOLD)
        overall_avg_doc_length_chars = np.mean(all_char_counts) if all_char_counts else 0

        # ---- Pass 2: per-file divergence from the whole dataset ----
        # Only meaningful when the dataset contains at least one word.
        if file_data and overall_word_distribution:
            for file_info in file_data:
                if 'extracted_text' in file_info and file_info['extracted_text']:
                    jsd_word = calculate_ca_divergence(file_info['extracted_text'], all_extracted_text_for_vocab, unit='word', smoothing=1e-9, base=2)
                    file_info['js_dist'] = jsd_word
                else:
                    file_info['js_dist'] = np.nan # Set to NaN if no text
        else:
            for file_info in file_data:
                file_info['js_dist'] = np.nan

        # ---- DataFrame summary (console) and CSV export ----
        if file_data:
            df_data_list = []
            for item in file_data:
                df_data_list.append({
                    'Filename': _short_display_name(item['filename']),
                    'Size (MB)': item.get('storage_size_mb', 0.0),
                    'Word Count': item.get('word_count', 0),
                    'Unique Words': item.get('unique_word_count', 0),
                    'Doc Length (Chars)': item.get('char_count', 0),
                    'Language': item.get('language', 'N/A'),
                    'Tables': item.get('table_count', 0),
                    'Figures': item.get('figure_count', 0),
                    'Annotations': item.get('annotation_count', 0),
                    'Char Entropy': item.get('char_entropy', 0.0),
                    'Word Entropy': item.get('word_entropy', 0.0),
                    'Avg PMI': item.get('average_pmi', 0.0),
                    'JS Dist': item.get('js_dist', np.nan)
                })
            df = pd.DataFrame(df_data_list)

            original_stdout.write("\n\n--- Pandas DataFrame Summary ---\n")
            with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):
                original_stdout.write(str(df.head()) + "\n")
            original_stdout.write(f"DataFrame shape: {df.shape}\n")

            numerical_column_names = [
                'Size (MB)', 'Word Count', 'Unique Words', 'Doc Length (Chars)',
                'Char Entropy', 'Word Entropy', 'Avg PMI', 'JS Dist'
            ]
            numerical_data = df[numerical_column_names].copy()

            original_stdout.write("\n\n--- Numerical Data DataFrame Summary ---\n")
            with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):
                original_stdout.write(str(numerical_data.head()) + "\n")
            original_stdout.write(f"Numerical DataFrame shape: {numerical_data.shape}\n")
            original_stdout.write(str(numerical_data.dtypes) + "\n")

            # Best-effort export: a failed CSV write must not prevent the
            # Markdown report from being produced.
            try:
                numerical_data.to_csv(numerical_data_csv_file, index=False)
                original_stdout.write(f"\nNumerical data saved to {numerical_data_csv_file}\n")
            except Exception as e:
                original_stdout.write(f"\nError saving numerical data to CSV: {e}\n")
        else:
            original_stdout.write("\n\n--- Pandas DataFrame Summary ---\n")
            original_stdout.write("No data to create DataFrame.\n")

        # ---- Markdown report: per-file table (captured via print) ----
        print("\n## Per-File Analysis Results")
        if file_data:
            print("\n| Filename | Size (MB) | Word Count | Unique Words | doc length | Language | Tables | Figures | Annotations | Char Entropy | Word Entropy | Avg PMI | JS Dist |")
            print("|---|---|---|---|---|---|---|---|---|---|---|---|---|")
            for file_info in file_data:
                js_dist = file_info.get('js_dist', np.nan)
                js_dist_formatted = "NaN" if np.isnan(js_dist) else f"{js_dist:.4f}"
                print(
                    f"| {_short_display_name(file_info['filename'])}"
                    f" | {file_info.get('storage_size_mb', 0.0):.2f}"
                    f" | {file_info.get('word_count', 0)}"
                    f" | {file_info.get('unique_word_count', 0)}"
                    f" | {file_info.get('char_count', 0)}"
                    f" | {file_info.get('language', 'N/A')}"
                    f" | {file_info.get('table_count', 0)}"
                    f" | {file_info.get('figure_count', 0)}"
                    f" | {file_info.get('annotation_count', 0)}"
                    f" | {file_info.get('char_entropy', 0.0):.2f}"
                    f" | {file_info.get('word_entropy', 0.0):.2f}"
                    f" | {file_info.get('average_pmi', 0.0):.4f}"
                    f" | {js_dist_formatted} |"
                )
        else:
            print(f"No Markdown files found in the specified directory: {source_directory}")

        # ---- Markdown report: overall summary ----
        print("\n## Overall Dataset Summary")
        total_files = len(file_data)
        print(f"\n### 1. Number of MD Files: {total_files}")

        if total_files > 0:
            # Inside this branch every per-file list has at least one
            # element, so mean/min/max need no further emptiness guards.
            all_storage_sizes_bytes = [f.get('storage_size_bytes', 0) for f in file_data]
            total_storage_size_bytes = sum(all_storage_sizes_bytes)
            total_storage_size_mb = total_storage_size_bytes / (1024 * 1024) if total_storage_size_bytes > 0 else 0
            avg_file_size_mb = statistics.mean(all_storage_sizes_bytes) / (1024 * 1024)
            min_file_size_mb = min(all_storage_sizes_bytes) / (1024 * 1024)
            max_file_size_mb = max(all_storage_sizes_bytes) / (1024 * 1024)
            print("\n### 2. Storage Size")
            print(f"- **Total:** {total_storage_size_mb:.2f} MB ({total_storage_size_bytes} bytes)")
            print(f"- **Average:** {avg_file_size_mb:.2f} MB")
            print(f"- **Range:** ({min_file_size_mb:.2f} MB, {max_file_size_mb:.2f} MB)")

            all_word_counts = [f.get('word_count', 0) for f in file_data]
            total_word_count = sum(all_word_counts)
            avg_word_count = statistics.mean(all_word_counts)
            vocabulary_size = len(overall_vocabulary)
            print("\n### 3. Textual Content Size")
            print(f"- **Total words/tokens:** {total_word_count}")
            print(f"- **Average words/tokens per document:** {avg_word_count:.2f}")
            print(f"- **Unique tokens across dataset (vocabulary size):** {vocabulary_size}")

            total_estimated_info_content_bits_filesize = total_storage_size_bytes * 8
            print("\n### 4. Information Content (Estimated)")
            print(f"- **Total estimated info content (based on total file size):** {total_estimated_info_content_bits_filesize} bits")
            print("- *Note: This estimate is based on raw file size in bits.*")

            print("\n### 4b. Information Theory Metrics (Overall Dataset)")
            print(f"- **Overall Character Entropy:** {overall_char_entropy:.2f} bits/character")
            print(f"- **Overall Word Entropy:** {overall_word_entropy:.2f} bits/word")
            print(f"- **Average Bigram PMI (Threshold={PMI_BIGRAM_FREQ_THRESHOLD}):** {overall_average_pmi:.4f}")
            print(f"  *Note: Average PMI is calculated for bigrams appearing at least {PMI_BIGRAM_FREQ_THRESHOLD} times across the dataset.*")
            print("  *Note: Per-file JSD is calculated against the overall dataset word distribution.*")
            print(f"- **Overall Average Document Length (Characters):** {overall_avg_doc_length_chars:.2f}")

            mean_len = statistics.mean(all_word_counts)
            median_len = statistics.median(all_word_counts)
            # The sample standard deviation needs at least two data points.
            std_dev_len = statistics.stdev(all_word_counts) if len(all_word_counts) > 1 else 0
            print("\n### 5. Document Length Distribution (in words/tokens)")
            print(f"- **Mean length:** {mean_len:.2f}")
            print(f"- **Median length:** {median_len}")
            print(f"- **Standard deviation:** {std_dev_len:.2f}")

            print("\n### 6. Language Distribution")
            all_languages = [f.get('language', 'N/A') for f in file_data]
            language_counts = Counter(all_languages)
            print("- **Counts:**")
            for lang, count in language_counts.most_common():
                print(f"   - {lang}: {count} files")

            print("\n### 7. Structural Elements Summary")
            total_tables_found = sum(f.get('table_count', 0) for f in file_data)
            total_figures_found = sum(f.get('figure_count', 0) for f in file_data)
            total_annotations_found = sum(f.get('annotation_count', 0) for f in file_data) # Will be 0 for MD
            files_with_tables = sum(f.get('has_tables', False) for f in file_data)
            files_with_figures = sum(f.get('has_figures', False) for f in file_data)
            files_with_annotations = sum(f.get('has_annotations', False) for f in file_data) # Will be 0 for MD

            # Percentages are computed outside the f-strings; total_files is
            # known to be positive here, so the division is safe.
            pct_with_tables = files_with_tables / total_files * 100
            pct_with_figures = files_with_figures / total_files * 100
            pct_with_annotations = files_with_annotations / total_files * 100

            print(f"- **Total Tables found (heuristic):** {total_tables_found}")
            print(f"- **Total Figures/Images found:** {total_figures_found}")
            print(f"- **Total Annotations found:** {total_annotations_found}")
            print(f"- **Files with Tables (heuristic):** {files_with_tables} ({pct_with_tables:.2f}%)")
            print(f"- **Files with Figures/Images:** {files_with_figures} ({pct_with_figures:.2f}%)")
            print(f"- **Files with Annotations:** {files_with_annotations} ({pct_with_annotations:.2f}%)")
        else:
            print("No data collected from MD files.")
finally:
    # Always hand stdout back, on success and on failure alike.
    sys.stdout = original_stdout
    sys.stdout.flush()

# Everything print()-ed during the analysis is now the report body.
markdown_content = report_output.getvalue()

# Echo the report to the console between two rules.
banner_rule = "=" * 30
print(f"\n{banner_rule} MD File Analysis Report {banner_rule}")
print("\n*Note: Markdown rendering may vary depending on the viewer.*\n")
print(markdown_content)
print("=" * 79)

# Persist the same text as a Markdown file; a failure is reported but does
# not raise.
try:
    with open(output_markdown_file, 'w', encoding='utf-8') as report_file:
        report_file.write(markdown_content)
    print(f"\nAnalysis report also saved to {output_markdown_file}")
except Exception as save_error:
    print(f"\nError saving report to {output_markdown_file}: {save_error}")

# The buffer is no longer needed once its contents have been written out.
report_output.close()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PROMET02\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Analyzing Markdown files in directory: D:\Dataset\Lagerugpijn\pseudonymized-epds


--- Pandas DataFrame Summary ---
   Filename  Size (MB)  Word Count  Unique Words  Doc Length (Chars) Language  Tables  Figures  Annotations  Char Entropy  Word Entropy   Avg PMI   JS Dist
0  ...59037   0.014140        1836           484               14386       nl       0        0            0      4.916925      8.207334  6.930547  0.169945
1  ...59684   0.013328        1657           569               13533       nl       0        0            0      4.914231      8.583490  6.922197  0.150735
2  ...60038   0.012178        1512           525               12388       nl       0        0            0      4.835315      8.450317  6.661283  0.128479
3  ...60384   0.013181        1717           497               13375       nl       0        0            0      4.812887      8.202289  6.542843  0.121215
4  ...60818   0.011514        1408           500               11675       nl       0        0          