In [7]:
import os
import json
def parse_rmd(file_path):
    """Extract the R code chunks of an R Markdown file with their context.

    The file is scanned line by line. Every chunk opened by a line starting
    with "```{r" and closed by the next line starting with "```" produces one
    record.

    Args:
        file_path: Path of the .Rmd file to read (decoded as UTF-8).

    Returns:
        A list of dicts, one per closed code chunk, in file order, with keys:
            "heading":     text of the most recent markdown heading seen
                           outside a chunk (None if there was none yet),
            "code":        the chunk's lines joined with newlines, stripped,
            "explanation": the non-blank prose lines collected since the
                           previous chunk, joined with single spaces.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    data = []
    current_heading = None
    current_explanation = []
    inside_code = False
    current_code = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.rstrip()

            # Detect start of code block (an opening fence always starts a
            # fresh chunk, discarding any unterminated one).
            if line.startswith("```{r"):
                inside_code = True
                current_code = []
                continue

            if inside_code:
                if line.startswith("```"):
                    # Closing fence: emit the chunk with its context.
                    inside_code = False
                    data.append({
                        "heading": current_heading,
                        "code": "\n".join(current_code).strip(),
                        "explanation": " ".join(current_explanation).strip()
                    })
                    current_explanation = []
                else:
                    # Everything inside a chunk is code -- including lines
                    # starting with '#', which are R comments, not headings.
                    current_code.append(line)
                continue

            # Detect heading (only meaningful outside code chunks)
            if line.startswith('#'):
                current_heading = line.lstrip('#').strip()
                continue

            # Remaining non-blank lines are prose describing the next chunk.
            if line.strip():
                current_explanation.append(line)

    return data

def parse_folder(folder_path):
    """Parse every .Rmd file in a folder and dump all code chunks to JSON.

    Each file whose name ends with ".Rmd" is passed to parse_rmd; every
    returned record gets an extra "source_file" key holding the file name.
    The combined list is written to "all_rmd_code.json" inside folder_path
    (UTF-8, indent of 4, non-ASCII characters kept as-is). Progress and a
    final summary are printed.

    Args:
        folder_path: Directory containing the .Rmd files; also the directory
            the JSON file is written to.

    Returns:
        None. The result is the JSON file on disk.
    """
    all_data = []
    # os.listdir yields names in arbitrary order; sort them so the order of
    # records in the JSON output is the same on every run and machine.
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith(".Rmd"):
            file_path = os.path.join(folder_path, filename)
            print(f"Processing {filename}...")
            file_data = parse_rmd(file_path)
            # Add filename for context
            for block in file_data:
                block["source_file"] = filename
            all_data.extend(file_data)

    # Save JSON in the same folder
    output_json = os.path.join(folder_path, "all_rmd_code.json")
    with open(output_json, 'w', encoding='utf-8') as out:
        json.dump(all_data, out, indent=4, ensure_ascii=False)

    print(f"✅ Extracted {len(all_data)} code blocks into {output_json}")



In [8]:
# Run the extraction over the folder of .Rmd files; the JSON is written into
# that same folder.
# NOTE(review): this is an absolute, user-specific path, so the cell only runs
# on this machine -- consider a configurable directory defined near the top.
parse_folder("/Users/mathildekrafft/Desktop/class SA 2025/Text mining/Insights from Text Data 2025 Day 1/Rmd files")

Processing Insights from Text Data 2025 Day 6.Rmd...
Processing Insights from Text Data 2025 Day 4.Rmd...
Processing Insights from Text Data 2025 Day 5.Rmd...
Processing Insights from Text Data 2025 Day 1.Rmd...
Processing Insights from Text Data 2025 Day 2.Rmd...
Processing Insights from Text Data 2025 Day 3.Rmd...
✅ Extracted 219 code blocks into /Users/mathildekrafft/Desktop/class SA 2025/Text mining/Insights from Text Data 2025 Day 1/Rmd files/all_rmd_code.json
