In [46]:

import pandas as pd
import re

def parse_markdown(markdown_path):
    """Parse a pre-cleaned effects markdown file into one row per effect.

    The text is split into sections at every newline that is followed by
    ``####``. A section whose first line starts with ``### **`` sets the
    current discipline. Every other non-blank section is recorded as an
    effect: its first line is the effect name and its first non-empty
    following line is the description, with the first ``* **<effect>**.``
    marker removed.

    Args:
        markdown_path: Path of the markdown file; it is read as UTF-8.

    Returns:
        pandas.DataFrame with the columns ``Discipline``, ``Effect``,
        ``Description``, ``Status``, ``Original paper``, ``Critiques``,
        ``Original effect size`` and ``Replication effect size``. A field
        that does not occur in a section is the empty string. Effects found
        before any discipline heading get an empty ``Discipline``. When no
        effect is found the frame is empty but still has all columns.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(markdown_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Only the first match of each pattern within a section is kept.
    field_patterns = {
        'Status': r'Status:\s*(.*?)\n',
        'Original paper': r'Original paper:\s*(.*?)\n',
        'Critiques': r'Critique[s]*:\s*(.*?)\n',
        'Original effect size': r'Original effect size[s]*:\s*(.*?)\n',
        'Replication effect size': r'Replication effect size[s]*:\s*(.*?)\n'
    }
    columns = ['Discipline', 'Effect', 'Description', *field_patterns]

    records = []
    # Default so that effects preceding the first discipline heading do not
    # hit an unbound variable.
    discipline_name = ''
    for effect_section in re.split(r'\n####\s*', content):
        # A file that starts (or ends) with a heading marker yields a
        # whitespace-only section; it carries no effect.
        if not effect_section.strip():
            continue

        section_lines = effect_section.split('\n')
        first_line = section_lines[0].strip()
        if re.match(r"^###\s*\*\*", first_line) is not None:
            # Drop the leading "### " and the surrounding bold markers.
            discipline_name = first_line[4:].strip().strip('*_ ')
            continue

        effect_name = first_line
        following_lines = section_lines[1:]
        description = next((line.strip() for line in following_lines if line.strip()), '')
        description = description.replace(f"* **{effect_name}**.", "", 1).strip()

        # The split above consumes the newline in front of the next heading,
        # so the last line of a section is unterminated. Re-add a newline,
        # otherwise a field on that line never matches its pattern.
        section_body = '\n'.join(following_lines) + '\n'

        record = {
            'Discipline': discipline_name,
            'Effect': effect_name,
            'Description': description,
        }
        for field, field_pattern in field_patterns.items():
            found = re.search(field_pattern, section_body)
            record[field] = found.group(1) if found else ''
        records.append(record)

    # Explicit columns keep the schema stable even when nothing was parsed.
    return pd.DataFrame(records, columns=columns)

def parse_critiques(critiques_data):
    """Explode the ``Critiques`` column into one row per individual critique.

    Each critique is expected in the form ``[label](link) [notes].``; the
    bracketed notes are optional but the terminating period is required by
    the pattern.

    Args:
        critiques_data: DataFrame containing at least the columns
            ``Discipline``, ``Effect``, ``Description`` and ``Critiques``.

    Returns:
        pandas.DataFrame with the columns ``Discipline``, ``Effect``,
        ``Description``, ``Critique``, ``Link``, ``doi`` and ``Notes``.
        Every output row keeps the index label of the input row it came
        from. ``doi`` is ``None`` when the link contains no ``10.`` prefix.
        Rows whose ``Critiques`` value is not a string (e.g. NaN) or has no
        matching critique contribute nothing; if nothing matches at all the
        result is empty but still has all columns.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    base_columns = ['Discipline', 'Effect', 'Description']
    output_columns = base_columns + ['Critique', 'Link', 'doi', 'Notes']
    critiques_data = critiques_data[base_columns + ['Critiques']]

    # Groups: label, link, notes. The notes group is lazy and stops at the
    # first '.', so notes that themselves contain a period are cut there.
    pattern = r'\[\s*(.+?)\s*\]\((.+?)\)\s*\[*(.*?)\]*\.'
    # From the first "10." up to a '#', a '?' or the end of the link.
    doi_pattern = r'10\..*?(?=[#?]|$)'

    records = []
    row_labels = []
    for idx, row in critiques_data.iterrows():
        critiques_str = row['Critiques']
        # Missing values (NaN/None) cannot be searched with a regex.
        if not isinstance(critiques_str, str):
            continue

        shared_fields = {column: row[column] for column in base_columns}
        for critique, link, notes in re.findall(pattern, critiques_str):
            link = link.strip()
            doi_match = re.search(doi_pattern, link)
            records.append({
                **shared_fields,
                'Critique': critique.strip(),
                'Link': link,
                'doi': doi_match.group(0) if doi_match else None,
                'Notes': notes.strip(),
            })
            row_labels.append(idx)

    # Explicit columns keep the schema stable when there are no critiques.
    return pd.DataFrame(records, index=row_labels, columns=output_columns)


    # Save the new DataFrame to an Excel file


**Manual pre-cleaning steps** (applied to the markdown file by hand so that sections and effects can be parsed more easily):
- Replace `####\n` with `####`
- Replace `\n### \n` with `\n### ` (ensure this also happens before Social Psych at the top)
- Remove empty headings: replace `###\s+\n` with nothing
- Delete `<p` tags and the subsequent image errors
- Remove line breaks within fields, indicated by `\` at the end of a line

In [44]:

# Example usage: parse the pre-cleaned markdown, then export the effects and
# their individual critiques to CSV.
parsed_markdown = parse_markdown('FORRT effects v3.md')
parsed_markdown[['Discipline', 'Effect', 'Description']].to_csv('parsed_effects.csv', index=False)
# parse_critiques selects the 'Critiques' column itself, so it takes the full
# parsed frame. The name `critique_df` used here before is never assigned in
# the visible cells and fails on a fresh kernel.
critiques = parse_critiques(parsed_markdown)
critiques.to_csv('parsed_critiques.csv', index=False)


In [47]:
# Split the parsed effects into one row per individual critique.
critiques = parse_critiques(critiques_data=parsed_markdown)
