In [161]:

import pandas as pd
import re
import string

from urllib.parse import unquote

def parse_markdown(markdown_path):
    """Parse the pre-cleaned effects markdown file into one row per effect.

    The text is split into discipline sections on ``###`` headings and, within
    each discipline, into effect sections on ``####`` headings. For every
    effect the heading text, the first non-empty line below it (the
    description) and the first value of each labelled field are collected.

    Parameters
    ----------
    markdown_path : str or path-like
        Path of the markdown file to read. The file is decoded as UTF-8.

    Returns
    -------
    pandas.DataFrame
        Columns ``Discipline``, ``Effect``, ``Description``, ``Status``,
        ``Original paper``, ``Critiques``, ``Original effect size`` and
        ``Replication effect size``. A field that is not found is an empty
        string. When no effect section is found, an empty frame with these
        columns is returned.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # which differs between operating systems.
    with open(markdown_path, 'r', encoding='utf-8') as f:
        content = f.read()
    # Remove all HTML tags from content
    content = re.sub(r'<[^>]+>', '', content)

    field_patterns = {
        'Status': r'\*{0,2}Status\*{0,2}:\s*(.*?)\n',
        'Original paper': r'\*{0,2}Original paper\(?s?\)?\*{0,2}:\s*(.*?)\n',
        'Critiques': r'\*{0,2}Critique[s]*\*{0,2}:\s*(.*?)\n',
        'Original effect size': r'\*{0,2}Original effect size[s]*\*{0,2}:\s*(.*?)\n',
        'Replication effect size': r'\*{0,2}Replication effect size[s]*\*{0,2}:\s*(.*?)\n'
    }
    columns = ['Discipline', 'Effect', 'Description', *field_patterns]
    strip_chars = string.punctuation + string.whitespace

    parsed_rows = []
    for discipline in re.split(r'\n###\s+', content):
        discipline_name = discipline.split('\n')[0].strip(strip_chars)

        # Everything before the first '####' is the discipline heading itself.
        for effect_section in re.split(r'\n####\s*', discipline)[1:]:
            lines_in_effect_section = effect_section.split('\n')
            first_line = lines_in_effect_section[0].strip()

            # A '###' heading that ended up directly behind a '####' marker
            # starts a new discipline rather than describing an effect.
            if re.match(r"^###\s*\*\*", first_line) is not None:
                discipline_name = first_line[4:].strip()
                continue

            body_lines = lines_in_effect_section[1:]
            description = next((line.strip() for line in body_lines if line.strip()), '')

            # The field patterns only match up to a newline, so terminate the
            # text with one; otherwise a field on the very last line of the
            # file (no trailing newline) would be missed.
            remaining_effect_lines = '\n'.join(body_lines) + '\n'

            row = {
                'Discipline': discipline_name,
                'Effect': first_line,
                'Description': description,
            }
            for field, pattern in field_patterns.items():
                found = re.search(pattern, remaining_effect_lines)
                row[field] = found.group(1) if found else ''
            parsed_rows.append(row)

    df_fixed = pd.DataFrame(parsed_rows, columns=columns)
    if df_fixed.empty:
        # Nothing to clean up; the column-wise operations below need rows.
        return df_fixed

    df_fixed['Discipline'] = df_fixed['Discipline'].str.strip('*_ ')
    # Drop surrounding markup/punctuation and a leading repetition of the
    # effect name from the description.
    df_fixed['Description'] = df_fixed.apply(
        lambda row: re.sub(
            f"^{re.escape(row['Effect'])}",
            "",
            row['Description'].strip(strip_chars)
        ).strip(strip_chars),
        axis=1
    )
    return df_fixed

def parse_critiques(parsed_data):
    """Split the 'Critiques' field into one row per linked critique.

    Each markdown link ``[text](url)`` in the field, together with the text
    that follows it up to the next link, becomes one output row.

    Parameters
    ----------
    parsed_data : pandas.DataFrame
        Must contain the columns ``Discipline``, ``Effect``, ``Description``
        and ``Critiques`` (strings).

    Returns
    -------
    pandas.DataFrame
        Columns ``Discipline``, ``Effect``, ``Description``, ``Critique``
        (link text), ``Link``, ``doi`` (URL-decoded, or None), ``Notes``,
        ``Citations`` (or None) and ``N critique`` (sample sizes found in the
        notes, joined with ' & '). Rows whose field contains no link produce
        no output; if nothing is parsed at all, an empty frame with these
        columns is returned.
    """
    parsed_data = parsed_data[['Discipline', 'Effect', 'Description', 'Critiques']]
    output_columns = [
        'Discipline', 'Effect', 'Description',
        'Critique', 'Link', 'doi', 'Notes', 'Citations', 'N critique'
    ]

    # Regular expression pattern for extracting critique, link, and notes
    pattern = r'\[\s*(.+?)\s*\]\((.*?(?=\)\s|\)[.,;]|\)$))\)([^[]+?(?=\[.*\]\(|$))'
    doi_pattern = r'10\..*?\/[^/]*(?=[\s,;.]\)|[#?&]|\/|$)'
    n_pattern = r"\b[Nn]\s*=\s*([\d\.]+(?:,[\d]+)*)(?![\d\.])"
    # Splits notes into free text, a citation count and its parenthesised source
    notes_pattern = r'(.*?)citations?\s*=\s*([\d,]+)\s*\(([^)]+)\)'

    new_rows = []
    for _, row in parsed_data.iterrows():
        critiques_str = row['Critiques']

        # The notes contain many stray square brackets, so only brackets that
        # belong to a markdown link are kept before the main pattern is applied.
        # Step 1: protect every "](" with a unique marker
        temp_str = critiques_str.replace("](", "__UNIQUE_MARKER__")
        # Step 2: remove all standalone closing square brackets "]"
        temp_str = re.sub(r'\](?![^(]*\()', '', temp_str)
        # Step 3: remove opening brackets that are not followed by a marker
        temp_str = re.sub(r'\[(?![^\[\]]+__UNIQUE_MARKER__)', '', temp_str)
        # Step 4: restore "]("
        final_str = temp_str.replace("__UNIQUE_MARKER__", "](")

        # Step 5: one (critique, link, notes) tuple per markdown link
        for critique, link, notes in re.findall(pattern, final_str):
            new_row = row.copy()
            new_row['Critique'] = critique.strip()
            new_row['Link'] = link.strip()

            doi_matches = re.findall(doi_pattern, link.strip())
            # Double decoding - as some dois will be URL encoded
            new_row['doi'] = unquote(unquote(doi_matches[0])) if doi_matches else None

            match = re.match(notes_pattern, notes.strip())
            if match:
                new_row['Notes'] = match.group(1).strip().replace("*", "").replace("_", "")
                new_row['Citations'] = match.group(2) + " (" + match.group(3) + ")"
            else:
                new_row['Notes'] = notes.strip()
                new_row['Citations'] = None

            new_row['N critique'] = ' & '.join(re.findall(n_pattern, new_row['Notes']))
            new_rows.append(new_row)

    if not new_rows:
        # Without rows there is no 'Critiques' column to drop; return the
        # expected schema instead of failing with a KeyError.
        return pd.DataFrame(columns=output_columns)

    # The raw field has been expanded into the new columns, so drop it.
    return pd.DataFrame(new_rows).drop(columns=['Critiques'])

def parse_originals(parsed_data):
    """Split the 'Original paper' field into one row per linked paper.

    Parameters
    ----------
    parsed_data : pandas.DataFrame
        Must contain the columns ``Discipline``, ``Effect``, ``Description``
        and ``Original paper`` (strings).

    Returns
    -------
    pandas.DataFrame
        Columns ``Discipline``, ``Effect``, ``Description``, ``Paper title``,
        ``Paper ref``, ``Link``, ``doi``, ``Paper Notes``, ``Citations`` and
        ``N orig``. Every markdown link matched in the field yields its own
        row. A field in which nothing matches is printed and yields a single
        row with the whole string in ``Paper Notes`` and no title/link.
        An empty input yields an empty frame with these columns.
    """
    parsed_data = parsed_data[['Discipline', 'Effect', 'Description', 'Original paper']]
    output_columns = [
        'Discipline', 'Effect', 'Description',
        'Paper title', 'Paper ref', 'Link', 'doi', 'Paper Notes', 'Citations', 'N orig'
    ]

    # Regular expression patterns for extracting elements of the citation
    doi_pattern = r'10\..*?\/[^/]*(?=[\s,;.]\)|[#?&]|\/|$)'
    pattern = r"\[([^]]+)\]\((.*?(?=\)\s|\)[.,;]|\)$))\)(?:[,\s;]*?([^;]+?[0-9]{4}[a-d]?)(?:[;,. ] ?(.*$))?)?"
    notes_pattern = r'(.*?)citations?(.*?$)'
    # '~' and ')' are kept when trimming notes and citation text
    punctuations_to_remove = ''.join([p for p in string.punctuation if p not in ['~', ')']])
    n_pattern = r"\b[Nn]\s*=\s*([\d\.]+(?:,[\d]+)*)(?![\d\.])"
    trim_chars = punctuations_to_remove + string.whitespace

    new_rows = []
    for _, row in parsed_data.iterrows():
        # Quote characters are removed before matching
        paper_str = row['Original paper'].replace("‘", "").replace("’", "").replace("'", "")
        paper_list = re.findall(pattern, paper_str)

        if not paper_list:
            # Matching failed: report the string and keep it unparsed
            print(paper_str)
            new_row = row.copy()
            new_row['Paper title'] = None
            new_row['Paper ref'] = None
            new_row['Link'] = None
            new_row['doi'] = None
            new_row['Paper Notes'] = paper_str  # Place entire string in 'Paper Notes'
            # Set these explicitly so the columns exist even if no row matched
            new_row['Citations'] = None
            new_row['N orig'] = ''
            new_rows.append(new_row)
            continue

        for title, link, author_year, notes in paper_list:
            new_row = row.copy()
            new_row['Paper title'] = title.strip()
            new_row['Paper ref'] = author_year.strip(string.punctuation + string.whitespace)
            new_row['Link'] = link.strip()

            doi_matches = re.findall(doi_pattern, link.strip())
            # Double decoding - as some dois will be URL encoded
            new_row['doi'] = unquote(unquote(doi_matches[0])) if doi_matches else None

            match = re.match(notes_pattern, notes.strip())
            if match:
                new_row['Paper Notes'] = match.group(1).strip().replace("*", "").replace("_", "").strip(trim_chars)
                new_row['Citations'] = match.group(2).strip(trim_chars)
            else:
                new_row['Paper Notes'] = notes.strip()
                new_row['Citations'] = None
            new_row['N orig'] = ' & '.join(re.findall(n_pattern, new_row['Paper Notes']))

            # Append inside the loop so that every matched paper is kept,
            # not only the last one.
            new_rows.append(new_row)

    if not new_rows:
        # Without rows there is no 'Original paper' column to drop.
        return pd.DataFrame(columns=output_columns)

    # Drop the original 'Original paper' column
    return pd.DataFrame(new_rows).drop(columns=['Original paper'])

def parse_orig_effects(parsed_data):
    """Return the effect identifiers with the original effect size, underscores removed.

    Parameters
    ----------
    parsed_data : pandas.DataFrame
        Must contain ``Discipline``, ``Effect``, ``Description`` and
        ``Original effect size``.

    Returns
    -------
    pandas.DataFrame
        A copy restricted to those four columns; the input is not modified.
    """
    effect_column = 'Original effect size'
    selected = parsed_data[['Discipline', 'Effect', 'Description', effect_column]].copy()

    # Underscores are markdown emphasis markers around the statistic names
    cleaned_sizes = selected[effect_column].str.replace("_", "")
    selected[effect_column] = cleaned_sizes
    return selected

def parse_critique_effects(parsed_data):
    """Return the effect identifiers with the replication effect size, underscores removed.

    Parameters
    ----------
    parsed_data : pandas.DataFrame
        Must contain ``Discipline``, ``Effect``, ``Description`` and
        ``Replication effect size``.

    Returns
    -------
    pandas.DataFrame
        A copy restricted to those four columns; the input is not modified.
    """
    parsed_data = parsed_data[['Discipline', 'Effect', 'Description', 'Replication effect size']].copy()

    # Use the string accessor: Series.replace("_", "") would only replace
    # cells that are exactly "_" and leave underscores inside text untouched.
    parsed_data['Replication effect size'] = (
        parsed_data['Replication effect size'].str.replace("_", "", regex=False)
    )

    return parsed_data


Manual precleaning steps applied to the markdown so that sections and effects parse more easily:
- Replace `####\n` by `####`
- Replace `\n### \n` by `\n###` (ensure this also happens before Social Psych at the top)
- Replace empty headings (`###\s+\n`) by nothing
- Delete `<p` tags and the image errors that follow them
- Remove line breaks within fields, indicated by `\` at the end of a line

Some individual fixes to original papers
- Easterlin paradox double link
- outgroup bias by a chance win or loss double link
- Brehm )
- Neonate imitation link
- Gender effects of political candidates - link
- Differential reinforcement of low rates of behaviour (DRL) - original *paper*

And to critiques:
-  Implicit God prime increases actual risky behaviour - link
- Multiple intelligences - links
- Stereotype threat on gender differences in political knowledge - added paper link: https://doi.org/10.1017/XPS.2022.35
- Gender effects of political candidates - link scope
- Desire-state attribution may govern food sharing in Eurasian jays - section label
- Status-legitimacy effect - double link
- Automatic imitation - link scope
- Transposed word effect - stray [.
- Lexical precision on lexical competition - wrong link replaced by NA for now
- stray duplicated link removed throughout: https://www.tandfonline.com/doi/full/10.1080/02699931.2018.1468732
- Left digit bias - duplicated link
- Structural brain-behaviour correlations - the association between behavioural activation and white matter integrity - is the corrigendum link even visible? Needs fixing

Personal cognitive dissonance - fixed line break for original effect size

In [162]:

# Parse the pre-cleaned markdown once, then derive the per-paper tables from it
parsed_markdown = parse_markdown('FORRT effects v3.md')
critiques = parse_critiques(parsed_markdown)
originals = parse_originals(parsed_markdown)

# Export the effect list (identifier columns only) as CSV
effect_overview = parsed_markdown[['Discipline', 'Effect', 'Description']]
effect_overview.to_csv('parsed_effects.csv', index=False)

# Export critiques and original papers as both CSV and Excel
critiques.to_csv('parsed_critiques.csv', index=False)
critiques.to_excel('parsed_critiques.xlsx', index=False)
originals.to_csv('parsed_originals.csv', index=False)
originals.to_excel('parsed_originals.xlsx', index=False)



In [147]:
import requests
import re

def get_abstract(doi, timeout=10):
    """Fetch the abstract registered for a DOI from the CrossRef works API.

    Parameters
    ----------
    doi : str
        The DOI to look up; it is appended to the CrossRef ``/works/`` URL.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    str or None
        The abstract with ``<jats:p>`` tags turned into newlines and all
        other tags replaced by a space. None when the request fails, the
        status is not 200, the body is not valid JSON, or the record has no
        abstract.
    """
    # Perform HTTP request to CrossRef API
    url = f"https://api.crossref.org/works/{doi}"
    try:
        # Without a timeout a stalled connection would block indefinitely.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # A network problem for one DOI should not abort a whole batch;
        # report "no abstract" like the other failure paths do.
        return None
    if response.status_code != 200:
        return None

    # Extract abstract text
    try:
        abstract_raw = response.json()['message']['abstract']
    except (KeyError, TypeError, ValueError):
        # Missing field, unexpected structure, or a body that is not JSON
        return None

    # Remove paragraph tags
    abstract_clean = re.sub(r'<jats:p>|<\/jats:p>', '\n', abstract_raw)

    # Remove other tags
    abstract_clean = re.sub(r'<[^>]*>', ' ', abstract_clean)

    return abstract_clean


In [106]:
# Extract the two effect-size tables from the parsed markdown
effects_original = parse_orig_effects(parsed_markdown)
effects_critiques = parse_critique_effects(parsed_markdown)

# Original effect sizes are written as CSV, replication effect sizes as Excel
effects_original.to_csv('effects_original.csv', index=False)
effects_critiques.to_excel('effects_critiques.xlsx', index=False)

In [152]:
from tqdm.notebook import tqdm_notebook

# Register tqdm with pandas; this adds `progress_apply` to Series/DataFrame
tqdm_notebook.pandas()

# Update DataFrame with abstracts and show progress.
# `progress_apply` is what draws the progress bar; a plain `apply` runs silently.
# Rows without a DOI are skipped and get None.
originals['Abstract'] = originals['doi'].progress_apply(lambda x: get_abstract(x) if pd.notnull(x) else None)
critiques['Abstract'] = critiques['doi'].progress_apply(lambda x: get_abstract(x) if pd.notnull(x) else None)


In [153]:
# Write the critiques table to Excel again (same file as the earlier export);
# in top-to-bottom order this follows the cell that adds the 'Abstract' column.
critiques.to_excel('parsed_critiques.xlsx', index=False)

In [146]:
# Give the columns of each table a source-specific name; 'Link', 'doi' and
# 'Citations' exist in both the originals and the critiques table.
originals = originals.rename(columns={
    'Paper title': 'Original_Paper_title',
    'Paper ref': 'Original_Paper_ref',
    'Link': 'Original_Link',
    'doi': 'Original_doi',
    'Paper Notes': 'Original_Paper_Notes',
    'Citations': 'Original_Citations',
    'N orig': 'Original_N'
})
critiques = critiques.rename(columns={
    'Critique': 'Critique_text',
    'Link': 'Critique_Link',
    'doi': 'Critique_doi',
    'Notes': 'Critique_Notes',
    'Citations': 'Critique_Citations',
    'N critique': 'Critique_N'
})
effects_original = effects_original.rename(columns={'Original effect size': 'Effect_size_original'})
effects_critiques = effects_critiques.rename(columns={'Replication effect size': 'Effect_size_replication'})

# Left-join the other tables onto the originals, matching on the columns
# that identify an effect
merge_keys = ['Discipline', 'Effect', 'Description']
result = (
    originals
    .merge(critiques, on=merge_keys, how='left')
    .merge(effects_original, on=merge_keys, how='left')
    .merge(effects_critiques, on=merge_keys, how='left')
)

# Desired column order: identifiers, original paper, critique, citation counts
column_order = [
    'Discipline', 'Effect', 'Description',
    'Original_Paper_title', 'Original_Paper_ref', 'Original_Link', 'Original_doi',
    'Original_N', 'Effect_size_original', 'Original_Paper_Notes',
    'Critique_text', 'Critique_Link', 'Critique_doi',
    'Critique_N', 'Effect_size_replication', 'Critique_Notes',
    'Original_Citations', 'Critique_Citations'
]

# Keep only the listed columns, in that order, and write the result
result = result[column_order]
result.to_excel("merged.xlsx")


In [189]:
import re

# Scratch check of the citation pattern against a few hand-written links.
# NOTE: unlike the pattern used in parse_originals, this one has no `\)` after
# the URL group, so the closing parenthesis shows up at the start of "Year".
pattern = r"\[([^]]+)\]\((.*?(?=\)\s|\)[.,;]|\)$))(?:[,\s;]*?([^;]+?[0-9]{4}[a-d]?)(?:[;,. ] ?(.*$))?)?"
text = """
[Example 4](https://example.com/abc) 2021; Additional info
[Example 5](https://example.com/abc(1,2,3)he) hello
[Example 6](https://example.com/abc(1,2,3)) hello
[Example 7](https://example.com/abc;def) 2021, Some Notes
"""

matches = re.findall(pattern, text)

# Every match is a 4-tuple (one entry per capture group); groups that did not
# take part in the match are empty strings.
for number, (title, url, year, extra) in enumerate(matches, start=1):
    print(f"Match {number}:")
    print(f"  Title: {title}")
    print(f"  URL: {url}")
    print(f"  Year: {year}")
    print(f"  Additional Info: {extra}")


Match 1:
  Title: Example 4
  URL: https://example.com/abc
  Year: ) 2021
  Additional Info: 
Match 2:
  Title: Example 5
  URL: https://example.com/abc(1,2,3)he
  Year: 
  Additional Info: 
Match 3:
  Title: Example 6
  URL: https://example.com/abc(1,2,3)
  Year: 
  Additional Info: 
Match 4:
  Title: Example 7
  URL: https://example.com/abc;def
  Year: ) 2021
  Additional Info: Some Notes
