This Notebook is for parsing Elsvier papers

In [None]:

from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch
import json
import re
    
## Load configuration
con_file = open("config.json")
config = json.load(con_file)
con_file.close()

## Initialize client
client = ElsClient(config['apikey'])
# client.inst_token = config['insttoken']



In [3]:
pii_doc = FullDoc(sd_pii = 'S2666920X23000255')
if pii_doc.read(client):
    print ("pii_doc.title: ", pii_doc.title)
    pii_doc.write()   
else:
    print ("Read document failed.")

pii_doc.title:  An AI-enhanced pattern recognition approach to temporal and spatial analysis of children's embodied interactions


In [6]:
# print(json.dumps(pii_doc.data["originalText"]))

# print(pii_doc.title)
text = pii_doc.data["originalText"]

# Extract the relevant portion of the string
pattern = r"Published by Elsevier Ltd\.(.*?)Reference(.*)"
match = re.search(pattern, text, re.DOTALL)
section_info = match.group(1).strip()
full_text = match.group(2).strip()


# Extract the section names
pattern = r"\b(\d{1,2}(?:\.\d{1,2})?)\s+([A-Za-z\s:#]+?(?=\s\d|$))"
matches = re.findall(pattern, section_info)

sections = []
for section_number, section_name in matches:
    sections.append(f"{section_number} {section_name.strip()}")

# Use the section names to get the section contents
section_index = [full_text.find(section) for section in sections]
section_contents = []

i = 0
while i < len(sections)-1:
    section_contents.append(full_text[section_index[i] + len(sections[i]) : section_index[i+1] + len(sections[i+1])])
    i+=1

section_contents.append(full_text[section_index[i] + len(sections[i]) :])

result = [{"section": n, "content": c} for n, c  in zip(sections, section_contents)]
print(json.dumps(result))



[{"section": "1 Introduction", "content": " Artificial Intelligence (AI) has tremendous potential to support educational research. That potential stems from the way AI can support and extend a researcher's ability to detect important features of learning when analyzing video data (Cukurova, Luckin, & Clark-Wilson, 2019). A number of researchers have successfully reported using AI to identify behavioral and physiological patterns in video and audio data that manifest student learning (e.g., Andrade et al., 2016; Di Mitri et al., 2017; Luckin & Cukurova, 2019). Others have similarly utilized AI-enhanced video analysis techniques to explore the relationship between embodied human behaviors and the underlying learning processes (Abrahamson et al., 2021; Cukurova et al., 2018; Lee-Cultura et al., 2022; Worsley et al., 2021). The general idea is that AI can help researchers identify patterns of behavior that reveal how human cognition and learning take place. One of the challenges with AI-en

In [9]:
result[0]["section"]

'1 Introduction'

In [50]:
import re

# Example text
text = """
Some text before
Published by Elsevier Ltd.
This is the content you want to extract.
Published by Elsevier Ltd.
Another instance of the pattern.
"""

# Define the pattern
pattern = r"Published by Elsevier Ltd\."

# Find all matches using re.finditer()
matches = re.finditer(pattern, text)

# Count the number of matches
num_matches = sum(1 for _ in matches)

print("Number of matches:", num_matches)


Number of matches: 2


In [33]:
import re

# Example text
text = """
Some text before
Published by Elsevier Ltd.
This is the content you want to extract.
Reference
Some text after
"""

# Define the pattern
pattern = r"Published by Elsevier Ltd\.(.*?)Reference"

# Find the match using regular expression
section_info = re.search(pattern, text, re.DOTALL)

section_info.group(1).strip()

'Published by Elsevier Ltd.\nThis is the content you want to extract.\nReference'

In [63]:
import re

# Example full text
full_text = """
JOHNSON P 1 Introduction
This is the introduction section.
2 Background
This is the background section.
3 Investigative research
This is the investigative research section.
4 Results and analysis
4.1 Magic act # 1: Hidden in plain sight: When partnerships are not partnerships
This is the first subsection of the Results and analysis section.
4.2 Magic act #2: The disappearing act
This is the second subsection of the Results and analysis section.
5 Conclusion
Hi
"""

# Example list of section names
section_names = ['1 Introduction', '2 Background', '3 Investigative research', '4 Results and analysis', '5 Conclusion']

# Regular expression pattern for matching section headers
pattern = r'\b(\d+\.\s*)?(' + '|'.join(re.escape(name) for name in section_names) + r')\b'

# Find section contents using regular expressions
section_contents = []
matches = re.finditer(pattern, full_text, re.IGNORECASE)
for section_info in matches:
    section_start = section_info.end()
    if section_info.group(1):  # If there is a subsection number
        section_start += len(section_info.group(1))
    next_match = next(matches, None)
    section_end = next_match.start() if next_match else None
    section_content = full_text[section_start:section_end].strip() if section_end else full_text[section_start:].strip()
    section_contents.append(section_content)

# Print the section contents
for content in section_contents:
    print(content)
    print("hi\n")


This is the introduction section.
hi

This is the investigative research section.
hi

Hi
hi



In [73]:
import re

# Example text
text = """
JOHNSON P 1 Introduction
This is the introduction section.
2 Background: Contextualising the Newcastle 500 from a political economy perspective
This is the background section.
3 Investigative research
This is the investigative research section 2019 yes.
4 Results and analysis
This is the results and analysis section.
4.1 Magic act # 1: Hidden in plain sight: When partnerships are not partnerships
This is subsection 4.1.
4.2 Magic act #2: The disappearing act
This is subsection 4.2.
4.3 Magic act # 3: Spinning straw into gold
This is subsection 4.3.
4.4 Magic act # 4 voodoo economics: How costs are hidden or transformed into benefits
This is subsection 4.4.
4.5 Magic act # 5: Ghosting the numbers by falsifying and fudging
This is subsection 4.5.
5 Conclusion
This is the conclusion section.
References
Some references.
"""

# List of section numbers and names
sections = ['1 Introduction', '2 Background: Contextualising the Newcastle 500 from a political economy perspective', '3 Investigative research', '4 Conclusion']

# Initialize a dictionary to store section contents
section_contents = {}

# Iterate over sections
for section in sections:
    # Escape special characters in section name
    section_name = re.escape(section)
    
    # Create a regex pattern to find the section content
    pattern = r"{}([\s\S]*?)(?=\d+\s|$)".format(section_name)
    
    # Find all matches for the section content using regex
    matches = re.finditer(pattern, text, re.DOTALL)
    
    # Extract the section content from each match
    content = '\n'.join(match.group().strip() for match in matches)
    
    # Store the section content in the dictionary
    section_contents[section] = content

# Print the section contents
for section, content in section_contents.items():
    print(section)
    print(content)
    print('-' * 30)


1 Introduction
1 Introduction
This is the introduction section.
------------------------------
2 Background: Contextualising the Newcastle 500 from a political economy perspective
2 Background: Contextualising the Newcastle 500 from a political economy perspective
This is the background section.
------------------------------
3 Investigative research
3 Investigative research
This is the investigative research section
------------------------------
4 Conclusion

------------------------------
