## Main Body Scrape Testing and Final Functions

### 1) Packages and Modules

In [1]:
#Creating connection with websites:
import requests

#Regex operations for data cleaning:
import re

#Extracting relevant items from html:
from bs4 import BeautifulSoup

#Testing scraper
import time 
import pandas as pd

#Connecting with OpenAI:
import openai

#Importing api_key:
from open_ai_key import OPENAI_API_KEY


### 4) The plan for connecting all classes and methods

Create three classes: 1) Connect, 2) Scrape, 3) Clean_Filter

1) Connect's Methods and Returns:
    - request
    - Return bs4 object and confirmation of success/no success of connection

2) Scrape's Methods and Returns:
    - paper_type_new
    - extract_old
    - section_looper
    - extract_new
    - final_extract
    - Return main_keys, main_body paragraph by paragraph
    - Return Abstract
    - Return image links with figure legends (make it so that you can display images here)
    - Return Year of paper
    - Return section number
    
3) Clean_Filter's Methods and Returns:
    - convert_text (regex)
    - extract_sections
    - Return cleaned text

### 5) Sample Class Architecure Version 2

In [2]:
### Class for connecting to website
class Connect:
    """Connects to the website retrieving the data in 'HTML' format stored as a bs4 object."""
    
    def __init__(self, url):
        self.url = url
    
    def request(self):
        response = requests.get(self.url)
        if response.status_code!=200:
            print("""The request to the url could not be made.\n
            Please check if your link is correct and it works in your browser.""")
        else:
            print("Successfully retrieved information from the url.")
            soup = BeautifulSoup(response.text, 'html.parser')
            return soup, True

In [11]:
### Class for all scraping activities
class Scrape:
    """The "Scrape" class contains all the methods needed to extract information from a paper, paragraph
       by paragraph. The only needed argument is the bs4 object created using the 'request' method from the 
       "Connect" class (Note: 'request' method also returns a second argument representing confirmation of 
       connection success.)."""
    
    def __init__(self, bs4_obj):
        self.bs4_obj = bs4_obj
    
    def paper_type_new(self):
        """Outputs a classification result, True means it is a new paper. 
        Futher along the line it serves to apply the correct scraping method."""

        if self.bs4_obj.find_all('div', attrs={"class":"JournalFullText"})==[]:
            return False
        else:
            return True
    
    def extract_abstract(self):
        """As the abstract is not part of the main body of text, a separate method created here will be used
           in the main body extraction methods below."""
        try:
            if self.bs4_obj.find('div','JournalAbstract').find('p')==None:
                abstract = self.bs4_obj.find('div','JournalAbstract').find('div','abstracttext')
            else:
                abstract = self.bs4_obj.find('div','JournalAbstract').find('p')
            return abstract
        except:
            print("This article does not have an abstract.")
    
    
    """METHODS FOR OLD HTML FORMAT EXTRACTION"""
    #################################################
    def extract_old(self):
        """Returns the main body as dictionary and names of sections within the article, skipping 
           the first one as it is not a section and represents the type of article."""
        
        #For this extraction method we need to start from the 'h2' sections themselves.
        bs4_obj = self.bs4_obj.find_all('h2')
        #Creating a dictionary to store the main body of the text
        main_body = {}
        section_count = len(bs4_obj)
        
        #Appending the abstract
        main_body['Abstract'] = [self.extract_abstract()]

        for section in range(1,section_count):
            content_list = []
            section_name = bs4_obj[section].get_text()
            data = bs4_obj[section].parent.next_sibling.next_siblings
            #Utilising section looper method
            nest_content = self.section_looper(data)
            for item in nest_content:
                content_list.append(item)

            main_body[section_name] = content_list  

        return main_body.keys(), main_body
    
    def section_looper(self,data):
        temp_list = []
        for x in data:
        #Check it's descendants for y.name=='h2'
            temp_soup = BeautifulSoup(str(x),"html.parser")
        #pprint(temp_soup)

            if temp_soup.find('h2'):
                break 
            else:
                temp_list.append(x)   

        return temp_list
    
    
    """METHODS FOR NEW HTML FORMAT EXTRACTION"""
    #################################################
    def extract_new(self):
        bs4_obj = self.bs4_obj
        main_body = {}
        section_count = len(bs4_obj.find_all('h2'))
        
        #Appending the abstract
        main_body['Abstract'] = [self.extract_abstract()]

        for i in range(1,section_count):
            content_list = []
            section_name = ''

            #Extracting the name of the section to insert as dictionary key
            tags = bs4_obj.find_all('h2')[i]
            temp_soup = BeautifulSoup(str(tags),'html.parser')
            section_name = temp_soup.get_text()
            
            if section_name=="Footnotes":
                extract_new_foot(bs4_obj)

            if section_name=="References":
                extract_new_ref(bs4_obj)


            else:
                for x in tags.next_siblings:
                    if x.name!='h2':
                        content_list.append(x)
                    else:
                        break
                        
            main_body[section_name] = content_list
        return main_body.keys(), main_body
    
    def extract_new_ref(self, bs4_obj):
        """Reference specific method, utilized above in extract_new()."""
        if bs4_obj.find('div','References')!=None:
                    for x in bs4_obj.find_all('div', 'References'):
                        for i in x.descendants:
                            if i.name=='p':
                                content_list.append(i)
    def extract_new_foot(self, bs4_obj):
        """Footnotes specific method, utilized above in extract_new()."""
        for x in bs4_obj.find('ol').descendants:
                    if x.name=='p' or x.name=='li':
                        content_list.append(x)
                    if x.name=='a':
                        if x.get('href').startswith('http'):
                            content_list.append(x.get('href'))
    
    """METHODS FOR EXTRACTING OTHER INFORMATION"""
    ##############################################
    def section_number(self):
        """Encodes the number of sections, would be used standalone or in other methods."""
#         tags = bs4_obj.find_all('h2')[i]
#         temp_soup = BeautifulSoup(str(tags),'html.parser')
#         section_name = temp_soup.get_text()
        pass
            
     def extract_article_type(self):
#          article = self.request(.find('div','header-bar-one').find('h2')?
#          return article_type?
        pass
        
    def extract_image_links(self):
        img_links = []
        for i in self.request().find_all('img', 'lazy'):
            img_links.append(i.get('data-src'))
            
        return img_links
    


In [13]:
### Cleaning class
class Clean:
    def just_text(self, main, main_keys):
        for key in list(main_keys):
            for bs4_elem in main[key]:
                index = main[key].index(bs4_elem)
                main[key].pop(index)
                
                #Additional regex
                text = bs4_elem.get_text()
                filtered_text = re.sub('\s{2,8}','',text)
                
                main[key].insert(index,filtered_text)
                
        return main

In [14]:
### Sample Architecture

def output():
    user_input = str(input("Please input the link of the paper: "))
    print("\n")

    obj = Connect(user_input)
    req_obj, confirmation = obj.request()
    scraped = Scrape(bs4_obj=req_obj)
    print("\n")

    if scraped.paper_type_new()==True and confirmation==True:
        print("Successfully connected.")
        print("This paper is new.")
        print("Using the new paper extraction method.")
        main_keys, main = scraped.extract_new()
        
        #Clean
        clean = Clean()
        clean_main = clean.just_text(main, main_keys)
        
        return main_keys, clean_main


    if scraped.paper_type_new()==False and confirmation==True:
        print("Successfully connected.")
        print("This paper is old.")
        print("Using the old paper extraction method.")
        main_keys, main = scraped.extract_old()
        
        #Clean
        clean = Clean()
        clean_main = clean.just_text(main, main_keys)
        
        return main_keys, clean_main

    elif confirmation==False:
        print("""Connecting to the link was unsuccessful. Please check if the link is copied correctly including the 
                 'https://..' part in the beggining of the link and do not include any whitespaces.""")



In [15]:
main_keys, main = output()
#https://www.frontiersin.org/articles/10.3389/neuro.06.004.2008/full

Please input the link of the paper: https://www.frontiersin.org/articles/10.3389/neuro.06.004.2008/full


Successfully retrieved information from the url.


Successfully connected.
This paper is old.
Using the old paper extraction method.


In [16]:
main_keys

dict_keys(['Abstract', 'Introduction', 'Representational Similarity Analysis – Step-By-Step', 'Empirical Results and their Interpretation', 'The Broad Potential of Representational Similarity Analysis', 'Discussion', 'Appendix', 'Methodological Details', 'Conflict of Interest Statement', 'Acknowledgements', 'Footnotes', 'References'])

In [17]:
main['Introduction']

['Relating Representations in Brains and Models',
 '\n',
 'A computational model of a single neuron (e.g., in V1) can be tested and adjusted on the basis of electrophysiological recordings of the activity of that type of neuron under a variety of circumstances (e.g., across different stimuli). This has been one successful avenue of evaluating computational models of single neurons with brain-activity data (e.g., David and Gallant, 2005; Koch, 1999; Rieke et al., 1999). This single-unit fitting approach becomes intractable, however, for computational models at a larger scale of organization, which simulate comprehensive brain information processing and include populations of units with different functional properties. A major problem in relating such models to brain-activity data is the spatial correspondency problem: Which single-cell recording or functional magnetic resonance imaging (fMRI) voxel corresponds to which unit of the computational model? Defining a one-to-one mapping betwe

### 6) Testing

In [13]:
paper_list = ['https://www.frontiersin.org/articles/10.3389/neuro.06.004.2008/full', 
              'https://www.frontiersin.org/articles/10.3389/fncom.2016.00003/full',
              'https://www.frontiersin.org/journals/oncology/articles/10.3389/fonc.2021.789659/full',
              'https://www.frontiersin.org/articles/10.3389/fimmu.2022.990900/full',
              'https://www.frontiersin.org/articles/10.3389/fmed.2022.833829/full',
              'https://www.frontiersin.org/articles/10.3389/fmicb.2022.980903/full']

#### Testing code

In [14]:
### Need a version that doesn't require user input so we can input from list automatically
def output_v2(link):
    #user_input = str(input("Please input the link of the paper: "))
    user_input = link
    print("\n")

    obj = Connect(user_input)
    req_obj, confirmation = obj.request()
    scraped = Scrape(bs4_obj=req_obj)
    print("\n")

    if scraped.paper_type_new()==True and confirmation==True:
        print("Successfully connected.")
        print("This paper is new.")
        print("Using the new paper extraction method.")
        main_keys, main = scraped.extract_new()
        
        #Clean
        clean = Clean()
        clean_main = clean.just_text(main, main_keys)
        
        return main_keys, clean_main

    if scraped.paper_type_new()==False and confirmation==True:
        print("Successfully connected.")
        print("This paper is old.")
        print("Using the old paper extraction method.")
        main_keys, main = scraped.extract_old()
        
        #Clean
        clean = Clean()
        clean_main = clean.just_text(main, main_keys)
        
        return main_keys, clean_main

    elif confirmation==False:
        print("""Connecting to the link was unsuccessful. Please check if the link is copied correctly including the 
                 'https://..' part in the beggining of the link and do not include any whitespaces.""")

In [16]:
def testing(paper_list):
    """Takes a list of paper links as an argument, connects to it and scrapes data.
       the user would then check if the scraping has been done correctly. After 30 
       seconds a user input prompt will ask whether the paper was correctly scraped or not.
       Outputs a pandas dataframe containing the perfromance results."""
    test_results = {'Link':[],
                'Result':[]}

    for link in paper_list:
        out_keys, out_main = output_v2(link)
        print(out_keys)
        print('\n')
        print(out_main)

        time.sleep(30)
        user_verdict = input('Was this paper correctly scraped?')

        if user_verdict=='y':
            test_results['Link'].append(link)
            test_results['Result'].append("Pass")
        elif user_verdict=='n':
            test_results['Link'].append(link)
            test_results['Result'].append("Fail")
            
    df = pd.DataFrame(test_results)
            
    return test_results

In [17]:
df_test = testing(paper_list)



Successfully retrieved information from the url.


Successfully connected.
This paper is old.
Using the old paper extraction method.
dict_keys(['Abstract', 'Introduction', 'Representational Similarity Analysis – Step-By-Step', 'Empirical Results and their Interpretation', 'The Broad Potential of Representational Similarity Analysis', 'Discussion', 'Appendix', 'Methodological Details', 'Conflict of Interest Statement', 'Acknowledgements', 'Footnotes', 'References'])


{'Abstract': ['A fundamental challenge for systems neuroscience is to quantitatively relate its three major branches of research: brain-activity measurement, behavioral measurement, and computational modeling. Using measured brain-activity patterns to evaluate computational network models is complicated by the need to define the correspondency between the units of the model and the channels of the brain-activity data, e.g., single-cell recordings or voxels from functional magnetic resonance imaging (fMRI). Similar corres

Was this paper correctly scraped?y


Successfully retrieved information from the url.


Successfully connected.
This paper is new.
Using the new paper extraction method.


AttributeError: 'str' object has no attribute 'get_text'