In [1]:
from bs4 import BeautifulSoup, Tag, NavigableString
from pprint import pprint

#### The code below illustrates how to split HTML content using BeautifulSoup
It shows how to extract the plain text, the links, and the link texts.

In [2]:
# Sample HTML fragment: plain text interleaved with two anchor tags.
s = 'Hello world. <a href="https://www.espn.com/">ESPN</a> is a great sports website. <a href="https://www.cricbuzz.com/">Cric Buzz</a> is another sports website for crickets.'
soup = BeautifulSoup(s, 'html.parser')

# Walk the top-level nodes in document order: text nodes are kept as they are,
# and every <a> tag that carries an href becomes a {"link", "title"} record.
# Any other kind of node is skipped.
out = []
for node in soup.contents:
    if isinstance(node, NavigableString):
        out.append(node)
        continue
    if isinstance(node, Tag) and node.name == 'a' and node.has_attr('href'):
        out.append({"link": node['href'], 'title': node.get_text()})

pprint(out)

['Hello world. ',
 {'link': 'https://www.espn.com/', 'title': 'ESPN'},
 ' is a great sports website. ',
 {'link': 'https://www.cricbuzz.com/', 'title': 'Cric Buzz'},
 ' is another sports website for crickets.']


#### The example below illustrates the following:

1. It shows how to extract the href values from the page source.
2. It shows how to use urljoin to create the full URL from a relative URL.
3. It shows how to extract the script content and substitute the script variable into the URL dynamically to get the full URL.

In [4]:
import requests
import re
from urllib.parse import urljoin

page_url = 'https://www.sfma.org.sg/member/category/manufacturer'

# Bound the request time and fail loudly on an HTTP error status, so that an
# error page is never parsed as if it were the member listing.
info = requests.get(page_url, timeout=30)
info.raise_for_status()
soup = BeautifulSoup(info.content, 'html.parser')

# Print the relative hrefs found on anchors with class "plink". These contain a
# placeholder that is meant to be populated dynamically from a script value.
links = soup.find_all('a', attrs={'class': 'plink'})
for link in links:
    link_href = link.get('href', '')  # tolerate anchors that have no href attribute
    if link_href.startswith('../'):
        print(link_href)

print('----getting the full url from the relative url and the script variable')

# Compiled once, outside the loop: captures the quoted value that follows "permalink:".
permalink_pattern = re.compile(r'permalink:\'(.*?)\'')

# Relative link template; everything before the first "{" is the fixed prefix
# to which each permalink value is appended.
href_template = "../info/{{permalink}}"
href_prefix = href_template.split('{')[0]

script_sections = soup.find_all('script')
for script in script_sections:
    txt = script.get_text()
    if not txt:
        continue  # script tag without inline text: nothing to search
    # Resolve each permalink against the page URL to obtain the absolute URL.
    for permalink in permalink_pattern.findall(txt):
        print(urljoin(page_url, href_prefix + permalink))

../info/{{permalink}}
----getting the full url from the relative url and the script variable
https://www.sfma.org.sg/member/info/1a-catering-pte-ltd
https://www.sfma.org.sg/member/info/a-linkz-marketing-pte-ltd
https://www.sfma.org.sg/member/info/aalst-chocolate-pte-ltd
https://www.sfma.org.sg/member/info/abb-pte-ltd
https://www.sfma.org.sg/member/info/ace-synergy-international-pte-ltd
https://www.sfma.org.sg/member/info/acez-instruments-pte-ltd
https://www.sfma.org.sg/member/info/acorn-investments-holding-pte-ltd
https://www.sfma.org.sg/member/info/ad-wright-communications-pte-ltd
https://www.sfma.org.sg/member/info/added-international-s-pte-ltd
https://www.sfma.org.sg/member/info/advance-carton-pte-ltd
https://www.sfma.org.sg/member/info/agroegg-pte-ltd
https://www.sfma.org.sg/member/info/airverclean-pte-ltd
https://www.sfma.org.sg/member/info/ajinomoto-singapore-pte-ltd
https://www.sfma.org.sg/member/info/all-big-frozen-food-pte-ltd
https://www.sfma.org.sg/member/info/american-food-