In [1]:
import bs4 as bs
from requests_html import HTMLSession
import requests

class Scraper:
    def __init__(self, url,handle_exception = True):
        self.url = url
        self.session = HTMLSession()
        self.flag = False
        if handle_exception:
            try:
                self.parser = self.session.get(url)
                self.parser.raise_for_status()  # This will raise an exception for HTTP errors
                self.soup = bs.BeautifulSoup(self.parser.text, 'lxml')
            except requests.exceptions.RequestException as e:
                raise e  # Propagate the error so it can be caught in the tests
        else:
            try:
                self.parser = self.session.get(url)
                self.parser.raise_for_status()  # This will raise an exception for HTTP errors
                self.soup = bs.BeautifulSoup(self.parser.text, 'lxml')
            except:
                self.flag = True
                pass

    def get_visible_text(self):
        if self.flag:
            return "Unique__Placeholder"
        text = self.soup.get_text(separator=" ",strip = True)
        return text

    def get_links(self):
        if self.flag:
            return set()
        links = self.parser.html.absolute_links
        return links
    
        


In [2]:
# use dpr to select best 10 at each depth
import relavance
from relavance import relavance_score

# class deep_scrape:
#     def __init__(self, url,query = None,relavance_method = 'dpr' depth = 3, max_links = 50,topk = 5):
#         self.url = url
#         self.session = HTMLSession()
#         self.max_depth = depth
#         self.max_links = max_links
#         self.cur_depth = 0
#         self.stack = [url]
#         self.vis = set()
#         self.document_store = []
#         self.reference_store = []
#         self.query = query
#         self.topk = topk
#         self.relavance_method = relavance_method
#         if self.query:
#             self.relavance = relavance_score(query,relavance_method)
    
#     def recursive_scrape(self):
#         depth = self.max_depth
#         nlinks = self.max_links
#         # add text extract from the first element of stack into the document store or context
#         while len(self.stack)>0 and depth > 0 and nlinks > 0:
#             cur_len = len(self.stack)
#             self.vis.union(set(self.stack))
#             new_context,reference_urls,new_urls = self.scrape(self.stack)
            
#             # can potentially add an extra pre processing step at the cost of more api calls, to clean the scraped data.
#             # can also store vectors if needed
#             self.document_store.extend(new_context)
#             self.reference_store.extend(reference_urls)
#             new_urls = list(set(new_urls).difference(self.vis))
#             self.stack = new_urls

#             # decrease the depth if necessary
#             depth -= 1
#             nlinks -= cur_len
        
#     def scrape(self,urls):
#         sc_objs = list(map(Scraper,urls,False))
#         sc_urls = list(map(Scraper.get_links,sc_objs))
#         sc_text = list(map(Scraper.get_visible_text,sc_objs))
        
#         if self.query:
#             if self.relavance_method == 'dpr':
#                 scores = self.relavance.dpr_score(sc_text)
#             else:
#                 scores = list(map(self.relavance.bm25_score,sc_text))
#                 topk_texts = sorted(zip(scores,sc_text,urls),key=lambda x : x[0])
#                 topk_texts,topk_urls = [x[1],x[2] for x in topk_texts[:self.topk]]
                
#             return topk_texts,topk_urls, sc_urls
        
#         else:
#             sc_text,sc_urls,topk_urls
            
    
        
        
class deep_scrape:
    def __init__(self, url,query = None,relavance_method = 'dpr',depth = 3, max_links = 50,topk = 5):
        self.url = url
        self.session = HTMLSession()
        self.max_depth = depth
        self.max_links = max_links
        self.cur_depth = 0
        self.stack = [url]
        self.vis = set()
        # self.document_store = []
        # self.reference_store = []
        self.data_store = {}
        self.query = query
        self.topk = topk
        self.relavance_method = relavance_method
        if self.query:
            self.relavance = relavance_score(query,relavance_method)
    
    def recursive_scrape(self):
        depth = self.max_depth
        nlinks = self.max_links
        # add text extract from the first element of stack into the document store or context
        while len(self.stack)>0 and depth > 0 and nlinks > 0:
            cur_len = len(self.stack)
            self.vis.union(set(self.stack))
            data,new_urls = self.scrape(self.stack)
            
            # can potentially add an extra pre processing step at the cost of more api calls, to clean the scraped data.
            # can also store vectors if needed
            self.data_store.update(data)
            
            ## self.document_store.extend(new_context)
            ## self.reference_store.extend(reference_urls)
            new_urls = list(new_urls.difference(self.vis))
            self.stack = new_urls

            # update the depth and nlinks value. As this is a terminating condition for this function
            depth -= 1
            nlinks -= cur_len
        
    def scrape(self,urls):
        sc_objs = list(map(Scraper,urls,[False]*len(urls)))
        
        sc_urls = list(map(Scraper.get_links,sc_objs))
        # returns a list of sets
        # get a union of all sets in the list
        sc_urls = set().union(*sc_urls)
        
        sc_text = list(map(Scraper.get_visible_text,sc_objs))
        
        if self.query:
            if self.relavance_method == 'dpr':
                scores = self.relavance.dpr_score(sc_text)
            else:
                scores = list(map(self.relavance.bm25_score,sc_text))
                
            topk_texts = sorted(zip(scores,sc_text,urls),key=lambda x : x[0],reverse=True)
            data= {} # dictionary with url : text extracted
            for _,txt,ref in topk_texts[:self.topk]:
                data[ref] = txt
                
            return data, sc_urls
        
        else:
            data = {}
            for i in range(len(urls)):
                data[urls[i]] = sc_text[i]
            
            return data,sc_urls
        

In [1]:
import scraper
from scraper import *

In [7]:
def save_data_to_file(d, path):
    with open(path,'w') as json_file:
        json.dump(d,json_file,indent=4)

In [9]:
import json
save_data_to_file(ds.data_store,'sample_data')

In [3]:
url = 'https://www.sf.gov'
query = "I want to know more about representatives of San Fransisco and some programs available for me in the city."
ds = deep_scrape(url,max_links=3,depth = 2,relavance_method='dpr',query = query)


In [6]:
ds.data_store

{'https://www.sf.gov': "City and County of San Francisco Skip to main content English Español 中文 Filipino Menu Services Departments Search Latest Find jobs with the City Join us! “Sunset over City Hall” by Sergio Ruiz , CC BY Services Activities Things to do in San Francisco. Building Construction resources and property information. Business Starting, owning, and closing a business. Disability Services and resources for the disability community in San Francisco. Food Get free or low-cost food, meals and find local food pantries. Government Get personal records, pay taxes or fines, work or volunteer with the City. Health Getting medical care, insurance and mental health support. Homelessness Find City and external resources for people experiencing homelessness. Housing Finding and staying in housing. Immigrants Resources and programs helpful to immigrants. Jobs Find jobs, fellowships, and internships with the City of San Francisco. Problems and complaints Tell us about issues. Safety Pe

In [25]:
los = [{'https://www.sf.gov/contact-sfgov', 'https://www.sf.gov/departments', 'https://www.sf.gov/news/mayor-london-breed-names-genny-lim-san-franciscos-poet-laureate', 'https://www.sf.gov/profile/joel-engardio', 'https://www.sf.gov/topics/disability', 'https://www.sf.gov/topics/housing', 'https://www.sf.gov/topics/jobs', 'https://www.sf.gov/', 'https://www.sf.gov/profile/matt-dorsey', 'https://www.sf.gov/news/notice-town-hall-meeting-0', 'https://www.sf.gov/profile/connie-chan', 'https://www.sf.gov/topics/business', 'https://www.sf.gov/profile/brooke-jenkins', 'https://www.sf.gov/profile/paul-miyamoto', 'https://www.sf.gov/topics/activities', 'https://www.sf.gov/news/department-elections-enhances-youth-engagement-november-5-election', 'https://www.sf.gov/topics/safety', 'https://www.sf.gov/profile/shamann-walton', 'https://www.sf.gov/information/about-sfgov', 'https://www.sf.gov/topics/transportation', 'https://www.sf.gov/profile/hillary-ronen', 'https://www.sf.gov/topics/health', 'https://www.sf.gov/news/mayor-london-breed-celebrates-grand-opening-new-100-affordable-housing-project-san-franciscos', 'https://www.sf.gov/profile/david-chiu', 'https://www.sf.gov/profile/joaquin-torres', 'https://www.sfbos.org', 'https://www.sf.gov/profile/ahsha-safai', 'https://www.sf.gov/information/privacy-policy-sfgov', 'https://www.sf.gov/topics/government', 'https://www.sf.gov/topics/building', 'https://www.sf.gov/profile/catherine-stefani', 'https://www.sf.gov/profile/myrna-melgar', 'https://www.flickr.com/photos/sirgious/5599330115', 'https://www.sf.gov/topics/homelessness', 'https://www.sf.gov/profile/aaron-peskin', 'https://www.sf.gov/profile/london-breed', 'https://www.sf.gov/node/195', 'https://www.sf.gov/topics/problems-and-complaints', 'https://www.sf.gov/topics/immigrants', 'https://www.sf.gov/information/disclaimer-sfgov', 'https://www.sf.gov/news/all', 'https://www.sf.gov/profile/jose-cisneros', 'https://www.sf.gov/profile/dean-preston', 'https://www.sf.gov/services', 'https://www.sf.gov/topics/food', 'https://www.sf.gov/profile/rafael-mandelman', 'https://careers.sf.gov/'}]
los= los*2
eset = set()
eset.union(*los)

{'https://careers.sf.gov/',
 'https://www.flickr.com/photos/sirgious/5599330115',
 'https://www.sf.gov/',
 'https://www.sf.gov/contact-sfgov',
 'https://www.sf.gov/departments',
 'https://www.sf.gov/information/about-sfgov',
 'https://www.sf.gov/information/disclaimer-sfgov',
 'https://www.sf.gov/information/privacy-policy-sfgov',
 'https://www.sf.gov/news/all',
 'https://www.sf.gov/news/department-elections-enhances-youth-engagement-november-5-election',
 'https://www.sf.gov/news/mayor-london-breed-celebrates-grand-opening-new-100-affordable-housing-project-san-franciscos',
 'https://www.sf.gov/news/mayor-london-breed-names-genny-lim-san-franciscos-poet-laureate',
 'https://www.sf.gov/news/notice-town-hall-meeting-0',
 'https://www.sf.gov/node/195',
 'https://www.sf.gov/profile/aaron-peskin',
 'https://www.sf.gov/profile/ahsha-safai',
 'https://www.sf.gov/profile/brooke-jenkins',
 'https://www.sf.gov/profile/catherine-stefani',
 'https://www.sf.gov/profile/connie-chan',
 'https://www.

In [4]:
ds.recursive_scrape()

In [5]:
ds.data_store

{'https://www.sf.gov': "City and County of San Francisco Skip to main content English Español 中文 Filipino Menu Services Departments Search Latest Find jobs with the City Join us! “Sunset over City Hall” by Sergio Ruiz , CC BY Services Activities Things to do in San Francisco. Building Construction resources and property information. Business Starting, owning, and closing a business. Disability Services and resources for the disability community in San Francisco. Food Get free or low-cost food, meals and find local food pantries. Government Get personal records, pay taxes or fines, work or volunteer with the City. Health Getting medical care, insurance and mental health support. Homelessness Find City and external resources for people experiencing homelessness. Housing Finding and staying in housing. Immigrants Resources and programs helpful to immigrants. Jobs Find jobs, fellowships, and internships with the City of San Francisco. Problems and complaints Tell us about issues. Safety Pe

In [6]:
url = 'https://www.sf.gov'
#url = 'https://arxiv.org/pdf/1706.03762'

In [7]:
scraper  = Scraper(url,False)

In [12]:
sc_l = list(map(Scraper,[url],[False]))

In [8]:
scraper.soup.get_text(' ',strip = True)

"City and County of San Francisco Skip to main content English Español 中文 Filipino Menu Services Departments Search Latest Find jobs with the City Join us! “Sunset over City Hall” by Sergio Ruiz , CC BY Services Activities Things to do in San Francisco. Building Construction resources and property information. Business Starting, owning, and closing a business. Disability Services and resources for the disability community in San Francisco. Food Get free or low-cost food, meals and find local food pantries. Government Get personal records, pay taxes or fines, work or volunteer with the City. Health Getting medical care, insurance and mental health support. Homelessness Find City and external resources for people experiencing homelessness. Housing Finding and staying in housing. Immigrants Resources and programs helpful to immigrants. Jobs Find jobs, fellowships, and internships with the City of San Francisco. Problems and complaints Tell us about issues. Safety Personal safety and prepa

In [24]:
url = 'https://arxiv.org/pdf/1706.03762'
r = requests.get(url)
soup = bs(r.text, 'html.parser')

TypeError: 'module' object is not callable

In [25]:
r.text

'%PDF-1.5\n%�\n137 0 obj\n<< /Filter /FlateDecode /Length 4011 >>\nstream\nxÚµZK“ä¶‘¾ëWô‘\x15ÑÅ%^|hc\x0f’C+{#4á°zOã>°«ĞUô°È\x16\x1fj�~½3‘\t\x12`qf½»ö‰ �Ä#Ÿ_&˜=\\\x1e²‡\x1f¿ÉøùıÓ7ÿöŸeñ DZ\x19#\x1f�^\x1fDV>\x14"K3U<<�\x1f>&âp\x14Y–%\x7fê¦á Ê¤?Ï§©é»ÃóÓ\x7fÁÇú¡J«\\æîÛ4“0oö Ü4yiReôÃÓ\r¦ù‹=ÍÃ`»ép”¹N:;\x0fuëÛÓûAdI?|\x1a\x1f±«JÚ¾»Ğàxí‡é8ÙáFï7{ë‡ÏÔşˆ{€ÅhI%s8…�—\x15ŠhÖ�)\x05d"�É�iÂº;Sãr0I=Y~\x1bâ½ÇËjY¤…”ñ|ÅvU\xaddZdjwÕ/ñÂÏ!Š�©…Nµ64AÓÁW:OŞêajNs[\x0f\x07U%�Ôy\xad\x0f¢J~=\x08“Xêy±–?øk–ÉáÖ"\x1fu‘Øqª_Úf¼º\x03Ãp=Ò\x13ú\'ş¶\x7f%ÚéÊ\x1d°$7ŞŞ†¾>]-\x7fD›*’Ñş2ÛîÄä·şlÛÆI\x15¿\x01Fo8ä\x0e—«Ô¨œ\x0e7\ru7ze;ÊJ&°ÎKko¸N%’q>]©ßm\x17�mİ]æúbi<X\x11iœh¡qƒ\xad6�¥\x17·F[OÍA$~•\xadx³TVE¼5eîå\x0bdyLõ\x18O¤LZä:&‘wóè,-ô×\'Òy*�ˆIî7dD*•Œ©�ÓÃQk�|˜ovègÒ°,\xadÊP\x06F¦¹,è\x03ûz�y\x02¢\x07ö\x0c\x13²\x19<\x00(–ÌP±r§X\xa0dcCr†ÁSßMM7;M‚×©§çÛ<^¹çÊ\x1f½ôsw®‡Æò´NÃà\x19Z\x1b¼\x06B…7\'Tş€D\n3�–A÷p�"9�-µC�¹ci‘\x16¢¤cÕ\x03èÀdOÓ<¸ÕM–|Tå#µ¤æ†0À¯=-•`Ã²º÷k^ÍG6•ÏoÍ©nÉÊ�‹`×õiê\x07\x1a>õ··\x19\x0cŒÔ\x1b�‚½\x1e4ÙÊÀˆ>ß^ú–\xad¼\x1f\x1b¤æéÉ"óÕ"›\x0eæ

In [16]:
context = scraper.get_visible_text()

In [22]:
context

'%PDF-1.5\n%�\n137 0 obj\n<< /Filter /FlateDecode /Length 4011 >>\nstream\nxڵZK�䶑��W����%^|hc�C+{#4�zO�>���U���j�~�3�\t`qf���� ��#�_&�=\\��������7���e� DZ#�^DV>"K3U<<�>&�pY�%\x7f�� ʤ?ϧ�����\x7f����J�\\���4�0o� �4yiRe���\r���=��`��p��N:;u����AdI?|��Jھ���x��8��F�7{�����{��hI%s8����h֝)d"�ɞiº;S�r0I=Y~���jY����|�vU�dZdjw�/���!����N�64A��W:O��ajNs[U%��y��J~=�X�y��?�k����"u��q�_�f���p=��\'��\x7f%����$7�ކ�>]-\x7fD�*���2�����l��I�Fo8���Ԩ�7\ru7ze;�J&��Kko�N%�q>]��m�m�]��bi (��P�r�X�dcCr��S�MM7;M�ש���<^�����sw����N��Z�B�7\'T��D\n3��A�p�"9�-�C��ci���c���dO�<��M�|T�#���0��=-�`ò��k^�G6��oͩn�ʐ�`��i�>��������4����>�^��������"��"��1b\r�� �{�\'��]d���pH����:8��Dy�����X�3��:�Urm�g�y��³d��B�a�k��@���� g��<���@wZ���b��Q������n�,�N�N��I�N��s�=t�B/qڠ��i�_�R7ćw�䗿|@aZI�\x7f? h�J�@q��� "G�4*��$O#��%�/M�46d�Wgd��d+�WΎ~V�rӪL�B��8SP�ӵa\t4�բ��A��MS���"{��\r��]\r�A��v>�����G@��}���7ӵ�6Ĩ�ۢN���V��Z��_\t�����g>\r����B=���G�@�-L\t�[��%�鵶�Lב-��A��1g���:�`�ܚ��/��A�Bo�i�G��8\x7f�0ʴ�`/$з�c�����`0\'Kq8�+�bjɘkDǆ��T��e�47 �9�q

In [11]:
import os
from dotenv import load_dotenv, dotenv_values 
# loading variables from .env file
load_dotenv() 

import cohere
co = cohere.Client(api_key=os.environ['COHERE_API_KEY'],)



In [12]:
response = co.chat(
  model="command-r",
  #message="Write a title for a blog post about API design. Only output the title text."
  message = context + "The previous text is scraped from a web page, summarize the information for me"
)

print(response.text) # "The Art of API Design: Crafting Elegant and Powerful Interfaces"


The text provides an overview of the services and resources available in the City and County of San Francisco. It lists various activities and resources for residents, including job opportunities, business support, disability services, food assistance, government services, healthcare, and resources for the homeless. It also highlights the city's recent news and events, such as the appointment of Genny Lim as San Francisco's Poet Laureate and the opening of a new affordable housing project. The elected officials, including Mayor London Breed and the Board of Supervisors, are introduced, along with their contact details. The text further provides a disclaimer and privacy policy, encouraging residents to contact the city for any assistance or feedback.


In [13]:
response = co.chat(
  model="command-r",
  #message="Write a title for a blog post about API design. Only output the title text."
  message = "what was my previous messagee"
)

print(response.text)

I'm sorry, but without additional context, I cannot tell you what your previous message was, as I have no record of your past conversations or messages. Can you remember what you previously said or asked?


In [None]:
recursive_scraper()