In [12]:
import pandas as pd
import re
import requests
from lxml import html
import sys

In [13]:
class Scrapper:
    """Same-site crawler that collects article fields into a DataFrame.

    Starting from a seed URL, the crawler downloads pages, harvests every
    link that begins with ``source`` and, for links that match
    ``link_pattern``, evaluates three XPath expressions and appends the
    results as one row of ``df``.

    Parameters
    ----------
    source : str
        Literal URL prefix (e.g. ``'https://kloop.kg'``) that a link must
        start with to be followed. It is escaped before being used in a regex.
    link_pattern : str
        Regular-expression fragment; a page is scraped for data only when
        its URL contains some path characters followed by this pattern.
    xpathHead, xpathPicUrl, xpathText : str
        XPath expressions evaluated against each matching page.
    df : pandas.DataFrame
        Frame that receives one row per scraped page (three columns, in the
        order head, picture URL, text).
    max_while : int
        The crawl stops once the number of passes that discovered no new
        links exceeds this value.
    max_links : int
        The crawl stops once ``df`` holds more than this many rows.
    timeout : float
        Seconds to wait for each HTTP request before giving up on the link.
    """

    # Characters accepted in the path part of a link (lower-case only).
    _PATH_CHARS = r'[a-z0-9\-_./]+'

    def __init__(self, source, link_pattern, xpathHead, xpathPicUrl, xpathText, df,
                 max_while = 1, max_links = 9, timeout = 10):
        self.source = source
        self.link_pattern = link_pattern
        self.xpathHead = xpathHead
        self.xpathPicUrl = xpathPicUrl
        self.xpathText = xpathText
        self.df = df
        self.max_while = max_while
        self.max_links = max_links
        self.timeout = timeout
        self.link_storage = []   # every link discovered so far, in discovery order
        self.link_history = []   # every link already requested, in visit order

    def _mark_visited(self, link):
        """Record ``link`` in the visit history exactly once."""
        if link not in self.link_history:
            self.link_history.append(link)

    def _limit_reached(self):
        """Return True once ``df`` holds more than ``max_links`` rows."""
        return self.df.shape[0] > self.max_links

    def scrape(self, seed_link):
        """Crawl from ``seed_link`` and return the populated DataFrame.

        Each pass walks a snapshot of the known links and visits the ones
        not seen before. The loop ends when the row limit is exceeded or
        when more than ``max_while`` passes found no new links.
        """
        if seed_link not in self.link_storage:
            self.link_storage.append(seed_link)
        stale_passes = 0
        while True:
            known_before = len(self.link_storage)
            # Iterate over a snapshot: extract_links() grows link_storage.
            for link in list(self.link_storage):
                if link not in self.link_history:
                    page = self.load(link)
                    self._mark_visited(link)
                    # A requests.Response is falsy for 4xx/5xx statuses,
                    # and load() returns None when the request itself failed.
                    if page:
                        if self.is_needed(link):
                            self.extract_info(page)
                        self.extract_links(page)
                if self._limit_reached():
                    break
            if known_before == len(self.link_storage):
                stale_passes += 1
            if self._limit_reached() or stale_passes > self.max_while:
                break
        return self.df

    def extract_links(self, page):
        """Add every not-yet-known same-site link found in ``page.text``."""
        # Escape the prefix so that '.' in the host name matches literally.
        pattern = re.escape(self.source) + self._PATH_CHARS
        for link in re.findall(pattern, page.text):
            if link not in self.link_storage:
                self.link_storage.append(link)

    def load(self, link):
        """Download ``link`` and return the response, or None on failure.

        The link is recorded in ``link_history`` whether or not the request
        succeeds, so a broken URL is not retried on every pass.
        """
        self._mark_visited(link)
        try:
            return requests.get(link, timeout=self.timeout)
        except requests.RequestException:
            # One unreachable page must not abort the whole crawl.
            return None

    def is_needed(self, link):
        """Return True when ``link`` matches the data-page pattern."""
        pattern = self._PATH_CHARS + self.link_pattern
        return bool(re.search(pattern, link))

    def extract_info(self, page):
        """Evaluate the three XPath expressions on ``page`` and append a row.

        Each cell holds the raw list returned by ``tree.xpath``.
        """
        tree = html.fromstring(page.content.decode('UTF-8'))
        head = tree.xpath(self.xpathHead)
        pic_url = tree.xpath(self.xpathPicUrl)
        text = tree.xpath(self.xpathText)
        data = [head, pic_url, text]
        row = pd.DataFrame([data], columns=list(self.df.columns))
        # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
        # The empty-frame shortcut avoids concatenating with an empty frame.
        if self.df.shape[0] == 0:
            self.df = row
        else:
            self.df = pd.concat([self.df, row], ignore_index=True)

In [14]:
# Empty result table; column order matches the (head, picture URL, text)
# row that Scrapper.extract_info builds.
df = pd.DataFrame(columns=['title', 'pic_url', 'text'])

# Crawler for kloop.kg: only URLs containing '/blog/' are scraped for data.
scrapper_kloop = Scrapper(
    source='https://kloop.kg',
    link_pattern='/blog/',
    xpathHead='//header/h1/text()',
    xpathPicUrl='(//div[@class="stk-mask"]/img/@src)[1]',
    xpathText="//p[@class='stk-reset']/text()",
    df=df,
)

In [None]:
# Start the crawl at the site root; the returned DataFrame is the cell output.
scrapper_kloop.scrape(seed_link='https://kloop.kg/')

In [None]:
# Display the DataFrame currently held by the scraper.
scrapper_kloop.df