In [None]:
import os
import pickle
import re
import time
from datetime import datetime
from math import log10
from random import choice
from urllib.parse import urljoin, urlparse

import pandas as pd
import pymysql
from bs4 import BeautifulSoup
from requests import Session

Conexão com o banco de dados

In [None]:
# Connection settings are read from the environment so that credentials do not
# have to live in the notebook. The fallbacks reproduce the values this cell
# used before, so the notebook keeps working when no variables are set.
# NOTE(review): the fallback password is still in plain text; once every
# environment exports RANKRAWLER_DB_PASSWORD, remove it and rotate the password.
conn = pymysql.connect(
    host=os.environ.get('RANKRAWLER_DB_HOST', '127.0.0.1'),
    user=os.environ.get('RANKRAWLER_DB_USER', 'root'),
    passwd=os.environ.get('RANKRAWLER_DB_PASSWORD', '210909'),
    db='mysql',
    charset='utf8',
)
# One module-level cursor is shared by every Scraper method below.
cur = conn.cursor()
cur.execute("USE rankrawlerdb")

Ajuste do tamanho da coluna no Pandas

In [None]:
# Show up to 300 characters per cell so long URLs are not truncated.
# The fully qualified option name is used: abbreviated names are resolved by
# pattern matching and can become ambiguous when pandas adds new options.
pd.set_option('display.max_colwidth', 300)

Classe de Domínio de cada URL. (Abandonada)

In [None]:
class Domain:
    """Bookkeeping record for one host (netloc) met during a crawl.

    Attributes:
        netloc: host part of the URLs belonging to this domain.
        last_access: timestamp, in ``time.time()`` seconds, of the last request.
        ignore: flag marking the domain as one that should not be fetched.
    """

    def __init__(self, netloc, last_access=0, ignore=False):
        self.netloc, self.last_access, self.ignore = netloc, last_access, ignore

    def can_access(self, wait_time=1.5):
        """Tell whether more than ``wait_time`` seconds passed since ``last_access``."""
        elapsed = time.time() - self.last_access
        return elapsed > wait_time

Classe de Página de cada url

In [None]:
class Page:
    """One web page: fetches it and extracts metadata, keyword matches and links.

    Every attribute that ``get_page_info`` may fill in is initialised in
    ``__init__`` so that an unfetched or failed page can still be printed and
    inspected without raising ``AttributeError``.
    """

    # Modifying the user agent helps the bot look a little more human.
    user_agent = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/80.0.3987.122 Safari/537.36')
    # Forwarded to requests as ``verify``. ``True`` selects the library's default
    # CA bundle. The previous value was a path inside one user's home directory,
    # which made every request fail (and be recorded as 403) on other machines.
    # Set this to a file path to pin a specific bundle.
    ca_bundle = True
    # Seconds to wait for a response before giving up.
    request_timeout = 50

    # Equivalent to the original r"^\/?^[^#& ]+": both '^' anchor at position 0,
    # so the optional slash could only ever match the empty string.
    _HREF_FILTER = re.compile(r"^[^#& ]+")
    # Keeps the href up to the first '#' or '&' (fragment and extra query
    # parameters are dropped, as in the original implementation).
    _HREF_CLEAN = re.compile(r"^[^#&]+")
    # Hrefs using these schemes are not pages and must not be crawled.
    _SKIPPED_SCHEMES = re.compile(r"(?i)^(mailto|javascript|tel|data):")

    def __init__(self, url):
        self.url = url
        self.domain = urlparse(url).netloc
        self.internal_links = set()
        self.external_links = set()
        self.links = []
        # Filled in by get_page_info(); the defaults describe "nothing fetched yet".
        self.status_code = None
        self.last_access = None
        self.lang = None
        self.title = None
        self.h1 = None
        self.description = None
        self.keywords = None
        self.has_viewport = False
        self.search_results = {'in_h1': False, 'in_title': False, 'in_description': False, 'body': 0}
        self.match_sample = None

    def get_page_info(self, search_keyword=''):
        """Download the page and populate this object's attributes.

        Args:
            search_keyword: words to look for in the page; when empty, no
                keyword search is performed.

        Any failure while requesting or parsing the page is recorded as status
        code 403. Only ``Exception`` subclasses are caught, so Ctrl-C
        (``KeyboardInterrupt``) still stops a running crawl.
        """
        headers = {'User-Agent': self.user_agent}
        try:
            with Session() as session:
                response = session.get(self.url, headers=headers, verify=self.ca_bundle,
                                       timeout=self.request_timeout)
            self.status_code = response.status_code
            self.last_access = datetime.now()
            print(response)
            if response.status_code < 400:
                self._extract(BeautifulSoup(response.text, 'html5lib'), search_keyword)
        except Exception:
            # In every case tested, errors were raised when the page refused to
            # answer as expected. This shortcut sped up development of the work.
            self.status_code = 403

    def _extract(self, bs, search_keyword):
        """Copy metadata, keyword matches and links from a parsed document."""
        self.lang = self.get_tag_attr_text(bs, 'html', 'lang')
        self.title = self.get_tag_text(bs, 'title')
        self.h1 = self.get_tag_text(bs, 'h1')
        # Anchored patterns: an unanchored 'description' would also match meta
        # names such as 'twitter:description'.
        self.description = self.get_tag_attr_text(bs, 'meta', 'name', '^description$', target='content')
        self.keywords = self.get_tag_attr_text(bs, 'meta', 'name', '^keywords$', target='content')
        self.has_viewport = self.get_tag_attr_text(bs, 'meta', 'name', '^viewport$') is not None

        if search_keyword != '':
            self.search_results = self.get_search_results(bs, search_keyword)
            self.match_sample = self.get_match_sample(bs, search_keyword)

        self.get_links(bs)

    def get_search_results(self, bs_obj, keyword):
        """Report where the words of ``keyword`` occur in the page.

        Returns:
            dict with boolean keys ``in_h1``, ``in_title``, ``in_description``
            (true when any word occurs there) and integer ``body``: the number
            of case-insensitive matches in the body text averaged over the
            distinct words, truncated to an int.
        """
        matches = {'in_h1': False, 'in_title': False, 'in_description': False, 'body': 0}
        # split() without arguments drops the empty strings produced by repeated
        # or surrounding spaces; an empty word would match at every position.
        words = set(keyword.split())
        if not words:
            return matches

        body = bs_obj.find('body')
        body_text = body.get_text() if body is not None else ''
        fields = (('in_h1', self.h1), ('in_title', self.title), ('in_description', self.description))
        for word in words:
            # The word is escaped so that characters such as '+' or '(' are
            # matched literally instead of being read as regex syntax.
            pattern = re.compile(re.escape(word), re.IGNORECASE)
            for key, text in fields:
                if not matches[key] and text is not None:
                    matches[key] = pattern.search(text) is not None
            matches['body'] += len(pattern.findall(body_text))
        matches['body'] = int(matches['body'] / len(words))
        return matches

    def get_links(self, bs_obj):
        """Collect the anchors of the document into the link sets.

        Hrefs are classified by their form: root-relative and relative links go
        to ``internal_links``; protocol-relative, absolute http(s) and
        ``www.``-prefixed links go to ``external_links``. ``links`` becomes the
        union of both sets.
        """
        if bs_obj is None:
            return
        scheme = urlparse(self.url).scheme
        raw_url = f"{scheme}://{self.domain}"

        for link in bs_obj.find_all("a", href=self._HREF_FILTER):
            result = self._HREF_CLEAN.match(link.attrs["href"])[0]
            if self._SKIPPED_SCHEMES.match(result):
                continue
            if result.startswith('//'):
                # Protocol-relative reference: inherits the scheme of this page.
                self.external_links.add(f"{scheme}:{result}")
            elif result.startswith('/'):
                self.internal_links.add(raw_url + result)
            elif re.match(r'(?i)^https?://', result) is not None:
                self.external_links.add(result)
            elif result.startswith('www.'):
                self.external_links.add("https://" + result)
            else:
                # Relative reference: resolved against this page's own URL, not
                # against the site root.
                self.internal_links.add(urljoin(self.url, result))
        self.links = list(self.internal_links | self.external_links)

    def print_all_info(self):
        """Print every instance attribute as ``name:value``."""
        for key, item in vars(self).items():
            print(str(key) + ":" + str(item))

    def get_match_sample(self, bs_obj, keyword):
        """Return a text snippet illustrating the match, or None.

        Preference order: title, description, h1, then a randomly chosen body
        fragment that starts with the whole ``keyword``.
        """
        results = self.search_results
        if results['in_title']:
            return self.title
        if results['in_description']:
            return self.description
        if results['in_h1']:
            return self.h1
        if results['body'] > 0:
            body = bs_obj.find('body')
            if body is not None:
                pattern = r'(?i)\b' + re.escape(keyword) + r'[^.\n<]+'
                matches = re.findall(pattern, body.get_text())
                if matches:
                    return choice(matches).strip()
        return None

    def __str__(self):
        return f"url: {self.url}, sample: {self.match_sample}"

    @staticmethod
    def get_tag_text(bs_obj, tag):
        """Return the stripped text of the first ``tag`` element, or None."""
        node = bs_obj.find(tag)
        if node is None:
            return None
        return node.get_text().strip()

    @staticmethod
    def get_tag_attr_text(bs_obj, tag, attr, attr_value='.*', target=''):
        """Return an attribute of the first ``tag`` whose ``attr`` matches a pattern.

        Args:
            bs_obj: parsed document to search.
            tag: element name.
            attr: attribute that must match ``attr_value``.
            attr_value: regular expression, matched case-insensitively.
            target: attribute to return; when empty, ``attr`` itself is returned.

        Returns:
            The attribute value, or None when no element matches or the matching
            element lacks the requested attribute.
        """
        result = bs_obj.find(tag, {attr: re.compile(f'(?i){attr_value}')})
        if result is None:
            return None
        # .get() instead of indexing: a <meta name="description"> without a
        # content attribute must yield None rather than raise KeyError.
        return result.get(target if target != '' else attr)
 

Classe do Crawler

In [None]:
class Scraper:
    """Recursive crawler that stores what it finds through the module-level cursor.

    The methods use the notebook-level ``conn`` and ``cur`` objects. All SQL is
    parameterised: URLs and keywords come from scraped pages and user input, so
    they are never interpolated into the statement text.
    """

    def __init__(self, *, ignored_domains=None):
        """
        Args:
            ignored_domains: iterable of netlocs that must never be fetched.
                It is copied, so the caller's list is not mutated and instances
                do not share state.
        """
        self.visited_pages = set()
        self.visited_urls = set()
        self.ignored_domains = list(ignored_domains) if ignored_domains is not None else []
        self.found_domains = set()
        self.last_visited_domain = None

    def crawl(self, url, keyword, layer=1):
        """Fetch ``url``, persist the findings and follow its links.

        Args:
            url: page to visit.
            keyword: search words forwarded to ``Page.get_page_info``.
            layer: remaining depth; recursion stops (and commits) at 0.
        """
        if layer <= 0:
            conn.commit()
            return

        start = time.perf_counter()

        print(f"Looking at ({layer}): " + url)

        # Mark the page being crawled so that a link pointing back to it is not
        # crawled a second time.
        self.visited_urls.add(url)
        self.update_domains()
        page = Page(url)

        # Checked before the politeness pause: there is no reason to wait for a
        # page that will not be requested.
        if page.domain in self.ignored_domains:
            return
        if page.domain == self.last_visited_domain:
            print('Waiting...')
            time.sleep(2)

        page.get_page_info(keyword)

        self.visited_pages.add(page)
        self.last_visited_domain = page.domain

        if page.status_code == 403:
            self.found_domains.add(Domain(page.domain, time.time(), ignore=True))
            if page.domain not in self.ignored_domains:
                self.ignored_domains.append(page.domain)
            self.save_domain(page.domain, True)
        elif page.status_code < 400:
            self.save_domain(page.domain)
            self.save_config(url, keyword, layer)
            self.save_page(page)
            self.save_search(keyword, layer, page)

        for next_url in page.links:
            if next_url not in self.visited_urls:
                self.visited_urls.add(next_url)
                self.crawl(next_url, keyword, layer - 1)

        print(f"Searching for {keyword} on layer {layer} done in {round(time.perf_counter() - start, 2)} second(s)")

    def get_ranking(self, keyword):
        """Score the stored pages for ``keyword``.

        Returns:
            DataFrame with columns ``netloc``, ``url`` and ``score`` (0-100,
            relative to the best page), sorted by descending score. It is empty
            when nothing is stored for the keyword.
        """
        columns = ['netloc', 'url', 'hasTitle', 'hasDescription', 'hasH1', 'hasKeywords', 'hasViewport',
                   'matchesInBody', 'keywordInDescription', 'keywordInH1', 'keywordInTitle']
        cur.execute('SELECT ' + ', '.join(columns) + ' '
                    'FROM result NATURAL JOIN search NATURAL JOIN config '
                    'NATURAL JOIN page NATURAL JOIN domain WHERE keyword = %s', (keyword,))
        # set() removes the duplicated rows produced by pages stored more than once.
        rows = list(set(cur.fetchall()))
        if not rows:
            return pd.DataFrame(columns=['netloc', 'url', 'score'])

        df = pd.DataFrame(rows, columns=columns)
        # NOTE(review): pickle.load executes arbitrary code embedded in the file;
        # only load a classifier file that comes from a trusted source.
        with open('knn-classifier.plk', 'rb') as model_file:
            knn = pickle.load(model_file)
        df['matchesInBody'] = df['matchesInBody'].apply(lambda x: round(log10(x + 1), 2))
        df['score'] = [proba[1] for proba in knn.predict_proba(df[columns[2:]])] + df['matchesInBody']
        # The maximum is computed once instead of once per row.
        best = df['score'].max()
        if best > 0:
            df['score'] = df['score'].apply(lambda x: round((x / best) * 100, 2))
        return df[['netloc', 'url', 'score']].sort_values('score', ascending=False)

    def get_search_history(self):
        """Return the distinct keywords stored in ``config`` as a sequence of 1-tuples."""
        cur.execute("SELECT keyword FROM config GROUP BY keyword")
        return cur.fetchall()

    def update_domains(self):
        """Add the black-listed domains stored in the database to ``ignored_domains``."""
        # The column is selected by name instead of by its position in SELECT *.
        cur.execute("SELECT netloc FROM domain WHERE blackListed = 1")
        for (netloc,) in cur.fetchall():
            # This runs on every crawl step; skip domains already known so the
            # list does not grow with duplicates.
            if netloc not in self.ignored_domains:
                self.ignored_domains.append(netloc)

    @staticmethod
    def save_domain(netloc, ignore=False):
        """Insert ``netloc`` into ``domain`` unless it is already stored."""
        # '=' instead of LIKE: '%' and '_' must not act as wildcards here.
        cur.execute("SELECT netloc FROM domain WHERE netloc = %s", (netloc,))
        if not cur.fetchone():
            cur.execute("INSERT INTO domain (netloc, blackListed) VALUES (%s, %s)",
                        (netloc, 1 if ignore else 0))

    @staticmethod
    def save_config(url, keyword, layers):
        """Insert the (url, keyword, layers) configuration unless already stored."""
        cur.execute("SELECT * FROM config WHERE keyword = %s AND startingUrl = %s AND layers = %s",
                    (keyword, url, layers))
        if not cur.fetchone():
            cur.execute("INSERT INTO config (startingUrl, keyword, layers) VALUES (%s, %s, %s)",
                        (url, keyword, layers))

    @staticmethod
    def save_page(found_page):
        """Store the page together with flags telling which metadata it has."""
        cur.execute("SELECT idDomain FROM domain WHERE netloc = %s", (found_page.domain,))
        domain_id = cur.fetchone()[0]
        cur.execute("INSERT INTO page (url, idDomain, hasTitle, hasH1, hasDescription, hasKeywords, hasViewport) "
                    "VALUES (%s, %s, %s, %s, %s, %s, %s)",
                    (found_page.url, domain_id,
                     1 if found_page.title else 0, 1 if found_page.h1 else 0,
                     1 if found_page.description else 0, 1 if found_page.keywords else 0,
                     1 if found_page.has_viewport else 0))

    @staticmethod
    def save_search(keyword, layer, found_page):
        """Store one search row for the page plus its keyword-match result row."""
        cur.execute("SELECT idConfig FROM config WHERE keyword = %s", (keyword,))
        config_id = cur.fetchone()[0]
        cur.execute("SELECT idPage FROM page WHERE url = %s", (found_page.url,))
        page_id = cur.fetchone()[0]
        cur.execute("INSERT INTO search (idConfig, idPage, layer) VALUES (%s, %s, %s)",
                    (config_id, page_id, layer))

        # Id generated by the INSERT above; assumes idSearch is AUTO_INCREMENT,
        # which the original "SELECT max(idSearch)" lookup relied on as well.
        search_id = cur.lastrowid
        results = found_page.search_results
        cur.execute("INSERT INTO result (idSearch, matchesInBody, keywordInTitle, keywordInDescription, keywordInH1) "
                    "VALUES (%s, %s, %s, %s, %s)",
                    (search_id, results['body'], 1 if results['in_title'] else 0,
                     1 if results['in_description'] else 0, 1 if results['in_h1'] else 0))

In [65]:
# Crawler instance used by the cells below; the listed hosts are never fetched.
scraper = Scraper(
    ignored_domains=[
        'www.facebook.com',
        'www.twitter.com',
    ],
)

In [73]:
# Crawl three layers deep from the Medium home page looking for "job".
# The commit in `finally` keeps whatever was collected, even when the run is
# interrupted from the keyboard.
try:
    scraper.crawl(
        url="https://www.medium.com/",
        keyword="job",
        layer=3,
    )
except KeyboardInterrupt:
    print("Interrompido pelo o Usuário.")
finally:
    conn.commit()

Looking at (3): https://www.medium.com/
<Response [200]>
Looking at (2): https://www.medium.com/jobs-at-medium/work-at-medium-959d1a85284e?source=landing_home---------------------------
Waiting...
<Response [200]>
Looking at (1): https://www.medium.com/jobs-at-medium?source=post_page-----959d1a85284e----------------------
Waiting...
<Response [200]>
Searching for job on layer 1 done in 6.2 second(s)
Looking at (1): https://medium.com/topics?source=post_page-----959d1a85284e----------------------
<Response [200]>
Searching for job on layer 1 done in 1.81 second(s)
Looking at (1): https://help.medium.com/?source=post_page-----959d1a85284e----------------------
<Response [200]>
Searching for job on layer 1 done in 3.69 second(s)
Looking at (1): https://jobs.lever.co/medium?team=Editorial
<Response [200]>
Searching for job on layer 1 done in 2.96 second(s)
Looking at (1): http://medium.com/p/959d1a85284e/share/twitter?source=post_actions_footer---------------------------
<Response [200]>
S

In [68]:
# Print every visited page that produced a match sample, one blank line apart.
for page in scraper.visited_pages:
    sample = getattr(page, 'match_sample', None)
    if not sample:
        continue
    print(page, end="\n\n")

url: https://play.google.com/store/apps/details?id=com.medium.reader, sample: job!",[1583204079,469000000]



In [69]:
# Display the distinct keywords stored in the config table (a sequence of 1-tuples).
scraper.get_search_history()

(('python',),
 ('java',),
 ('data',),
 ('corona',),
 ('javascript',),
 ('flamengo',),
 ('corana',),
 ('browser',),
 ('thread',),
 ('job',))

In [72]:
# Display the 25 best-scored stored pages for the keyword 'flamengo'.
scraper.get_ranking('flamengo').head(25)

Unnamed: 0,netloc,url,score
34,en.wikipedia.org,https://en.wikipedia.org/wiki/Clube_de_Regatas_do_Flamengo,100.0
17,en.wikipedia.org,https://en.wikipedia.org/w/index.php?title=Clube_de_Regatas_do_Flamengo,100.0
33,en.wikipedia.org,https://en.wikipedia.org/wiki/2015_Clube_de_Regatas_do_Flamengo_season,89.82
46,globoesporte.globo.com,https://globoesporte.globo.com/futebol/times/flamengo/noticia/estreia-com-derrota-do-uniforme-cor-cinza-revive-a-maldicao-das-terceiras-camisas-do-flamengo.ghtml,79.34
50,globoesporte.globo.com,https://globoesporte.globo.com/futebol/times/flamengo/noticia/apos-confusao-no-hotel-do-independiente-conmebol-vai-abrir-processo-contra-o-fla.ghtml,71.56
45,oglobo.globo.com,https://oglobo.globo.com/esportes/incendio-deixa-dez-mortos-no-ninho-do-urubu-centro-de-treinamento-do-flamengo-23437241,68.56
22,en.wikipedia.org,https://en.wikipedia.org/wiki/File:Flamengo_braz_logo.svg,67.37
43,interativos.globoesporte.globo.com,https://interativos.globoesporte.globo.com/futebol/times/flamengo/especial/ficou-marcado-na-historia,57.78
25,en.wikipedia.org,https://en.wikipedia.org/wiki/2014_FIBA_Intercontinental_Cup,56.29
3,da.wikipedia.org,https://da.wikipedia.org/wiki/Clube_de_Regatas_do_Flamengo,56.29


In [None]:
# Persist pending changes, then display the ignored domains. A notebook cell
# only renders its last expression, so the list has to come after the commit
# to be shown at all.
conn.commit()
scraper.ignored_domains

In [None]:
# Close the database connection; the shared cursor `cur` is unusable afterwards.
conn.close()