# Crawler em Python e MySQL

## Importações e configurações

In [1]:
import urllib3
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import MySQLdb
import configparser
import nltk
import re

In [2]:
def connect():
    """Open a MySQL connection using the settings stored in ``config.ini``.

    The ``[DATABASE]`` section of ``config.ini`` (resolved relative to the
    current working directory) must provide ``user``, ``passwd``, ``db``,
    ``host``, ``port`` and ``autocommit``.

    Returns:
        The connection object returned by ``MySQLdb.connect``.

    Raises:
        KeyError: if the section or any of the options is missing.
        ValueError: if ``port`` is not an integer or ``autocommit`` is not
            one of the boolean spellings accepted by ``configparser``
            (1/0, yes/no, true/false, on/off).
    """
    config = configparser.ConfigParser()
    config.read('config.ini')
    section = config['DATABASE']

    user = section['user']
    passwd = section['passwd']
    db = section['db']
    host = section['host']
    port = int(section['port'])

    # Keep raising KeyError for a missing option: getboolean() on a section
    # proxy would quietly return None instead.
    if 'autocommit' not in section:
        raise KeyError('autocommit')
    # bool() of any non-empty string is True, so "False" or "0" in the file
    # used to enable autocommit. getboolean() parses the text properly.
    autocommit = section.getboolean('autocommit')

    return MySQLdb.connect(user=user, passwd=passwd, db=db,
                           host=host, port=port, autocommit=autocommit)

## Funções de Pré-processamento do Texto

In [3]:
def stemmer(corpus):
    """Return a list with every word of ``corpus`` stemmed by NLTK's RSLP stemmer.

    Args:
        corpus: iterable of words.

    Returns:
        A new list holding the stem of each word, in the same order.
    """
    # A single stemmer instance is shared by all words of this call.
    stem_word = nltk.stem.RSLPStemmer().stem
    return list(map(stem_word, corpus))

In [4]:
def removeTags(sopa):
    """Drop ``<script>`` and ``<style>`` elements and return the remaining text.

    Args:
        sopa: parsed document (callers in this file pass a BeautifulSoup
            object). It is modified in place: the matched elements are
            decomposed.

    Returns:
        The document's stripped strings joined by single spaces.
    """
    unwanted = sopa(['script', 'style'])
    for element in unwanted:
        element.decompose()
    text_pieces = sopa.stripped_strings
    return ' '.join(text_pieces)

In [5]:
def createCorpus(doc):
    """Tokenise ``doc`` and return its stemmed, lower-cased content words.

    Tokens are runs of word characters (``\\w+``). A token is dropped when it
    is a single character or when its lower-cased form is a Portuguese stop
    word (NLTK's list plus ``'é'``). The surviving tokens are lower-cased and
    passed through ``stemmer``.

    Args:
        doc: text to process.

    Returns:
        List of stems in document order (duplicates are kept, so list
        positions can be used as word locations).
    """
    # A set gives constant-time membership tests; the NLTK list would be
    # scanned linearly for every token.
    stop = set(nltk.corpus.stopwords.words('portuguese'))
    stop.add('é')
    pattern = re.compile(r'\w+')

    corpus = []
    # \w+ never yields an empty match, so no emptiness filter is needed.
    for token in pattern.findall(doc):
        lowered = token.lower()
        # The length test is applied to the token as found in the text.
        if len(token) > 1 and lowered not in stop:
            corpus.append(lowered)
    return stemmer(corpus)

## Funções de Verificação

In [6]:
def isWordIndexed(palavra):
    """Look up a word in the ``palavras`` table.

    Args:
        palavra: the word to search for.

    Returns:
        The word's ``idpalavra`` if it is stored, otherwise 0.
    """
    connection = connect()
    try:
        cursor = connection.cursor()
        try:
            # Parameterised query: the driver escapes the value, so words
            # containing quotes can neither break nor alter the SQL.
            cursor.execute(
                'select idpalavra from palavras where palavra = %s',
                (palavra,),
            )
            row = cursor.fetchone()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the query fails.
        connection.close()

    return row[0] if row else 0

In [7]:
def isPageIndexed(url):
    """Report whether ``url`` is stored and whether it already has words.

    Args:
        url: the page address to look up in the ``urls`` table.

    Returns:
        -1 if the url is not in ``urls``;
        the url's ``idurl`` if it is stored but has no rows in
        ``palavra_localizacao``;
        -2 if it is stored and already has word locations.
    """
    connection = connect()
    try:
        cursor = connection.cursor()
        try:
            # Parameterised queries: URLs come from crawled pages and must
            # not be spliced into the SQL text.
            cursor.execute('select idurl from urls where url = %s', (url,))
            row = cursor.fetchone()
            if not row:
                # url is not in the database
                return -1

            idurl = row[0]
            # Only existence matters, so a single row is enough.
            cursor.execute(
                'select idurl from palavra_localizacao where idurl = %s limit 1',
                (idurl,),
            )
            thereWordInUrl = cursor.fetchone()
        finally:
            cursor.close()
    finally:
        # Runs on every path, including the early return and on errors.
        connection.close()

    if not thereWordInUrl:
        # url is in the database but no word has been stored for it yet
        return idurl

    # url is in the database and already indexed; the print is kept from
    # the original as diagnostic output.
    print(thereWordInUrl,idurl)
    return -2

## Funções de Inserção

In [8]:
def insertUrl(url):
    """Insert ``url`` into the ``urls`` table.

    Args:
        url: page address to store.

    Returns:
        The cursor's ``lastrowid`` after the insert.
    """
    connection = connect()
    try:
        cursor = connection.cursor()
        try:
            # Parameterised insert: crawled URLs are untrusted text.
            cursor.execute('insert into urls (url) values (%s)', (url,))
            pageId = cursor.lastrowid
        finally:
            cursor.close()
    finally:
        connection.close()
    return pageId

In [9]:
def insertWord(palavra):
    """Insert ``palavra`` into the ``palavras`` table.

    Args:
        palavra: word to store.

    Returns:
        The cursor's ``lastrowid`` after the insert.
    """
    connection = connect()
    try:
        cursor = connection.cursor()
        try:
            # Parameterised insert: words come from crawled page text.
            cursor.execute(
                'insert into palavras (palavra) values (%s)',
                (palavra,),
            )
            wordId = cursor.lastrowid
        finally:
            cursor.close()
    finally:
        connection.close()
    return wordId

In [10]:
def insertWordLocalization(idurl,idpalavra, localization):
    """Record that a word occurs at a given position of a page.

    Args:
        idurl: id of the page (``urls.idurl``).
        idpalavra: id of the word (``palavras.idpalavra``).
        localization: position of the word within the page's corpus.

    Returns:
        The cursor's ``lastrowid`` after the insert.
    """
    connection = connect()
    try:
        cursor = connection.cursor()
        try:
            # Values are bound by the driver instead of being formatted
            # into the statement text.
            cursor.execute(
                'insert into palavra_localizacao (idurl, idpalavra, localizacao) '
                'values (%s, %s, %s)',
                (idurl, idpalavra, localization),
            )
            localizationId = cursor.lastrowid
        finally:
            cursor.close()
    finally:
        connection.close()
    return localizationId

In [11]:
def insertPage(url, soup):
    """Index the words of one page into the database.

    The url is registered in ``urls`` if needed, the page text is reduced to
    a stemmed corpus, and every corpus entry is stored with its 1-based
    position via ``insertWordLocalization``.

    Args:
        url: address of the page.
        soup: parsed page, passed to ``removeTags``.

    Returns:
        0 when the page was already indexed (nothing is done); otherwise
        None.
    """
    urlId = isPageIndexed(url)
    if urlId == -2:
        # Page already has word locations stored: skip it.
        return 0
    if urlId == -1:
        urlId = insertUrl(url)

    doc = removeTags(soup)
    corpus = createCorpus(doc)

    for idx, word in enumerate(corpus):
        try:
            wordId = isWordIndexed(word)
            if not wordId:
                wordId = insertWord(word)
            insertWordLocalization(urlId, wordId, idx + 1)
        except Exception as err:
            # Indexing stays best-effort per word, but the failure is now
            # reported and KeyboardInterrupt/SystemExit are no longer
            # swallowed as they were by the bare ``except``.
            print(f"Erro ao indexar a palavra {word} da página {url}: {err}")
            continue

## Crawler para obtenção dos links

In [12]:
def crawl(paginas, profundidade):
    """Collect the URLs reachable from ``paginas`` by following links.

    Args:
        paginas: iterable of starting URLs.
        profundidade: number of link-following passes. Each pass downloads
            the known pages not yet fetched successfully and adds the
            targets of their ``<a href>`` links.

    Returns:
        A set with the starting URLs plus every URL discovered. With
        ``profundidade <= 0`` this is just ``set(paginas)``; the original
        raised UnboundLocalError in that case.
    """
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    # One pool manager for the whole crawl so connections can be reused.
    http = urllib3.PoolManager()

    found = set(paginas)
    fetched = set()
    for _ in range(profundidade):
        # Pages fetched in an earlier pass have already contributed their
        # links; pages that failed are retried, as in the original.
        pending = found - fetched
        for pagina in pending:
            try:
                dados_pagina = http.request('GET', pagina)
            except Exception:
                # Narrower than a bare except: Ctrl+C still stops the crawl.
                print(f"Erro ao abrir a página {pagina}")
                continue
            fetched.add(pagina)

            sopa = BeautifulSoup(dados_pagina.data, "lxml")
            for link in sopa.find_all('a'):
                if 'href' not in link.attrs:
                    continue
                # Resolve relative links against the page they came from.
                url = urljoin(pagina, str(link.get('href')))
                # URLs containing two consecutive single quotes are skipped
                # (presumably to protect the SQL built elsewhere — confirm).
                if "''" in url:
                    continue
                # Drop the fragment so anchors of one page collapse to it.
                url = url.split("#")[0]
                if url.startswith('http'):
                    found.add(url)
    return found

## Função para popular o banco com os dados das páginas

In [13]:
def populate(urlList):
    """Download every URL in ``urlList`` and index its content.

    Pages that cannot be downloaded are reported on stdout and skipped; the
    others are parsed with BeautifulSoup and handed to ``insertPage``.

    Args:
        urlList: iterable of page addresses.
    """
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    # One pool manager for all requests instead of a new one per URL.
    http = urllib3.PoolManager()
    for url in urlList:
        try:
            urlData = http.request('GET', url)
        except Exception:
            # Narrower than a bare except so that Ctrl+C can interrupt a
            # long indexing run.
            print(f"Erro ao abrir a página {url}")
            continue
        soup = BeautifulSoup(urlData.data, "lxml")
        insertPage(url, soup)

## Teste com Wikipedia

In [14]:
# Seed the crawl with one Portuguese Wikipedia article and run a single
# link-following pass (profundidade=1).
lista_paginas = ['https://pt.wikipedia.org/wiki/Linguagem_de_programa%C3%A7%C3%A3o']
paginas = crawl(lista_paginas, 1)

In [15]:
# Bare expression: in a notebook cell this displays the set returned by crawl.
paginas

{'http://books.google.com/books?&as_brr=0&as_epq=Linguagem+de+programa%C3%A7%C3%A3o',
 'http://dlang.org/overview.html',
 'http://www.acm.uiuc.edu/signet/JHSI/cr.D.3.2.html',
 'http://www.cis.gvsu.edu/~kurmasz/Teaching/OldCourses/CS451/CS451_W08/WebPage/Labs/Lab5/index.html',
 'http://www.infoescola.com/engenharia-de-software/linguagem-de-programacao-de-alto-nivel/',
 'http://www.levenez.com/lang/history.html',
 'http://www.scriptol.com/programming/languages.php',
 'http://www.wired.com/2014/07/d-programming-language/',
 'https://af.wikipedia.org/wiki/Programmeertaal',
 'https://als.wikipedia.org/wiki/Programmiersprache',
 'https://am.wikipedia.org/wiki/%E1%8B%A8%E1%8D%95%E1%88%AE%E1%8C%8D%E1%88%AB%E1%88%9D_%E1%89%8B%E1%8A%95%E1%89%8B',
 'https://an.wikipedia.org/wiki/Luengache_de_programaci%C3%B3n',
 'https://ar.wikipedia.org/wiki/%D9%84%D8%BA%D8%A9_%D8%A8%D8%B1%D9%85%D8%AC%D8%A9',
 'https://arz.wikipedia.org/wiki/%D9%84%D8%BA%D8%A9_%D8%A8%D8%B1%D9%85%D8%AC%D9%87',
 'https://as.wikipe

In [None]:
# Download and index every crawled URL into the database.
populate(paginas)

Erro ao abrir a página http://www.acm.uiuc.edu/signet/JHSI/cr.D.3.2.html
Erro ao abrir a página https://pt.wikipedia.org/w/index.php?title=Linguagem_de_programação&oldid=59464915
(129,) 129
Erro ao abrir a página https://foundation.wikimedia.org/wiki/Condições_de_Uso
