In [1]:
import requests
import re
import ast
import sys
import urllib.request
#libreria para las interfaces
from PyQt5.QtWidgets import *
from PyQt5.QtCore import pyqtSlot
#libreria de extraccion de web
from bs4 import BeautifulSoup
# libreia para raices
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

#Se deberá colocar el archivo con el indice de raices invertidos
with open('raiz_ind_inv.txt', 'r', encoding="utf-8") as f:
    data = f.read()
    inverted_dict = ast.literal_eval(data)
#Funcion para obtener las url del txt
def get_urls():
    urls = []

    for w in inverted_dict:
        for item in inverted_dict[w]:
            urls.append(item[0])

    urls = list(set(urls))
    return urls
#Funcion de ranqueo
def rank(k):
    for w in inverted_dict:
        if k == w:
            return inverted_dict[w]
#Funcion para obtener los títulos de las paginas
def get_titles(url):
    soup = BeautifulSoup(urllib.request.urlopen(url), "lxml")
    return soup.title.text
#Se normaliza el texto quitando puntuacion
def normalize_text(text):
    ps = PorterStemmer()

    cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    cleanr = re.compile("["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002500-\U00002BEF"
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"
        u"\u3030"
                      "]+", re.UNICODE)
    clean_text = re.sub(cleanr, '', text)


    tokens = word_tokenize(clean_text)

    stop_words = set(stopwords.words('english'))
    filtered_words = []
    punc = '''!–—()--``==[]{};–:'"\,<>....//?@#$%^&*_~'''''

    for w in tokens:
        w = w.lower()
        if w not in stop_words:
            if w not in punc:
                if w != "''":
                    root_word = ps.stem(w)
                    filtered_words.append(root_word)

    return filtered_words

#Funcion para busqueda
def search(keywords):
    keywords_roots = normalize_text(keywords)
    matched_list = []
    frequency_list = []

    document_list = get_urls()

    for k in keywords_roots:
        matched_list.append(rank(k))

    for doc in document_list:
        counter = 0
        for url in matched_list:
            for item in url:
                if item[0] == doc:
                    counter = counter + item[1]
                    break
        frequency_tuple = (doc, counter)
        frequency_list.append(frequency_tuple)

    print(frequency_list)
    ranked_list = sorted(frequency_list, key=lambda tup: tup[1], reverse=True)
    print (ranked_list)
    return ranked_list
#Creacion y definicion de la interfaz con Qt
class App(QMainWindow):
    def __init__(self):
        super().__init__()
        self.w = None
        self.title = 'Raccon Navigator'
        self.left = 100
        self.top = 100
        self.width = 400
        self.height = 400

        self.initUI()
        #Damos el aspecto a la ventana
    def initUI(self):
        self.setWindowTitle(self.title)
        self.setGeometry(self.left, self.top, self.width, self.height)
        self.setStyleSheet("background-color: black;");

        # Creamos un label para mensaje de bienvenida
        self.label = QLabel(self)
        self.label.setGeometry(10, 5, 400, 50)
        self.label.setText("Bienvenido Raccon Navigator")
        self.label.setStyleSheet("font-size: 18px; color: #f6c3f3;")

        # Creamos textbox
        self.textbox = QLineEdit(self)
        self.textbox.move(55, 70)
        self.textbox.resize(250,40)
        self.textbox.setStyleSheet("border: 1px solid #c1c3f3;")

        # Creamos un boton
        self.button = QPushButton('Buscar', self)
        self.button.move(65,135)
        self.button.setStyleSheet("font-size: 18px; color: 'gray';")



        #Function on_click del boton
        self.button.clicked.connect(self.on_click)
        self.show()

    @pyqtSlot()
    def on_click(self):
        global keywords
        keywords = self.textbox.text()
        if self.w == None:
            self.w = ranking()
        self.w.show()
        
#Funcion de ranqueo para la busqueda
class ranking(QWidget):
    def __init__(self):
        super().__init__()
        self.resize(600, 400)
        self.setStyleSheet('font-size: 20px')
        self.initUI()

    def initUI(self):
        self.createTable()

        self.layout = QVBoxLayout()
        self.layout.addWidget(self.tableWidget)
        self.setLayout(self.layout)

        self.show()

    def createTable(self):
       # Creamos la matriz
        ranked_list = search(keywords)
        self.tableWidget = QTableWidget()
        self.tableWidget.setRowCount(3)
        self.tableWidget.setColumnCount(len(get_urls()))
        i = 0
        for item in ranked_list:
            title = get_titles(item[0])
            self.tableWidget.setItem(i,0, QTableWidgetItem(title))
            self.tableWidget.setItem(i,1, QTableWidgetItem(item[0]))
            self.tableWidget.setItem(i,2, QTableWidgetItem(str(item[1])))
            i = i + 1

        self.tableWidget.move(0,0)

if __name__ == '__main__':
    app = QApplication(sys.argv)
    ex = App()
    sys.exit(app.exec_())


[('https://es.wikipedia.org/wiki/Canis_familiaris', 0), ('https://docs.microsoft.com/es-es/archive/msdn-magazine/2019/april/artificially-intelligent-how-do-neural-networks-learn', 0), ('https://es.wikipedia.org/wiki/Procesamiento_de_lenguajes_naturales', 0), ('https://www.abc.es/cultura/musica/', 0), ('https://www.lavanguardia.com/vivo/mascotas/20201119/49314489819/perro-acompana-bano.html', 0), ('https://www.innovaportal.com/innovaportal/v/674/1/innova.front/la-computadora-el-mejor-canal-para-comercio-electronico', 0), ('https://www.xataka.com/robotica-e-ia/las-redes-neuronales-que-son-y-por-que-estan-volviendo', 0), ('https://www.kia.com/pe/util/news/que-es-computadora-automotriz-funciones.html', 0), ('https://es.wikipedia.org/wiki/Juego', 0), ('https://definicion.de/tortuga/', 0), ('https://www.esan.edu.pe/apuntes-empresariales/2019/04/desarrollo-de-software-en-que-consiste-el-modelo-cmmi/', 0), ('https://www.linguamatics.com/', 0), ('https://icono14.net/ojs/index.php/icono14/articl

URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1125)>

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
