# Projet flux Rss
## TP 1 - Feed Collector
### Import

In [1]:
import hashlib
import shelve
import time
import urllib.request
from subprocess import check_output

import feedparser
import langdetect
import chardet
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch

### Item Rss

In [35]:
class Item_RSS:
    """
    Représente un item Rss obtenu depuis le flux 

    source_feed : L url de la source du flux
    local_url : L url du fichier local contenant la page de l'item Rss
    lang : La langue utilisé dans le texte de l'item Rss
    date : La date de l'item Rss
    target_data : Le contenu de la page source de l'item Rss
    bool_write_file : Ecrit dans des fichiers les pages des liens RSS si VRAI
    """
    id = None
    title = None
    summary = None
    description = None
    all_links = None
    source_post = None
    source_feed = None
    local_url = None
    lang = None
    date = None
    target_data = None

    def __init__(self, post, feed, write_file=False, database_name=None,elastic_connection=None):
        """
        Initialise l item rss a partir des données récupérés depuis le flux

        Paramètres:
            post : L'item Rss recupéré depuis le flux
            feed : Les elements decrivants le flux
        """
        self.elastic_connection = elastic_connection
        self.database_name = database_name
        if  hasattr(post, 'title'):
            self.tile = post.title
            self.lang = langdetect.detect(post.title)
        if  hasattr(post, 'summary'):
            self.summary = post.summary
        if  hasattr(post, 'description'):
            self.description = post.description
        if hasattr(post, 'links'):
            self.all_links = post.links
        if hasattr(feed, 'link'):
            self.source_feed = feed.link
        self.integrity_construct()
        if  hasattr(post, 'link'):
            self.source_post = post.link
            self.id = hashlib.sha224(post.link.encode(encoding='UTF-8')).hexdigest()
            try:
                self.local_url = './pages/' + post.link.replace('/','').replace(':','')
                html = urllib.request.urlopen(post.link)
                soup = BeautifulSoup(html, features="html.parser")
                self.target_data = str(soup.prettify())
                if self.checkIntegrity() == False and write_file != False:
                    self.write_target_data_In_File()
            except urllib.error.HTTPError as e:
                self.target_data = None
                self.local_url = None
            except urllib.error.URLError as e:
                self.target_data = None
                self.local_url = None
        self.affichage()
        self.save_elastic()

    def affichage(self):
        """
        Affiche tous les éléments de l'item Rss si ils ne sont pas vides
        """
        if self.id != None:
            print('id : ', self.id, '\n')
        if self.title != None:
            print('title : ', self.title, '\n')
        if self.summary != None:
            print('summary : ', self.summary, '\n')
        if self.description != None:
            print('description : ', self.description, '\n')
        if self.source_post != None:
            print('source_post : ', self.source_post, '\n')
        if self.source_feed != None:
            print('source_feed : ', self.source_feed, '\n')
        if self.lang != None:
            print('lang : ', self.lang, '\n')
        if self.date != None:
            print('date : ', self.date, '\n')
        if self.target_data != None:
            print('target_data : ', self.target_data, '\n')

    def write_target_data_In_File(self):
        """
        Ecrit dans un fichier contenu à l'adresse local la page web qui est source de l'item Rss
        """
        f = open(self.local_url, "w", encoding="utf-8")
        f.write(self.target_data)
        f.close()

    def integrity_construct(self):
        """
        Calcul le hash qui déterminera si un element à changé au court du temps
        """
        integrity = ''
        if self.title != None:
            integrity += self.title
        if self.summary != None:
            integrity += self.summary
        if self.description != None:
            integrity += self.description
        if self.target_data != None:
            integrity += self.target_data
        self.integrity = hashlib.sha224(integrity.encode(encoding='UTF-8')).hexdigest()

    def save_database(self):
        """"
        Sauvegarde l'item Rss dans la base de données

        Paramètres :
            database_name : Le nom de la base de données dans laquelle sauvegarder l'item
        """
        if self.database_name != None :
            d = shelve.open(self.database_name, 'c')
            if d.__contains__(self.id) == False:
                d[self.id] = self
            else : 
                if self.integrity != d[self.id].integrity :
                    d[self.id] = self
            d.close()

    def checkIntegrity(self):
        """"
        Permet la vérification de l'exactitude des informations précédement enregistrer par rapport au données actuels

        Paramètres :
            database_name : Le nom de la base de données utilisé pour faire la vérification de l'intégrité
        Retour :
            Retourne Vrai si les données n'ont pas changé et Faux autrement
        """
        if self.database_name != None :
            ret = True
            d = shelve.open(self.database_name, 'c')
            if d.__contains__(self.id) == False:
                ret = False
            else : 
                if self.integrity != d[self.id].integrity :
                    ret = False
            d.close()
            return ret
        else :
            return False

    def save_elastic(self):
        if self.elastic_connection != None :
            self.elastic_connection.insertion_item(self)

### Crawler

In [30]:
class Crawler:
    nb_crawl_max = 3

    def __init__(self, nb_already=0):
        self.nb_already_done = nb_already 
        self.elastic_tool = ElasticTool()

    def crawl(self, _link):
        if(self.nb_already_done < self.nb_crawl_max):
            d = feedparser.parse('%s' % _link)
            for post in d.entries:
                elem = Item_RSS(post,d.feed,elastic_connection = self.elastic_tool)
                for l in elem.all_links:
                    print(l['href'])
                    c = Crawler(self.nb_already_done + 1)
                    c.crawl(l['href'])
                elem.save_database()

## TP2 - ElasticSearch
### ElasticTool

In [31]:
class ElasticTool:

    _es = None
    
    def __init__(self, _host = 'localhost', _port=9200):
        
        self._es = Elasticsearch([{'host': _host, 'port': _port}])
        if self._es.ping():
            print('ElasticSearch Tourne')
            self.creation_index()
        else:
            print('ElasticSearch ne tourne pas')

    def creation_index(self):
        if self._es.indices.exists(index="item_rss"):
            self._es.indices.create(index="item_rss", ignore=400)
        if self._es.indices.exists(index="title"):
            self._es.indices.create(index="title", ignore=400)
        if self._es.indices.exists(index="summary"):
            self._es.indices.create(index="summary", ignore=400)
        if self._es.indices.exists(index="description"):
            self._es.indices.create(index="description", ignore=400)
        if self._es.indices.exists(index="all_links"):
            self._es.indices.create(index="all_links", ignore=400)
        if self._es.indices.exists(index="source_post"):
            self._es.indices.create(index="source_post", ignore=400)
        if self._es.indices.exists(index="lang"):
            self._es.indices.create(index="lang", ignore=400)
        if self._es.indices.exists(index="date"):
            self._es.indices.create(index="date", ignore=400)
        if self._es.indices.exists(index="target_data"):
            self._es.indices.create(index="target_data", ignore=400)

    def insertion_item(self, _item):
        #self._es.index(index="item_rss", id=_item.id, body={'item' : _item})
        if(_item.title != None) : 
            self._es.index(index="title", id=_item.title, body={'id_item' : _item.id})
        if _item.summary != None :
            self._es.index(index="summary", id=_item.summary, body={'id_item' : _item.id})
        if _item.description != None :
            self._es.index(index="description", id=_item.description, body={'id_item' : _item.id})
        #if _item.all_links != None :
            #self._es.index(index="all_links", id=_item.all_links, body={'id_item' : _item.id})
        if _item.source_post != None :
            self._es.index(index="source_post", id=_item.source_post, body={'id_item' : _item.id})
        if _item.lang != None :
            self._es.index(index="lang", id=_item.lang, body={'id_item' : _item.id})
        if _item.date != None :
            self._es.index(index="date", id=_item.date, body={'id_item' : _item.id})
        if _item.target_data != None :
            self._es.index(index="target_data", id=_item.target_data, body={'id_item' : _item.id})

## TP 3 - Classifier

## Lancement

In [36]:
cr = Crawler()
cr.crawl("http://rss.cnn.com/rss/edition.rss")

{
                "@type": "Organization",
                "name": "CNN",
                "logo": {
                    "@type": "ImageObject",
                    "url": " https://cdn.cnn.com/cnn/2020/images/02/20/cnn-publisher-img.png",
                    "width": 103,
                    "height": 60
                }
            },
            "author": {
                "@type": "Person",
                "name": "Jenni Marsh and Joshua Berlinger, CNN"
            },
            "articleBody": "The Japanese Health Ministry said 957 new coronavirus cases and six virus-related deaths were reported in the country Sunday. To date, 108,796 people in Japan have been diagnosed with the virus since the pandemic began. More than 97,000 have recovered and at least 1,831 have died, while 196 are in serious condition.  Tokyo, the capital, counted 189 new cases on Sunday -- its its sixth day in a row with 100-plus cases. Hokkaido, the northernmost prefecture, reported 153 cases.  Authorities r

RequestError: RequestError(400, 'too_long_frame_exception', 'An HTTP line is larger than 4096 bytes.')