# Projet flux RSS
## TP 1 - Feed Collector
### Import

In [57]:
import hashlib
import shelve
import time
import urllib.request
from subprocess import check_output
from datetime import datetime

import feedparser
import langdetect
import chardet
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch

### Item RSS

In [63]:
class Item_RSS:
    """
    Represent one RSS item collected from a feed.

    Attributes:
        id : SHA-224 hex digest of the post link (None when the post has no link)
        title : Title of the post
        summary : Summary of the post
        description : Description of the post
        all_links : Value of the post's ``links`` attribute
        source_post : Link of the post
        source_feed : Link of the feed the post was read from
        local_url : Path of the local file that receives the downloaded page
        lang : Language detected by langdetect from the title
        date : Moment at which this object was built
        target_data : Prettified HTML of the page the post links to
        integrity : SHA-224 hex digest used to detect content changes
    """
    id = None
    title = None
    summary = None
    description = None
    all_links = None
    source_post = None
    source_feed = None
    local_url = None
    lang = None
    date = None
    target_data = None

    def __init__(self, post, feed, write_file=False, database_name=None,elastic_connection=None):
        """
        Build the item from the data read in the feed, download the linked
        page and push the item to the elastic connection when one is given.

        Parameters:
            post : The RSS entry read from the feed
            feed : The object describing the feed itself
            write_file : When not False, the downloaded page is written to
                ``local_url`` if the stored copy is missing or outdated
            database_name : Name of the shelve database used by
                ``save_database`` and ``checkIntegrity`` (None disables both)
            elastic_connection : Object exposing ``insertion_item(item)``
                (None disables the insertion)
        """
        self.elastic_connection = elastic_connection
        self.database_name = database_name
        if hasattr(post, 'title'):
            # The original assigned to a misspelled attribute ("tile"), which
            # left ``title`` at None for every item.
            self.title = post.title
            self.lang = langdetect.detect(post.title)
        if hasattr(post, 'summary'):
            self.summary = post.summary
        if hasattr(post, 'description'):
            self.description = post.description
        if hasattr(post, 'links'):
            self.all_links = post.links
        if hasattr(feed, 'link'):
            self.source_feed = feed.link
        # NOTE(review): the hash is computed before the page is downloaded, so
        # target_data does not take part in it - confirm this is intended.
        self.integrity_construct()
        if hasattr(post, 'link'):
            self.source_post = post.link
            self.id = hashlib.sha224(post.link.encode(encoding='UTF-8')).hexdigest()
            try:
                self.local_url = './pages/' + post.link.replace('/','').replace(':','')
                # The context manager closes the HTTP response once parsed.
                with urllib.request.urlopen(post.link) as html:
                    soup = BeautifulSoup(html, features="html.parser")
                self.target_data = str(soup.prettify())
                if not self.checkIntegrity() and write_file != False:
                    self.write_target_data_In_File()
            except urllib.error.URLError:
                # HTTPError is a subclass of URLError: both mean the page
                # could not be fetched, so the item keeps no page data.
                self.target_data = None
                self.local_url = None
        self.date = datetime.now()
        self.save_elastic()

    def affichage(self):
        """
        Print every element of the item that is not None.
        """
        self.print_id()
        self.print_title()
        self.print_summary()
        self.print_description()
        self.print_source_post()
        self.print_source_feed()
        self.print_lang()
        self.print_date()
        self.print_target_data()

    def _print_field(self, label, value):
        """Print ``label : value`` followed by a blank line unless value is None."""
        if value is not None:
            print(label + ' : ', value, '\n')

    def print_id(self):
        """Print the id when it is set."""
        self._print_field('id', self.id)

    def print_title(self):
        """Print the title when it is set."""
        self._print_field('title', self.title)

    def print_summary(self):
        """Print the summary when it is set."""
        self._print_field('summary', self.summary)

    def print_description(self):
        """Print the description when it is set."""
        self._print_field('description', self.description)

    def print_source_post(self):
        """Print the post link when it is set."""
        self._print_field('source_post', self.source_post)

    def print_source_feed(self):
        """Print the feed link when it is set."""
        self._print_field('source_feed', self.source_feed)

    def print_lang(self):
        """Print the detected language when it is set."""
        self._print_field('lang', self.lang)

    def print_date(self):
        """Print the creation date when it is set."""
        self._print_field('date', self.date)

    def print_target_data(self):
        """Print the downloaded page when it is set."""
        self._print_field('target_data', self.target_data)

    def write_target_data_In_File(self):
        """
        Write the downloaded page (``target_data``) to the file at ``local_url``.

        The parent directory is created when it does not exist yet.
        """
        import os

        directory = os.path.dirname(self.local_url)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(self.local_url, "w", encoding="utf-8") as f:
            f.write(self.target_data)

    def integrity_construct(self):
        """
        Compute the hash used to tell whether the item changed over time.

        The hash is the SHA-224 hex digest of the concatenation of title,
        summary, description and target_data, skipping the ones that are None.
        The result is stored in ``self.integrity``.
        """
        parts = (self.title, self.summary, self.description, self.target_data)
        integrity = ''.join(part for part in parts if part is not None)
        self.integrity = hashlib.sha224(integrity.encode(encoding='UTF-8')).hexdigest()

    def save_database(self):
        """
        Save the item in the shelve database named ``self.database_name``.

        The item is stored under its id when the id is not in the database
        yet or when the stored integrity hash differs from the current one.
        Nothing happens when no database name or no id is set.
        """
        if self.database_name is None or self.id is None:
            return
        # NOTE(review): the whole object is pickled, elastic_connection
        # included - confirm that connection objects can be pickled.
        with shelve.open(self.database_name, 'c') as d:
            if self.id not in d or d[self.id].integrity != self.integrity:
                d[self.id] = self

    def checkIntegrity(self):
        """
        Compare the current integrity hash with the one stored in the database.

        Returns:
            True when the item is stored under its id with the same integrity
            hash; False otherwise, including when no database name or no id
            is set.
        """
        if self.database_name is None or self.id is None:
            return False
        with shelve.open(self.database_name, 'c') as d:
            return self.id in d and d[self.id].integrity == self.integrity

    def save_elastic(self):
        """Pass the item to ``elastic_connection.insertion_item`` when a connection is set."""
        if self.elastic_connection is not None:
            self.elastic_connection.insertion_item(self)

### Crawler

In [64]:
class Crawler:
    """
    Read an RSS feed, build an Item_RSS for each entry and follow the links
    of each entry as feeds of their own, up to ``nb_crawl_max`` levels.
    """

    # Maximum crawl depth: a crawler whose depth reached this value does nothing.
    nb_crawl_max = 1

    def __init__(self, nb_already=0, _elastic_connection = None):
        """
        Parameters:
            nb_already : Depth already reached by the crawler that created this one
            _elastic_connection : Object handed to every Item_RSS as its
                elastic connection; a new ElasticTool is created when None
        """
        self.nb_already_done = nb_already
        if _elastic_connection is not None:
            self.elastic_tool = _elastic_connection
        else:
            self.elastic_tool = ElasticTool()

    def crawl(self, _link):
        """
        Parse the feed at ``_link`` and process each of its entries.

        For every entry an Item_RSS is built (which indexes it through the
        elastic connection), each of its links is crawled one level deeper,
        then the item is saved through ``save_database``.

        Parameters:
            _link : Address of the feed to parse
        """
        if self.nb_already_done >= self.nb_crawl_max:
            return
        d = feedparser.parse(str(_link))
        for post in d.entries:
            elem = Item_RSS(post, d.feed, elastic_connection=self.elastic_tool)
            # all_links stays None when the entry has no ``links`` attribute.
            for link in elem.all_links or ():
                href = link.get('href')
                if href is None:
                    # A link without an address cannot be crawled.
                    continue
                child = Crawler(self.nb_already_done + 1, _elastic_connection=self.elastic_tool)
                child.crawl(href)
            elem.save_database()

## TP2 - ElasticSearch
### ElasticTool

In [93]:
class ElasticTool:
    """
    Helper around an Elasticsearch client that stores RSS items.

    Each field of an item is indexed as its own document in a dedicated
    index; a document in the ``item_rss`` index then references the ids of
    those field documents.
    """

    _es = None

    # Every index managed by this tool, in creation/deletion order.
    _INDEX_NAMES = (
        "item_rss",
        "title",
        "summary",
        "description",
        "links",
        "source_post",
        "lang",
        "date",
        "target_data",
        "integrity",
    )

    def __init__(self):
        """Open the connection and make sure every managed index exists."""
        self._es = self.getConnection()
        self.add_all_index()

    def getConnection(self, _host = 'localhost', _port=9200):
        """
        Return the Elasticsearch client, creating it on first use.

        Parameters:
            _host : Host of the Elasticsearch node
            _port : Port of the Elasticsearch node
        """
        if self._es is None:
            self._es = Elasticsearch([{'host': _host, 'port': _port}])
        return self._es

    def add_index(self, name_index):
        """
        Create the index ``name_index`` when it does not exist yet.

        The original condition was inverted and only tried to create
        indices that already existed.
        """
        if not self._es.indices.exists(index=name_index):
            self._es.indices.create(index=name_index, ignore=400)

    def add_all_index(self):
        """Create every managed index that does not exist yet."""
        for name_index in self._INDEX_NAMES:
            self.add_index(name_index)

    def delete_index(self, name_index):
        """Delete the index ``name_index``, ignoring 400 and 404 responses."""
        self._es.indices.delete(index=name_index, ignore=[400, 404])

    def delete_all_index(self):
        """Delete every managed index."""
        for name_index in self._INDEX_NAMES:
            self.delete_index(name_index)

    def insertion_all_items(self, _items):
        """Index every item of ``_items``."""
        for item in _items:
            self.insertion_item(item)

    def insertion_item(self, _item):
        """
        Index each field of ``_item`` then the document referencing them.

        Parameters:
            _item : Object exposing id, title, summary, description,
                all_links, source_post, lang and date attributes
        """
        id_title = self.save_title(_item)
        id_summary = self.save_summary(_item)
        id_description = self.save_description(_item)
        id_all_links = self.save_all_links(_item)
        id_source_post = self.save_source_post(_item)
        id_lang = self.save_lang(_item)
        id_date = self.save_date(_item)
        # The downloaded page is not indexed here: the call to
        # save_target_data was already disabled in the original code.
        id_target_data = False
        self.save_item(_item.id, id_title, id_summary, id_description, id_all_links, id_source_post, id_lang, id_date, id_target_data)

    def save_item(self, _id, _title, _summary, _description, _all_links, _source_post, _lang, _date, _target_data):
        """
        Index in ``item_rss``, under the id ``_id``, the document holding the
        ids of the field documents. A reference equal to False marks a field
        that was not indexed and is left out of the document.
        """
        references = (
            ('id_title', _title),
            ('id_summary', _summary),
            ('id_description', _description),
            ('id_all_links', _all_links),
            ('id_source_post', _source_post),
            ('id_lang', _lang),
            ('id_date', _date),
            ('id_target_data', _target_data),
        )
        content_body = {key: ref for key, ref in references if ref is not False}
        self._es.index(index='item_rss', id=_id, body=content_body)

    def _save_field(self, index_name, value, item_id, with_tags=False):
        """
        Index one field value and return the id of the created document.

        Returns False without indexing anything when ``value`` is None.
        With ``with_tags`` the value split on spaces is stored under 'tags'.
        """
        if value is None:
            return False
        content_body = {'value': value}
        if with_tags:
            content_body['tags'] = value.split(' ')
        content_body['id_item'] = item_id
        return self._es.index(index=index_name, body=content_body)['_id']

    def save_title(self, _item) :
        """Index the title with its tags; return the document id or False."""
        return self._save_field("title", _item.title, _item.id, with_tags=True)

    def save_summary(self, _item):
        """Index the summary with its tags; return the document id or False."""
        return self._save_field("summary", _item.summary, _item.id, with_tags=True)

    def save_description(self, _item):
        """Index the description with its tags; return the document id or False."""
        return self._save_field("description", _item.description, _item.id, with_tags=True)

    def save_link(self, _link, _id):
        """Index one link of the item whose id is ``_id``; return the document id."""
        content_body = {
            'value' : _link,
            'id_item' : _id
        }
        return self._es.index(index="links", body=content_body)['_id']

    def save_all_links(self,_item):
        """
        Index every link of the item.

        Returns the list of created document ids, or False when the item has
        no links.
        """
        if _item.all_links is None:
            return False
        id_tab_links = [self.save_link(link, _item.id) for link in _item.all_links]
        return id_tab_links if id_tab_links else False

    def save_source_post(self, _item):
        """Index the post link; return the document id or False."""
        return self._save_field("source_post", _item.source_post, _item.id)

    def save_lang(self, _item):
        """
        Index the language; return the document id or False.

        The original tested ``source_post`` instead of ``lang``, which
        indexed a None language for posts without a title.
        """
        return self._save_field("lang", _item.lang, _item.id)

    def save_date(self, _item):
        """Index the date; return the document id or False."""
        return self._save_field("date", _item.date, _item.id)

    def save_target_data(self, _item):
        """Index the downloaded page; return the document id or False."""
        return self._save_field("target_data", _item.target_data, _item.id)

    def save_integrity(self, _item):
        """Index the integrity hash; return the document id or False."""
        return self._save_field("integrity", _item.integrity, _item.id)

    @staticmethod
    def _build_tags_query(tags):
        """Return the query string requiring every tag, e.g. 'tags:a AND tags:b'."""
        return ' AND '.join('tags:' + tag for tag in tags)

    def search_by_tags(self, index_name, tags, size_result=999):
        """
        Search ``index_name`` for the documents carrying every tag of ``tags``.

        Parameters:
            index_name : Name of the index to search
            tags : Sequence of tag strings, combined with AND
            size_result : Maximum number of hits to return

        Returns:
            The response of the Elasticsearch ``search`` call.
        """
        query_body = {
            "query": {
                "query_string": {
                    "query" : self._build_tags_query(tags)
                }
            }
        }
        return self._es.search(index=index_name, body=query_body, size=size_result)


## TP 3 - Classifier

## Lancement

In [81]:
# Crawl the Figaro economy feed from depth 0; the crawler creates its own
# ElasticTool because no connection is supplied.
cr = Crawler(nb_already=0, _elastic_connection=None)
cr.crawl(_link="https://www.lefigaro.fr/rss/figaro_economie.xml")

In [97]:
# Query the "description" index: first for one tag, then for documents
# carrying all three tags (the last result is the one the cell displays).
elastic_tool = ElasticTool()
elastic_tool.search_by_tags(index_name='description', tags=['commerces'])
elastic_tool.search_by_tags(index_name='description', tags=['commerces', 'rouvrir', 'pouvoir'])

tags:commerces
tags:commerces AND tags:rouvrir AND tags:pouvoir


{'took': 3,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1, 'relation': 'eq'},
  'max_score': 8.777697,
  'hits': [{'_index': 'description',
    '_type': '_doc',
    '_id': '_l2IrnUB1UJFcz4jme6d',
    '_score': 8.777697,
    '_source': {'value': 'Elles demandent aussi de pouvoir rouvrir les commerces «non essentiels» dès le 12 novembre.',
     'tags': ['Elles',
      'demandent',
      'aussi',
      'de',
      'pouvoir',
      'rouvrir',
      'les',
      'commerces',
      '«non',
      'essentiels»',
      'dès',
      'le',
      '12',
      'novembre.'],
     'id_item': '23753b75a744f0cf134c1ca597fad4a705f023a0afc36afa8b92d745'}}]}}

In [79]:
# Clean-up: delete every index managed by the ElasticTool instance.
elastic_tool.delete_all_index()