# Projet flux Rss
## TP 1 - Feed Collector
### Import

In [1]:
import hashlib
import shelve
import time
import urllib.request
from subprocess import check_output
from datetime import datetime
import fileinput

import feedparser
import langdetect
import chardet
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch

### Item Rss

In [49]:
class Item_RSS:
    """
    Représente un item Rss obtenu depuis le flux 

    source_feed : L url de la source du flux
    local_url : L url du fichier local contenant la page de l'item Rss
    lang : La langue utilisé dans le texte de l'item Rss
    date : La date de l'item Rss
    target_data : Le contenu de la page source de l'item Rss
    """
    id = None
    title = None
    summary = None
    description = None
    all_links = None
    source_post = None
    source_feed = None
    lang = None
    date = None
    target_data = None
    type_flux = None

    def __init__(self, post, feed, tool=None, type_flux=['default']):
        """
        Initialise l item rss a partir des données récupérés depuis le flux

        Paramètres:
            post : L'item Rss recupéré depuis le flux
            feed : Les elements decrivants le flux
        """
        self.tool = tool
        self.type_flux = type_flux
        self.type_predit = 'default'
        if  hasattr(post, 'title'):
            self.tile = post.title
            self.lang = langdetect.detect(post.title)
        if  hasattr(post, 'summary'):
            self.summary = post.summary
        if  hasattr(post, 'description'):
            self.description = post.description
        if hasattr(post, 'links'):
            self.all_links = post.links
        if hasattr(feed, 'link'):
            self.source_feed = feed.link
        self.integrity_construct()
        if  hasattr(post, 'link'):
            self.source_post = post.link
            self.id = hashlib.sha224(post.link.encode(encoding='UTF-8')).hexdigest()
            try:
                html = urllib.request.urlopen(post.link)
                soup = BeautifulSoup(html, features="html.parser")
                self.target_data = str(soup.prettify())
            except urllib.error.HTTPError as e:
                self.target_data = None
            except urllib.error.URLError as e:
                self.target_data = None
        self.date = datetime.now()

    def affichage(self):
        """
        Affiche tous les éléments de l'item Rss si ils ne sont pas vides
        """
        self.print_id()
        self.print_title()
        self.print_summary()
        self.print_description()
        self.print_source_post()
        self.print_source_feed()
        self.print_lang()
        self.print_date()
        self.print_target_data()
        self.print_type_flux()

    def print_id(self):
        if self.id != None:
            print('id : ', self.id, '\n')
    def print_title(self) :
        if self.title != None:
            print('title : ', self.title, '\n')
    def print_summary(self) : 
        if self.summary != None:
            print('summary : ', self.summary, '\n')
    def print_description(self) : 
        if self.description != None:
            print('description : ', self.description, '\n')
    def print_source_post(self) : 
        if self.source_post != None:
            print('source_post : ', self.source_post, '\n')
    def print_source_feed(self) :
        if self.source_feed != None:
            print('source_feed : ', self.source_feed, '\n')
    def print_lang(self) : 
        if self.lang != None:
            print('lang : ', self.lang, '\n')
    def print_date(self) :
        if self.date != None:
            print('date : ', self.date, '\n')
    def print_target_data(self) : 
        if self.target_data != None:
            print('target_data : ', self.target_data, '\n')
    def print_type_flux(self):
        if self.type_flux != None :
            print('type : ', self.type_flux, '\n')
    def print_type_predit(self):
        if self.type_predit != None :
            print('type predit : ', self.type_predit, '\n')

    def integrity_construct(self):
        """
        Calcul le hash qui déterminera si un element à changé au court du temps
        """
        integrity = ''
        if self.title != None:
            integrity += self.title
        if self.summary != None:
            integrity += self.summary
        if self.description != None:
            integrity += self.description
        if self.target_data != None:
            integrity += self.target_data
        self.integrity = hashlib.sha224(integrity.encode(encoding='UTF-8')).hexdigest()

    def save(self):
        if self.tool != None :
            self.tool.insertion_item(self)

In [36]:
class DatabaseTool:

    _db = None

    def __init__(self):
        self._db = self.getConnection()

    def getConnection(self, database_name='database'):
        return shelve.open(self.database_name, 'c')

    def insertion_items(self, _items) :
        for i in _items :
            self.insertion_item(i)

    def insertion_item(self, _item): 
        if self._db.__contains__(_item.id) == False:
            self._db[_item.id] = _item
        else : 
            if _item.integrity != self._db[_item.id].integrity :
                self._db[_item.id] = _item

    def verification_integrity(self, id_, integrity_):
        ret = True
        if self._db.__contains__(id_) == False:
            ret = False
        else : 
            if integrity_ != self._db[id_].integrity :
                ret = False
        return ret

### Crawler

In [63]:
class Crawler:
    nb_crawl_max = 1

    def __init__(self, nb_already=0, type_flux=['default'], _save_tool=1):
        self.nb_already_done = nb_already
        self.save_tool = _save_tool
        if self.save_tool == 0 : 
            self.tool = DatabaseTool()
        else : 
            self.tool = ElasticTool()

    def crawl(self, _link):
        if(self.nb_already_done < self.nb_crawl_max):
            d = feedparser.parse('%s' % _link)
            for post in d.entries:
                elem = Item_RSS(post,d.feed, tool=self.tool)
                for l in elem.all_links:
                    c = Crawler(self.nb_already_done + 1, _save_tool=self.save_tool)
                    c.crawl(l['href'])
                elem.save()

    def crawl_from_file(self, filename):
        links = [link.rstrip('\n').split(' ') for link in fileinput.input(files=(filename))]
        for link,*subjects in links:
            self.type_flux = subjects
            self.crawl(link)

## TP2 - ElasticSearch
### ElasticTool

In [66]:
class ElasticTool:

    _es = None
    
    def __init__(self):
        self._es = self.getConnection()
        self.add_all_index()

    def getConnection(self, _host = 'localhost', _port=9200):
        if self._es == None :
            self._es = Elasticsearch([{'host': _host, 'port': _port}])
        return self._es

    def affichage_etat(self):
        if self._es.ping() :
            print('ElasticSearch Tourne')
        else :
            print('ElasticSearch ne tourne pas')

    def add_index(self, name_index):
        if self._es.indices.exists(index=name_index):
            self._es.indices.create(index=name_index, ignore=400)

    def add_all_index(self):
        self.add_index("item_rss")
        self.add_index("title")
        self.add_index("summary")
        self.add_index("description")
        self.add_index("links")
        self.add_index("source_post")
        self.add_index("lang")
        self.add_index("date")
        self.add_index("target_data")
        self.add_index('integrity')
        self.add_index('type_flux')
        self.add_index('type_predit')

    def delete_index(self, name_index):
        self._es.indices.delete(index=name_index, ignore=[400, 404])

    def delete_all_index(self):
        self.delete_index("item_rss")
        self.delete_index("title")
        self.delete_index("summary")
        self.delete_index("description")
        self.delete_index("links")
        self.delete_index("source_post")
        self.delete_index("lang")
        self.delete_index("date")
        self.delete_index("target_data")
        self.delete_index('integrity')
        self.delete_index('type_flux')
        self.delete_index('type_predit')

    def insertion_all_items(self, _items):
        for i in _items :
            self.insertion_item(i)

    def insertion_item(self, _item):
        if self.verification_integrity(_item.id, _item.integrity) == False :
            id_title = self.save_title(_item)
            id_summary = self.save_summary(_item)
            id_description = self.save_description(_item)
            id_all_links = self.save_all_links(_item)
            id_source_post = self.save_source_post(_item)
            id_lang = self.save_lang(_item)
            id_date = self.save_date(_item)
            id_target_data = False #self.save_target_data(_item)
            id_integrity = self.save_integrity(_item)
            id_type_flux = self.save_type_flux(_item)
            id_type_predit = self.save_type_predit(_item)
            self.save_item(_item.id, id_title, id_summary, id_description, id_all_links, id_source_post, id_lang, id_date, id_target_data, id_integrity, id_type_flux, id_type_predit)

    def save_item(self, _id, _title, _summary, _description, _all_links, _source_post, _lang, _date, _target_data, _integrity, _type_flux, _type_predit ):
        content_body = {}
        if _title != False :
            content_body['id_title'] = _title
        if _summary != False :
            content_body['id_summary'] = _summary
        if _description != False :
            content_body['id_description'] = _description
        if _all_links != False :
            content_body['id_all_links'] = _all_links
        if _source_post != False :
            content_body['id_source_post'] = _source_post
        if _lang != False :
            content_body['id_lang'] = _lang
        if _date != False :
            content_body['id_date'] = _date
        if _target_data != False :
            content_body['id_target_data'] = _target_data
        if _integrity != False :
            content_body['id_integrity'] = _integrity
        if _type_flux != False :
            content_body['type_flux'] = _type_flux
        if _type_predit != False :
            content_body['type_predit'] = _type_predit
        self._es.index(index='item_rss', id=_id, body=content_body)

    def save_title(self, _item) : 
        if(_item.title != None) : 
            content_body = {
                'value' : _item.title,
                'tags' : _item.title.split(' '),
                'id_item' : _item.id
            }
            return self._es.index(index="title", body=content_body)['_id']
        return False

    def save_summary(self, _item):
        if _item.summary != None :
            content_body = {
                'value' : _item.summary,
                'tags' : _item.summary.split(' '),
                'id_item' : _item.id
            }
            return self._es.index(index="summary", body=content_body)['_id']
        return False

    def save_description(self, _item):
        if _item.description != None :
            content_body = {
                'value' : _item.description,
                'tags' : _item.description.split(' '),
                'id_item' : _item.id
            }
            return self._es.index(index="description", body=content_body)['_id']
        return False

    def save_link(self, _link, _id):
        content_body = {
            'value' : _link,
            'id_item' : _id
        }
        return self._es.index(index="links", body=content_body)['_id']

    def save_all_links(self,_item):
        id_tab_links = []
        if(_item.all_links != None):
            for l in _item.all_links:
                id_tab_links.append(self.save_link(l, _item.id))
        return id_tab_links if len(id_tab_links)>0 else False

    def save_source_post(self, _item):
        if _item.source_post != None :
            content_body = {
                'value' : _item.source_post,
                'id_item' : _item.id
            }
            return self._es.index(index="source_post", body=content_body)['_id']
        return False

    def save_lang(self, _item):
        if _item.source_post != None :
            content_body = {
                'value' : _item.lang,
                'id_item' : _item.id
            }
            return self._es.index(index="lang", body=content_body)['_id']
        return False

    def save_date(self, _item):
        if _item.date != None :
            content_body = {
                'value' : _item.date,
                'id_item' : _item.id
            }
            return self._es.index(index="date", body=content_body)['_id']
        return False

    def save_target_data(self, _item):
        if _item.target_data != None :
            content_body = {
                'value' : _item.target_data,
                'id_item' : _item.id
            }
            return self._es.index(index="target_data", body=content_body)['_id']
        return False

    def save_integrity(self, _item):
        if _item.integrity != None :
            content_body = {
                'value' : _item.integrity,
                'id_item' : _item.id
            }
            return self._es.index(index="integrity", body=content_body)['_id']
        return False
        
    def save_type_flux(self, _item):
        if _item.type_flux != None :
            content_body = {
                'value' : _item.type_flux,
                'id_item' : _item.id
            }
            return self._es.index(index="type_flux", body=content_body)['_id']
        return False

    def save_type_predit(self, _item):
        if _item.type_flux != None :
            content_body = {
                'value' : _item.type_predit,
                'id_item' : _item.id
            }
            return self._es.index(index="type_predit", body=content_body)['_id']
        return False

    def search_by_tags(self, index_name, tags, size_result=999):
        
        str_query = ''
        maxi = len(tags)
        for i in range(maxi):
            str_query += 'tags:' + '*'+tags[i]+'*'
            if i < maxi-1:
                str_query += ' AND '
                
        query_body ={
            "query": {
                "query_string": {
                    "query" : str_query
                }
            }
        }
        return self._es.search(index=index_name, body=query_body, size=size_result)['hits']['hits']

    def verification_integrity(self, id_, integrity_):
        res = self._es.get(index="integrity", id=id_, ignore=[400,404])
        return False

## TP 3 - Classifier

In [32]:
elastic_tool = ElasticTool()
elastic_tool.affichage_etat()

ElasticSearch Tourne


## Lancement

In [59]:
cr = Crawler()
cr.crawl("https://www.lefigaro.fr/rss/figaro_economie.xml")

TypeError: save_item() takes 12 positional arguments but 13 were given

In [68]:
cr = Crawler()
cr.crawl_from_file('flux_rss.txt')

UnicodeEncodeError: 'ascii' codec can't encode character '\u2018' in position 22: ordinal not in range(128)

In [54]:
elastic_tool = ElasticTool()
print(elastic_tool.search_by_tags('description',['essentiel']))
elastic_tool.search_by_tags('description', ['commerces', 'rouvrir'])

[]


[]

In [67]:
elastic_tool = ElasticTool()
elastic_tool.delete_all_index()