# Newspaper

Scrapeo de noticias con la librería `newspaper3k`.

https://pypi.org/project/newspaper3k/

**Artículo**

In [1]:
%pip install newspaper3k

In [2]:
from newspaper import Article

In [3]:
url='https://www.elmundo.es/tecnologia/2020/12/14/5fd755b0fdddff24688b45c2.html'

In [4]:
article=Article(url)

In [5]:
# Fetch the page behind `url` over HTTP.
article.download()

# Parse the downloaded HTML; the cells below read title, authors, text, etc. after this step.
article.parse()

# Run newspaper's NLP step; presumably this is what fills `keywords` and `summary` — confirm in the library docs.
article.nlp()

In [6]:
#article.html

In [7]:
article.authors

['Bruno Toledano']

In [8]:
article.publish_date

datetime.datetime(2020, 12, 14, 0, 0)

In [9]:
article.text[:200]

'La guinda de 2020: una caída generalizada de Google en todo el mundo que ha durado 35 minutos\n\nEntre las 12:55 y las 13:30, Google se ha caído y ha tenido problemas en todo el mundo, no permitiendo el'

In [10]:
article.top_image

'https://phantom-elmundo.unidadeditorial.es/f7ded90792665b811c8f3751d23db6b9/resize/1200/f/jpg/assets/multimedia/imagenes/2020/12/14/16079475000780.jpg'

In [11]:
article.movies

[]

In [12]:
article.keywords

['google',
 'la',
 'las',
 'que',
 'en',
 'sus',
 'el',
 'ha',
 'vuelven',
 'servicios',
 'normalidad',
 'otros',
 'su',
 'youtube',
 'para',
 'llega',
 'y']

In [13]:
article.summary

'Google no ha confirmado la razón de este problema, que se ha dejado sentir instantáneamente entre los millones de usuarios de Internet.\nLas propias herramientas de Google para notificar fallos confirman que sus servicios van poco a poco volviendo en sí.\nSegún Downdetector, un servicio online que detecta incidencias en servicios digitales, el problema de Google ha sido generalizado, afectando a todo el mundo.\nLa caída de Google ha afectado no sólo a sus servicios, sino también a otros ligados a estos.\nEsta caída generalizada del servicio puede suponer un grave problema para muchas de ellas en el día de hoy.'

In [14]:
article.title

'La caída de Google llega a su fin: Gmail, YouTube y otros servicios vuelven a la normalidad'

**desde web**

In [15]:
import newspaper

# Build a Source for the whole site (network access; can take a while).
cnn_paper=newspaper.build('http://cnn.com')

# Print the URL of every article discovered on the site.
# NOTE: this loop rebinds `article`, the name used for the single Article in the cells above.
for article in cnn_paper.articles:
    print(article.url)

In [16]:
for category in cnn_paper.category_urls():
    print(category)

http://cnn.com
http://cnn.com/audio
https://arabic.cnn.com
https://cnnespanol.cnn.com
https://edition.cnn.com
https://www.cnn.com
http://cnn.com/live-tv
https://money.cnn.com
https://us.cnn.com


# Newspaper scraper

In [17]:
# Seed pages handed to Crawler.rellena_tablas: mostly IBEX-35 / stock-market
# sections of Spanish-language news sites.
BASE_URLS=['https://www.expansion.com/mercados/cronica-bolsa.html',
            'https://www.bolsamania.com/indice/IBEX-35/noticias',
            'https://cincodias.elpais.com/tag/ibex_35/',
            'https://www.estrategiasdeinversion.com/especial/noticias-del-ibex-35',
            'https://es.investing.com/indices/spain-35-news',
            'https://www.abc.es/economia/bolsa/ibex-35/',
            'https://www.20minutos.es/minuteca/ibex-35/',
            'https://cadenaser.com/tag/ibex_35/',
            'https://noticiasibex35.com/',
            'https://okdiario.com/tag/ibex-35/',
            'https://www.antena3.com/noticias/temas/ibex35-1',
            'https://www.dailyfx.com/espanol/ibex-35',
            'https://www.infobolsa.es/news',
            'https://www.larazon.es/tags/ibex-35/',
            'https://www.elperiodico.com/es/temas/ibex-35-8003',
            'https://es.finance.yahoo.com/quote/%5Eibex/',
            'https://www.libertaddigital.com/empresas/ibex-35/',
            'https://www.hosteltur.com/tag/ibex-35',
            'https://www.elconfidencial.com/mercados/',
            'https://noticiasdebolsa.es/',
            'https://www.finanzas.com/',
            'https://www.ennaranja.com/tag/ibex/',
            'https://www.rtve.es/temas/bolsa/1079/',
            'https://www.lavanguardia.com/economia']

In [18]:
import newspaper
import requests as req
from nltk.corpus import stopwords
stop_words_sp=set(stopwords.words('spanish'))
from bs4 import BeautifulSoup as bs
import json

from sqlalchemy import create_engine, Column, Float, Integer, JSON, DateTime, Text
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import DDL

import pandas as pd


# ANALISIS DE SENTIMIENTO
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [19]:
#!sudo -u postgres createdb articles

In [20]:
SCHEMA='articles'

In [21]:
import os
from urllib.parse import quote_plus

# Credentials are read from the standard PG* environment variables so real
# secrets never have to be written into the notebook; the literals are only
# fallbacks for the local development database. Both values are URL-quoted
# because characters such as '@' or ':' would otherwise break the URL.
_pg_user=quote_plus(os.environ.get('PGUSER', 'iudh'))
_pg_password=quote_plus(os.environ.get('PGPASSWORD', 'password'))

# SQLAlchemy URL; the database name is the same string as SCHEMA.
conn_str=f'postgresql+psycopg2://{_pg_user}:{_pg_password}@localhost:5432/{SCHEMA}'

In [22]:
Base = declarative_base()


class Articles(Base):
    """ORM mapping of one scraped news article to the ``article`` table.

    The table is created inside the schema named by ``SCHEMA`` and is keyed
    by the article URL, so each URL can be stored only once.
    """

    __tablename__ = 'article'
    __table_args__ = {'schema': SCHEMA}

    # --- identity and metadata taken from the parsed article ---------------
    url = Column(Text(), primary_key=True)
    authors = Column(Text())          # Crawler stores a JSON-encoded list here
    title = Column(Text())
    publish_date = Column(DateTime)
    text = Column(Text())
    top_image = Column(Text())
    keywords = Column(JSON)
    summary = Column(Text())
    video = Column(JSON)

    # --- sentiment scores: TextBlob (polarity, subjectivity) ----------------
    polarity = Column(Float)
    subjectivity = Column(Float)

    # --- sentiment scores: VADER ('neg', 'neu', 'pos', 'compound') ----------
    negativity = Column(Float)
    neutrality = Column(Float)
    positivity = Column(Float)
    compound_score = Column(Float)

In [23]:
class Crawler:
    """Scrape a news site with newspaper and store scored articles via SQLAlchemy.

    Parameters
    ----------
    str_conn : str
        SQLAlchemy connection URL.
    schema : str
        Schema that is created on connect if it does not exist yet.
    Table : declarative model class
        Mapped class whose ``__table__`` is created/dropped/read and whose
        constructor accepts the article fields built in ``rellena_tablas``.
    """

    def __init__(self, str_conn, schema, Table):
        print('Conectando a base de datos...')
        self.schema = schema
        self.Table = Table
        self.motor = create_engine(str_conn)
        self.sesion = sessionmaker(bind=self.motor)()
        # Engine.execute() no longer exists in SQLAlchemy 2.0; an explicit
        # transaction block works on both 1.x and 2.x and commits the DDL.
        with self.motor.begin() as conexion:
            conexion.execute(DDL(f'create schema if not exists {self.schema}'))
        print('Conectado.')

    def crea_tablas(self):
        """Create the mapped table if it does not exist yet (idempotent)."""
        print('Creando tabla...')
        try:
            # checkfirst=True makes an already existing table a no-op, so any
            # exception reaching the handler is a genuine failure.
            self.Table.__table__.create(self.motor, checkfirst=True)
        except Exception as ex:
            # Best effort, as before, but report the real cause instead of
            # assuming that the table already exists.
            print(f'No se pudo crear la tabla: {ex}')

    def rellena_tablas(self, base_url):
        """Crawl ``base_url`` and upsert one row per discovered article URL.

        Articles that fail to download or parse are reported and skipped.
        Rows are merged by primary key, so crawling the same URL again
        updates the stored row instead of aborting the batch with an
        integrity error.

        Raises
        ------
        Exception
            Whatever the database layer raises while merging or committing;
            the session is rolled back first so it stays usable.
        """
        print('Rellenando tablas...')

        paper = newspaper.build(base_url, memoize_articles=False)
        urls = list(set([e.url for e in paper.articles]))

        # Loop-invariant: build the VADER analyzer once, not once per article.
        analyzer = SentimentIntensityAnalyzer()

        try:
            for art in urls:
                print(art)
                item = self._construye_item(art, analyzer)
                if item is not None:
                    self.sesion.merge(item)
            self.sesion.commit()
        except Exception:
            # A failed flush/commit leaves the session in a broken
            # transaction; roll back so later calls can still use it.
            self.sesion.rollback()
            raise

    def _construye_item(self, url, analyzer):
        """Download, parse and score one article; return a row or None on failure."""
        news = newspaper.Article(url)

        try:
            news.download()
            news.parse()
            news.nlp()
            # Fall back to the raw <body> text when newspaper extracts nothing.
            text = news.text or self._texto_de_body(url)
        except Exception as ex:
            print(ex)
            return None

        # NOTE(review): TextBlob and VADER appear to rely on English lexicons,
        # so scores on Spanish text are probably unreliable — confirm.
        sentiment = TextBlob(text).sentiment
        scores = analyzer.polarity_scores(text)

        # NOTE(review): keywords/video are JSON columns in the Articles model
        # but receive json.dumps() strings, i.e. JSON-encoded strings rather
        # than arrays. Kept as is so the stored format does not change.
        return self.Table(url=url,
                          authors=json.dumps(news.authors),
                          title=news.title,
                          publish_date=news.publish_date,
                          text=text,
                          top_image=news.top_image,
                          keywords=json.dumps([key for key in news.keywords if key not in stop_words_sp]),
                          summary=news.summary,
                          video=json.dumps(news.movies),
                          polarity=sentiment[0],
                          subjectivity=sentiment[1],
                          negativity=scores['neg'],
                          neutrality=scores['neu'],
                          positivity=scores['pos'],
                          compound_score=scores['compound'])

    @staticmethod
    def _texto_de_body(url):
        """Return the visible text of the page's <body>, or '' if there is none."""
        # The timeout keeps one unresponsive site from hanging the whole crawl.
        respuesta = req.get(url, timeout=30)
        body = bs(respuesta.content, 'html.parser').find('body')
        return body.get_text() if body is not None else ''

    def borra_tablas(self):
        """Drop the mapped table if it exists (idempotent)."""
        print('Borrando tabla...')
        try:
            self.Table.__table__.drop(self.motor, checkfirst=True)
        except Exception as ex:
            print(f'No se pudo borrar la tabla: {ex}')

    def show_df(self):
        """Return every row of the mapped table as a pandas DataFrame.

        Column names come from the result set itself, so they always line up
        with the data. Looking them up in information_schema by table name
        alone, without an ORDER BY, could mislabel or miscount columns.
        """
        consulta = self.Table.__table__.select()
        with self.motor.connect() as conexion:
            resultado = conexion.execute(consulta)
            columnas = list(resultado.keys())
            # Plain tuples so pandas never depends on SQLAlchemy's Row type.
            filas = [tuple(fila) for fila in resultado.fetchall()]
        return pd.DataFrame(filas, columns=columnas)


In [24]:
news=Crawler(conn_str, SCHEMA, Articles)

Conectando a base de datos...
Conectado.


In [25]:
news.borra_tablas()

Borrando tabla...


In [26]:
news.crea_tablas()

Creando tabla...


In [27]:
news.rellena_tablas('https://www.lavanguardia.com/economia')

In [28]:
# Crawl only the first seed page (the [:1] slice); widen the slice to process more sites.
for e in BASE_URLS[:1]:
    news.rellena_tablas(e)

Rellenando tablas...
https://sincroguia-tv.expansion.com/parrilla/2023-05-30
Article `download()` failed with 404 Client Error: Not Found for url: https://sincroguia-tv.expansion.com/parrilla/2023-05-30 on URL https://sincroguia-tv.expansion.com/parrilla/2023-05-30
https://www.expansion.com/mercados/cronica-bolsa/2023/04/24/64461937468aeba83b8b4589.html
https://www.expansion.com/economia-sostenible/2023/05/25/646df06e468aebcf628b47e7.html
https://datosmacro.expansion.com/analisis/breves/20161216/ipc-pib
https://www.expansion.com/mercados/cronica-bolsa/2023/05/18/6465bd17e5fdeaa5628b4587.html
https://datosmacro.expansion.com/analisis/actualidad/20160401/impuestos-irpf-maximo
https://www.expansion.com/catalunya/2023/05/25/646f267be5fdeaca628b4581.html
https://www.expansion.com/mercados/cronica-bolsa/2023/05/05/6454999ae5fdead0058b45aa.html
https://datosmacro.expansion.com/analisis/actualidad/20180224/irpf
https://sincroguia-tv.expansion.com/peliculas/garcia-y-garcia--jn6Z-SPA
https://www

https://www.expansion.com/empresas/tecnologia/2023/05/25/646ea908e5fdea680c8b45a7.html
https://www.expansion.com/mercados/divisas/2023/05/25/646cb17c468aebad5f8b47f8.html
https://videos.expansion.com/v/0_1qlhdkos-martinez-almeida-vivimos-una-degradacion-institucional-de-nuestra-democracia
https://datosmacro.expansion.com/analisis/actualidad/20180120/desempleo-2017-espana
https://www.expansion.com/catalunya/2023/05/25/646f8d56468aebb7028b45b2.html
https://videos.expansion.com/v/0_o54e7ags-la-infanta-sofia-recibe-la-confirmacion-con-felipe-vi-como-padrino
https://www.expansion.com/fueradeserie/relojes/album/2023/05/24/6454b820e5fdea2a558b4576.html
http://videos.expansion.com/v/0_o54e7ags-la-infanta-sofia-recibe-la-confirmacion-con-felipe-vi-como-padrino
https://www.expansion.com/empresas/inmobiliario/2023/05/25/646f0937468aebe4638b4889.html
https://www.expansion.com/mercados/2023/05/08/64588c18468aeb03188b4616.html
https://www.expansion.com/mercados/cronica-bolsa/2023/05/09/6459e125468ae

https://sincroguia-tv.expansion.com/parrilla/2023-05-31
Article `download()` failed with 404 Client Error: Not Found for url: https://sincroguia-tv.expansion.com/parrilla/2023-05-31 on URL https://sincroguia-tv.expansion.com/parrilla/2023-05-31
https://www.expansion.com/empresas/2023/05/20/6467a71ce5fdea66388b45eb.html
https://www.expansion.com/mercados/cronica-bolsa/2023/04/14/6438ea7be5fdead1098b458c.html
https://www.expansion.com/mercados/2023/05/25/646f36f6e5fdead5558b456f.html?intcmp=WIDPRMPRT
https://www.expansion.com/mercados/cronica-bolsa/2023/04/17/643cdf64468aeb5b238b4656.html
https://www.expansion.com/empresas/banca/2023/05/25/646e93fce5fdeac62f8b4575.html
https://datosmacro.expansion.com/analisis/actualidad/20190409/viviendas-madrid
https://www.expansion.com/mercados/cronica-bolsa/2023/05/24/646da6ea468aeb22568b472b.html
https://sincroguia-tv.expansion.com/programas/deportes-deportes-noche--L8jZ-SPA
https://datosmacro.expansion.com/analisis/actualidad/20170801/diferencias-d

In [29]:
df=news.show_df()

df.head()

Unnamed: 0,url,authors,title,publish_date,text,top_image,keywords,summary,video,polarity,subjectivity,negativity,neutrality,positivity,compound_score
0,https://www.expansion.com/mercados/cronica-bol...,"[""Alejandro S\u00e1nchez"", ""M. De La Cruz""]",El Ibex espera la nueva oleada de resultados e...,2023-04-24,"Jornada de transición en las Bolsas europeas, ...",https://phantom-expansion.unidadeditorial.es/8...,"[""nueva"", ""espera"", ""puntos"", ""ibex"", ""resulta...","Hasta entonces, el optimismo macro de semanas ...",[],0.0,0.175,0.002,0.962,0.036,0.9786
1,https://www.expansion.com/economia-sostenible/...,"[""Pedro Biurrun""]","Sostenibilidad, el talón de Aquiles de la empr...",2023-05-25,Dreamstime\n\nCrecen las exigencias regulatori...,https://phantom-expansion.unidadeditorial.es/2...,"[""materia"", ""empresas"", ""sostenibilidad"", ""emp...","DreamstimeCrecen las exigencias regulatorias, ...",[],0.204167,0.283333,0.028,0.972,0.0,-0.296
2,https://datosmacro.expansion.com/analisis/brev...,[],"IPC y PIB, no es tan complicado como lo pintan...",2016-12-16,Los últimos estudios indican que el español me...,https://datosmacro.expansion.com/sites/default...,"[""2023"", ""tan"", ""complicado"", ""pintan"", ""pib"",...",Los últimos estudios indican que el español me...,[],0.0,0.0,0.019,0.981,0.0,-0.5267
3,https://www.expansion.com/mercados/cronica-bol...,"[""Mauricio Skrycky"", ""Alejandro S\u00e1nchez"",...",El Ibex cierra plano y se descuelga de las sub...,2023-05-18,Las Bolsas europeas han celebrado con subidas ...,https://phantom-expansion.unidadeditorial.es/9...,"[""descuelga"", ""ibex"", ""subidas"", ""europa"", ""pl...",Jornada en la que se han impuesto las compras ...,[],0.0,0.0,0.005,0.956,0.04,0.9709
4,https://datosmacro.expansion.com/analisis/actu...,[],El engaño del tipo máximo 2023,2016-04-01,Afganistán Albania Alemania Andorra Angola Ant...,https://datosmacro.expansion.com/sites/default...,"[""2023"", ""tipo"", ""rep\u00fablica"", ""guinea"", ""...",Afganistán Albania Alemania Andorra Angola Ant...,[],0.0,0.0,0.0,1.0,0.0,0.0


In [35]:
len(df)

182

In [32]:
df.to_csv('../data/noticias.csv', index=False)

# Psycopg2

In [33]:
import psycopg2

In [34]:
# Read the stored articles straight through psycopg2, without SQLAlchemy.
# NOTE(review): credentials are hardcoded here, and neither `cursor` nor
# `conexion` is closed in this cell — close them once they are no longer needed.
try:
    conexion=psycopg2.connect(user='iudh',
                              password='password',
                              host='localhost',
                              port='5432',
                              database='articles')
    
    cursor=conexion.cursor()
    
    query='select * from articles.article'
    
    cursor.execute(query)
    
    data=cursor.fetchall()
    
    # Index 5 is the sixth column of each row; the printed values are image URLs,
    # presumably `top_image` given the column order of the Articles model — verify.
    for e in data:
        print(e[5])
        
# Any failure (connection, query or row access) is reported and swallowed.
except (Exception, psycopg2.Error) as error:
    print('Error cogiendo los datos bla bla bla...', error)

https://phantom-expansion.unidadeditorial.es/8c8d8d48f04cc5aa1da6304c3a69009a/crop/0x0/2044x1363/resize/1200/f/jpg/assets/multimedia/imagenes/2023/04/21/16820559917314.jpg
https://phantom-expansion.unidadeditorial.es/2c6a5444251ab1c3a5105319124394ee/crop/0x0/2044x1363/resize/1200/f/jpg/assets/multimedia/imagenes/2023/05/24/16849266178614.jpg
https://datosmacro.expansion.com/sites/default/files/varios/art/2016/12/ipc-pib-f.png
https://phantom-expansion.unidadeditorial.es/9055bf676b28bcdfa888fa834fe73abb/crop/0x0/2044x1363/resize/1200/f/jpg/assets/multimedia/imagenes/2023/04/24/16823355930650.jpg
https://datosmacro.expansion.com/sites/default/files/varios/art/2016/04/impuestos-rojo-portada.png
https://phantom-expansion.unidadeditorial.es/fb7378312f12a714bcbe20a0d163d95e/crop/0x0/2044x1363/resize/1200/f/jpg/assets/multimedia/imagenes/2023/05/25/16850058751965.jpg
https://phantom-expansion.unidadeditorial.es/33e0f6a2c6006bc40f0b5b27856e30fa/resize/1200/f/jpg/assets/multimedia/imagenes/2023