# Newspaper

Scraping de noticias con la librería `newspaper3k`.

https://pypi.org/project/newspaper3k/

In [None]:
%pip install nltk

In [None]:
import nltk

# A bare nltk.download() opens the interactive downloader, which blocks a
# non-interactive notebook run. Fetch only the resources used further down:
#   - 'punkt'     : sentence tokenizer needed by newspaper's Article.nlp()
#   - 'stopwords' : corpus read by stopwords.words('spanish')
for recurso in ('punkt', 'stopwords'):
    nltk.download(recurso)

**Artículo**

In [None]:
%pip install newspaper3k

In [None]:
from newspaper import Article

In [None]:
url='https://www.elmundo.es/tecnologia/2020/12/14/5fd755b0fdddff24688b45c2.html'

In [None]:
article=Article(url)

In [None]:
article.download()

article.parse()

article.nlp()

In [None]:
#article.html

In [None]:
article.authors

In [None]:
article.publish_date

In [None]:
article.text[:200]

In [None]:
article.top_image

In [None]:
article.movies

In [None]:
article.keywords

In [None]:
article.summary

In [None]:
article.title

**Desde la web**

In [None]:
import newspaper

# Build a newspaper "source" for the whole site; this discovers its article URLs.
cnn_paper=newspaper.build('http://cnn.com')

# Print the URL of every article found on the site.
# NOTE(review): the original comment called `articles` a generator; in
# newspaper3k it appears to be a plain list of Article objects - confirm.
# The loop variable rebinds `article`, which earlier in this notebook held the
# single El Mundo Article instance.
for article in cnn_paper.articles:
    print(article.url)

In [None]:
for category in cnn_paper.category_urls():
    print(category)

# Newspaper scraper

In [None]:
# Seed pages (mostly IBEX 35 / stock-market sections of Spanish news sites).
# Each entry is passed to Crawler.rellena_tablas() by the `for e in BASE_URLS`
# loop further down in this notebook.
BASE_URLS=['https://www.expansion.com/mercados/cronica-bolsa.html',
            'https://www.bolsamania.com/indice/IBEX-35/noticias',
            'https://cincodias.elpais.com/tag/ibex_35/',
            'https://www.estrategiasdeinversion.com/especial/noticias-del-ibex-35',
            'https://es.investing.com/indices/spain-35-news',
            'https://www.abc.es/economia/bolsa/ibex-35/',
            'https://www.20minutos.es/minuteca/ibex-35/',
            'https://cadenaser.com/tag/ibex_35/',
            'https://noticiasibex35.com/',
            'https://okdiario.com/tag/ibex-35/',
            'https://www.antena3.com/noticias/temas/ibex35-1',
            'https://www.dailyfx.com/espanol/ibex-35',
            'https://www.infobolsa.es/news',
            'https://www.larazon.es/tags/ibex-35/',
            'https://www.elperiodico.com/es/temas/ibex-35-8003',
            'https://es.finance.yahoo.com/quote/%5Eibex/',
            'https://www.libertaddigital.com/empresas/ibex-35/',
            'https://www.hosteltur.com/tag/ibex-35',
            'https://www.elconfidencial.com/mercados/',
            'https://noticiasdebolsa.es/',
            'https://www.finanzas.com/',
            'https://www.ennaranja.com/tag/ibex/',
            'https://www.rtve.es/temas/bolsa/1079/',
            'https://www.lavanguardia.com/economia']

In [None]:
import newspaper
import requests as req
from nltk.corpus import stopwords
stop_words_sp=set(stopwords.words('spanish'))
from bs4 import BeautifulSoup as bs
import json

from sqlalchemy import create_engine, Column, Float, Integer, JSON, DateTime, Text
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import DDL

import pandas as pd


# ANALISIS DE SENTIMIENTO
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
#!sudo -u postgres createdb articles

In [None]:
SCHEMA='articles'

In [None]:
conn_str=f'postgresql+psycopg2://postgres:password@localhost:5432/{SCHEMA}'

In [None]:
Base=declarative_base()

class Articles(Base):
    """ORM model for one scraped news article.

    Maps to table ``article`` inside the schema named by ``SCHEMA``. Rows are
    created by ``Crawler.rellena_tablas``; the article URL is the primary key,
    so a given URL can be stored only once.
    """
    __tablename__='article'
    __table_args__={'schema': SCHEMA}
    
    # Identity and metadata taken from the parsed newspaper Article.
    url=Column(Text(), primary_key=True)
    authors=Column(Text())          # JSON-encoded list of author names
    title=Column(Text())
    publish_date=Column(DateTime)
    text=Column(Text())             # article body used for the sentiment scores
    top_image=Column(Text())
    keywords=Column(JSON)           # newspaper keywords minus Spanish stop words
    summary=Column(Text())
    video=Column(JSON)              # taken from the Article's `movies` attribute
    # TextBlob sentiment of `text`.
    polarity=Column(Float)
    subjectivity=Column(Float)
    # VADER polarity_scores() of `text`: 'neg', 'neu', 'pos', 'compound'.
    negativity=Column(Float)
    neutrality=Column(Float)
    positivity=Column(Float)
    compound_score=Column(Float)

In [None]:
class Crawler:
    """Crawl news sites with newspaper3k and store scored articles in a database.

    The crawler owns a SQLAlchemy engine (``motor``) and session (``sesion``)
    and reads and writes a single mapped table (``Table``) inside ``schema``.
    """

    # Seconds to wait for the plain-HTTP fallback request before giving up.
    TIMEOUT = 10

    def __init__(self, str_conn, schema, Table):
        """Connect to the database and make sure the target schema exists.

        Args:
            str_conn: SQLAlchemy connection URL.
            schema: Name of the database schema that holds the table.
            Table: Declarative model class mapped to the article table.
        """
        print('Conectando a base de datos...')
        self.schema = schema
        self.Table = Table
        self.motor = create_engine(str_conn)
        self.sesion = sessionmaker(bind=self.motor)()
        self.motor.execute(DDL(f'create schema if not exists {self.schema}'))
        print('Conectado.')

    def crea_tablas(self):
        """Create the mapped table unless it already exists."""
        if not self.motor.dialect.has_table(self.motor, self.Table.__tablename__, schema=self.schema):
            print('Creando tabla...')
            self.Table.__table__.create(self.motor)
        else:
            print('Tabla ya existe.')

    def rellena_tablas(self, base_url):
        """Crawl ``base_url`` and upsert one row per article found there.

        Articles that fail to download/parse, or whose row cannot be written,
        are reported with ``print`` and skipped; the crawl then continues.

        Args:
            base_url: Front page or section URL handed to ``newspaper.build``.
        """
        print('Rellenando tablas...')

        paper = newspaper.build(base_url, memoize_articles=False)
        urls = {e.url for e in paper.articles}  # de-duplicate discovered URLs

        # Built once per crawl instead of once per article.
        vader = SentimentIntensityAnalyzer()

        for art in urls:
            print(art)

            item = self._crea_item(art, vader)
            if item is None:
                continue

            # merge() inserts or updates by primary key, so a URL that is
            # already stored (e.g. the same site crawled twice) no longer
            # raises IntegrityError. Committing per article keeps one bad row
            # from discarding the rest, and rollback() leaves the session
            # usable afterwards.
            try:
                self.sesion.merge(item)
                self.sesion.commit()
            except Exception as ex:
                self.sesion.rollback()
                print(ex)

    def _crea_item(self, url, vader):
        """Download, parse and score one article; return a row or None on failure."""
        news = newspaper.Article(url)

        try:
            news.download()
            news.parse()
            news.nlp()
        except Exception as ex:
            print(ex)
            return None

        text = news.text or self._texto_de_respaldo(url)

        # NOTE(review): TextBlob's default analyzer and VADER appear to rely on
        # English lexicons; validate the scores they give for Spanish text.
        sentiment = TextBlob(text).sentiment
        scores = vader.polarity_scores(text)

        # Use the table class given to the constructor, not a hard-coded model.
        return self.Table(
            url=url,
            authors=json.dumps(news.authors),  # Text column: store as JSON text
            title=news.title,
            publish_date=news.publish_date,
            text=text,
            top_image=news.top_image,
            # JSON columns serialise Python objects themselves; dumping them
            # first would store a JSON *string* instead of a JSON array.
            keywords=[key for key in news.keywords if key not in stop_words_sp],
            summary=news.summary,
            video=list(news.movies),
            polarity=sentiment[0],
            subjectivity=sentiment[1],
            negativity=scores['neg'],
            neutrality=scores['neu'],
            positivity=scores['pos'],
            compound_score=scores['compound'],
        )

    def _texto_de_respaldo(self, url):
        """Return the visible <body> text of ``url``, or '' if it cannot be fetched."""
        try:
            resp = req.get(url, timeout=self.TIMEOUT)
            resp.raise_for_status()
        except req.RequestException as ex:
            print(ex)
            return ''

        body = bs(resp.content, 'html.parser').find('body')
        return body.get_text() if body is not None else ''

    def borra_tablas(self):
        """Drop the mapped table if it exists."""
        if self.motor.dialect.has_table(self.motor, self.Table.__tablename__, schema=self.schema):
            print('Borrando tabla...')
            self.Table.__table__.drop(self.motor)

    def show_df(self):
        """Return the whole table as a pandas DataFrame.

        Column names come from the result set itself, so they always match the
        order of the returned values. A separate ``information_schema`` lookup
        has no guaranteed ordering and is not restricted to this schema.
        """
        result = self.motor.execute(f'select * from {self.schema}.{self.Table.__tablename__}')
        columns = list(result.keys())
        return pd.DataFrame(result.fetchall(), columns=columns)


In [None]:
news=Crawler(conn_str, SCHEMA, Articles)

In [None]:
#news.borra_tablas()

In [None]:
news.crea_tablas()

In [None]:
news.rellena_tablas('https://www.lavanguardia.com/economia')

In [None]:
for e in BASE_URLS:
    news.rellena_tablas(e)

In [None]:
df=news.show_df()

df.head()

In [None]:
len(df)

In [None]:
from pathlib import Path

# to_csv() does not create missing directories, so make sure 'data/' exists
# before writing the export.
salida = Path('data') / 'noticias.csv'
salida.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(salida, index=False)

# Psycopg2

In [None]:
import psycopg2

In [None]:
# Read the stored articles straight through psycopg2 (no SQLAlchemy involved).
# NOTE(review): credentials are hard-coded and the user differs from the one in
# the SQLAlchemy connection string used elsewhere in this notebook - confirm.
conexion = None
try:
    conexion = psycopg2.connect(user='iudh',
                                password='password',
                                host='localhost',
                                port='5432',
                                database='articles')

    # The cursor context manager closes the cursor even if the query fails.
    with conexion.cursor() as cursor:
        cursor.execute('select * from articles.article')
        data = cursor.fetchall()

    for e in data:
        # Sixth column of each row; presumably `top_image` if the table keeps
        # the column order declared in the model - verify.
        print(e[5])

except psycopg2.Error as error:
    print('Error cogiendo los datos bla bla bla...', error)
finally:
    # Always release the connection; the original never closed it.
    if conexion is not None:
        conexion.close()