# Mapping Interconnections: Conspiracies Behind Wildfire Causes in Portugal (part 2 - Web sraping)

## Data preparation for visual analysis

1. Bib

In [14]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

nltk.download('stopwords')
nltk.download('punkt')

import matplotlib.pyplot as plt
import seaborn as sns
from nltk import FreqDist, ngrams
from collections import Counter
from nltk.util import ngrams
from matplotlib_venn import venn2, venn3


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bernardo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bernardo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


2. Carregar o CSV e Adicionar a Coluna "contents"

In [15]:
# Carregar o CSV original
df_artigos = pd.read_csv('detalhes_artigos_liquidos.csv')

# Adicionar uma coluna vazia 'contents'
df_artigos['contents'] = None

# Verificar as colunas
print(df_artigos.columns)


Index(['termo', 'site', 'ano', 'titulo', 'data', 'link', 'contents'], dtype='object')


3. Função para Remover Frases e Palavras Indesejadas [Improved] 

In [16]:
import re
import unicodedata
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

def clean_text(text, language='portuguese'):
    if not text:  # Handle None or empty inputs
        return ""
    
    # Normalize Unicode (e.g., remove accents)
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    
    # List of unwanted words/phrases
    unwanted_phrases = [
        r"\bsite\b", r"\butiliza\b", r"\bnavegar\b", r"\bconsentir\b", r"\butilizacao\b",
        r"\bsaiba\b", r"\balterar\b", r"\blocalizacao\b", r"\besqueceu\b", r"\bpalavra\b",
        r"\bchave\b", r"\btopicos\b", r"\bsobre\b", r"\buso\b", r"\buso\b", r"\bnavegador\b"
    ]
    
    # Remove unwanted phrases
    for phrase in unwanted_phrases:
        text = re.sub(phrase, '', text, flags=re.IGNORECASE)
    
    # Tokenize with a regex to handle words efficiently
    tokenizer = RegexpTokenizer(r'\b\w+\b')
    tokens = tokenizer.tokenize(text.lower())
    
    # Remove stopwords
    stop_words = set(stopwords.words(language))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    # Join the tokens back into a string
    cleaned_text = " ".join(filtered_tokens)
    return cleaned_text.strip()


4. Função de Scraping de Artigos [Improved] 

In [17]:
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

def scrape_and_clean_article(url):
    try:
        # Configure retries and timeout
        session = requests.Session()
        retries = Retry(total=3, backoff_factor=0.3, status_forcelist=[500, 502, 503, 504])
        session.mount('http://', HTTPAdapter(max_retries=retries))
        session.mount('https://', HTTPAdapter(max_retries=retries))
        
        # Fetch the webpage
        response = session.get(url, timeout=10)
        response.encoding = response.apparent_encoding  # Handle encoding
        response.raise_for_status()  # Check for HTTP request errors
        
        # Parse the HTML
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract title
        title_tag = soup.find('h1')
        title_text = title_tag.get_text(strip=True) if title_tag else "Título não encontrado"
        
        # Extract content
        content_div = soup.find('div', {'class': 'story__body'})  # Adaptable class name
        if content_div:
            content = "\n".join([p.get_text(strip=True) for p in content_div.find_all('p')])
        else:
            content = "\n".join([p.get_text(strip=True) for p in soup.find_all('p')])
        
        # Clean the extracted content
        cleaned_content = clean_text(content)
        return cleaned_content, title_text  # Return both content and title
    except Exception as e:
        print(f"Erro ao processar {url}: {e}")
        return None, None


5. Loop de Scraping e Preenchimento da Coluna "contents" (3m26seg)

In [18]:
for index, row in df_artigos.iterrows():
    url = row['link']
    cleaned_content = scrape_and_clean_article(url)
    if cleaned_content:
        df_artigos.at[index, 'contents'] = cleaned_content  # Preencher a coluna 'contents' com o conteúdo limpo


Erro ao processar https://arquivo.pt/noFrame/replay/20170815152855/http://www.sapo.pt/noticias/controlado-incendio-em-torre-residencial-no_598431e25e28b30728870262: 404 Client Error: Not Found for url: https://arquivo.pt/noFrame/replay/20170815152855/http://24.sapo.pt/noticias/internacional/artigo/controlado-incendio-em-torre-residencial-no-dubai_22757989.html


6. Guardar os dados em um CSV

In [19]:
df_artigos

Unnamed: 0,termo,site,ano,titulo,data,link,contents
0,incêndio,www.publico.pt,2017,Incêndio em Gaia - PÚBLICO,2017-01-18 04:36:37,https://arquivo.pt/noFrame/replay/201701180436...,(cookies cookies porpaulo pimenta 25 10 2016 1...
1,incêndio,www.publico.pt,2017,Complexo turístico Zmar recupera as infra-estr...,2017-01-10 20:21:26,https://arquivo.pt/noFrame/replay/201701102021...,(obras recuperacao edificios infra estruturas ...
2,incêndio,www.publico.pt,2017,Zambujeira do Mar - PÚBLICO,2017-01-11 02:26:52,https://arquivo.pt/noFrame/replay/201701110226...,(cookies cookies todos porcarlos dias 10 01 20...
3,incêndio,www.publico.pt,2017,Ferreira do Zêzere - PÚBLICO,2017-01-11 06:05:20,https://arquivo.pt/noFrame/replay/201701110605...,(cookies esti utilizazo cookies localizazo tod...
4,incêndio,www.publico.pt,2017,Andrzej Zulawski (1940 - 2016) - PÚBLICO,2017-06-22 05:00:38,https://arquivo.pt/noFrame/replay/201706220500...,(cookies cookies todos porluis miguel queirose...
...,...,...,...,...,...,...,...
522,seca,www.cmjornal.pt,2018,Vocalista dos Xutos surpreendido com homenagem...,2018-02-06 08:11:35,https://arquivo.pt/noFrame/replay/201802060811...,(tim vocalista banda portuguesa xutos pontapes...
523,seca,www.cmjornal.pt,2018,Coro canta ‘Xutos & Pontapés’ e recorda Zé Ped...,2018-02-02 22:52:34,https://arquivo.pt/noFrame/replay/201802022252...,(nao unico olhar ceu cantam xutos pontapes dom...
524,seca,www.cmjornal.pt,2018,Apple acaba com iPhone X já este verão - Tecno...,2018-02-07 03:30:17,https://arquivo.pt/noFrame/replay/201802070330...,(pensar comprar iphone x entao pense melhor ap...
525,seca,www.cmjornal.pt,2018,Shawn Mendes atua no festival Sudoeste na Zamb...,2018-02-06 23:30:27,https://arquivo.pt/noFrame/replay/201802062330...,(pub pub pub pub copyright 2018 todos direitos...


In [20]:
df_artigos.to_csv('detalhes_artigos_completo.csv', index=False)


print("Web scraping e limpeza concluídos! Dados guardados em 'detalhes_artigos_completo.csv'.")

Web scraping e limpeza concluídos! Dados guardados em 'detalhes_artigos_completo.csv'.
