In [15]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import pipeline
import requests
import trafilatura
# %pip install hf_xet
# import torch
# %pip install torch
%pip install readability-lxml


# # Descargar recursos necesarios de NLTK
nltk.download('punkt_tab')
nltk.download('gutenberg')
nltk.download('stopwords')

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt_tab to C:\Users\Jose
[nltk_data]     Gómez\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package gutenberg to C:\Users\Jose
[nltk_data]     Gómez\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Jose
[nltk_data]     Gómez\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
def generate_summary_a(article_text):
    summarizer = pipeline("text-generation", model="facebook/bart-large-cnn")
    
    # BART tiene límite de tokens, cortar si el texto es muy largo
    article_text = article_text[:1024]
    
    summary = summarizer(article_text, max_length=130, min_length=50, do_sample=False)
    return summary[0]['summary_text']

In [17]:
def classifier_a(data):
    from transformers import pipeline

    # Crear pipeline i definir model
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

    # Important si el text es en anglés, posar els topics també en anglés

    # Definició dels topics
    # Definició dels temes que pot tenir l'article
    # possible_labels = ["política", "deportes", "economía", "ciencia", "noticias","tecnología", "salud", "cultura", "guerra", "internacional"]
    
    possible_labels = ["politics", "sports", "economy", "science", "technology", "health", "culture", "war", "international"]


    article_text = " ".join(data)[:1024]
    result = classifier(article_text, candidate_labels=possible_labels, device=0)

    print("Topic més probable:", result['labels'][0])
    print("Scores:", list(zip(result['labels'], result['scores'])))

    return result

In [18]:
def title_article_preprocessing_a(clean_text):

    # 1. Toeknize del texte
    # tokens = word_tokenize(clean_text, language='spanish')
    tokens = word_tokenize(clean_text, language='english')
    
    # 2. Normalitzar a minúscules i eliminar puntuació
    tokens = [t.lower() for t in tokens if t.isalpha()]  # solo palabras
    
    # 3. Eliminar stopweords
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]
    
    # print(tokens[:50])

    return tokens


def title_article_preprocessing_b(clean_text):
    # ToDo: Implementar otra funcion de procesameinto de texto diferente 
    pass

In [19]:
import requests
from bs4 import BeautifulSoup
from readability import Document
from lxml import html

url = "https://link.springer.com/article/10.1186/s12915-020-00925-x"



def scrape_text(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    
    # 1. Intentar <article>
    article = soup.find("article")
    if article:
        text = article.get_text(separator="\n")
    else:
        # 2. Intentar <main>
        main = soup.find("main")
        if main:
            text = main.get_text(separator="\n")
        else:
            # 3. Fallback: todos los <p>, unir el bloque más largo
            paragraphs = soup.find_all("p")
            blocks = []
            current_block = []
            for p in paragraphs:
                current_block.append(p.get_text())
                # Separar bloques grandes por lógica simple
                if len(current_block) > 3:  
                    blocks.append("\n".join(current_block))
                    current_block = []
            if current_block:
                blocks.append("\n".join(current_block))
            # Elegir el bloque más largo (probable artículo)
            text = max(blocks, key=len) if blocks else ""
    
    # Limpiar líneas vacías
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    data = "\n".join(lines)

    return data


# def scrape_text_refined(url):
#     response = requests.get(url)
#     response.raise_for_status()

#     tree = html.fromstring(response.content)

#     # Buscar el contenedor principal del artículo (para Springer suele ser c-article-body)
#     article_divs = tree.xpath('//div[contains(@class,"c-article-body")]')
    
#     if article_divs:
#         paragraphs = article_divs[0].xpath('.//p//text()')
#         data = "\n".join([p.strip() for p in paragraphs if p.strip()])
#     else:
#         # Fallback: todos los <p> de la página
#         paragraphs = tree.xpath('//p//text()')
#         data = "\n".join([p.strip() for p in paragraphs if p.strip()])


#     return data

def scrape_text_advanced(url):
    try:
        # Descarregar la pàgina web amb un User-Agent vàlid
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        response = requests.get(url, headers=headers, timeout=15)
        
        # Forcem que la resposta sigui interpretada com a UTF-8
        response.encoding = 'utf-8' 
        
        if response.status_code == 200:
            # 2. Li passem el contingut ja descarregat a trafilatura per només obtenir el cos del text, l'article.
            clean_text = trafilatura.extract(response.text, include_comments=False)
            
            if clean_text:
                print(clean_text[:1000])  # Mostrar només els primers 1000 caràcters
                return clean_text
        
        return None

    except Exception as e:
        print(f"Error en la descarga: {e}")
        return None


    


In [20]:
execution_results = {}

def execution_1(url):
    data = scrape_text(url)
    tokens = title_article_preprocessing_a(data)
    results = classifier_a(tokens)
    resume = generate_summary_a(data)

    save_results(results, 1)
    print(f"Resum de l'article: {resume}")


def execution_2(url):
    data = scrape_text_advanced(url)
    tokens = title_article_preprocessing_a(data)
    results = classifier_a(tokens)
    resume = generate_summary_a(data)

    save_results(results, 2)
    print(f"Resum de l'article: {resume}")


# def execution_3(url):
#     data = scrape_text(url)
#     tokens = title_article_preprocessing_b(data)
#     results = classifier_a(tokens)
#     resume = generate_summary_a(data)

#     save_results(results, 3)
#     print(f"Resum de l'article: {resume}")


# def execution_4(url):
#     data = scrape_text(url)
#     tokens = title_article_preprocessing_a(data)
#     results = train_model(tokens)
#     resume = generate_summary_a(data)

#     save_results(results, 4)
#     print(f"Resum de l'article: {resume}")


# def execution_5(url):
#     data = scrape_text(url)
#     tokens = title_article_preprocessing_a(data)
#     results = classifier_a(tokens)
#     resume = generate_summary_b(data)

#     save_results(results, 5)
#     print(f"Resum de l'article: {resume}")


def save_results(results, num):
        execution_results[f"execution_{num}"] = {
        "topic": results['labels'][0],
        "score": results['scores'][0]
    }
    


In [21]:
def main():

    urls = ["https://link.springer.com/article/10.1186/s12915-020-00925-x"]

    for url in urls:
        print(f"############# Execution 1: URL{url}")
        execution_1(url)

        print(f"############# Execution 2: URL{url}")
        execution_2(url)

    print("Execution Results Summary:")
    for key, value in execution_results.items():
        print(f"{key}: Topic - {value['topic']}, Score - {value['score']}")



main()

############# Execution 1: URLhttps://link.springer.com/article/10.1186/s12915-020-00925-x


Loading weights:   0%|          | 0/515 [00:00<?, ?it/s]

Topic més probable: science
Scores: [('science', 0.6881455779075623), ('international', 0.07561494410037994), ('economy', 0.0668274536728859), ('culture', 0.0525495819747448), ('technology', 0.044419512152671814), ('health', 0.03472081199288368), ('war', 0.017296474426984787), ('sports', 0.01105914730578661), ('politics', 0.009366490878164768)]


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Please make sure the generation config includes `forced_bos_token_id=0`. 


Loading weights:   0%|          | 0/316 [00:00<?, ?it/s]

BartForCausalLM LOAD REPORT from: facebook/bart-large-cnn
Key                                                       | Status     |  | 
----------------------------------------------------------+------------+--+-
model.encoder.layers.{0...11}.fc2.bias                    | UNEXPECTED |  | 
model.encoder.layers.{0...11}.self_attn.k_proj.bias       | UNEXPECTED |  | 
model.encoder.layers.{0...11}.fc1.bias                    | UNEXPECTED |  | 
model.encoder.layers.{0...11}.self_attn_layer_norm.bias   | UNEXPECTED |  | 
model.encoder.layers.{0...11}.self_attn_layer_norm.weight | UNEXPECTED |  | 
model.encoder.layers.{0...11}.self_attn.q_proj.weight     | UNEXPECTED |  | 
model.encoder.layers.{0...11}.self_attn.out_proj.weight   | UNEXPECTED |  | 
model.encoder.layers.{0...11}.final_layer_norm.weight     | UNEXPECTED |  | 
model.encoder.layers.{0...11}.fc2.weight                  | UNEXPECTED |  | 
model.encoder.layers.{0...11}.final_layer_norm.bias       | UNEXPECTED |  | 
model.encoder.laye

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Passing `generation_config` together with generation-related arguments=({'min_length', 'max_length', 'do_sample'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.


KeyError: 'summary_text'

In [None]:
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
Note: you may need to restart the kernel to use updated packages.
