In [1]:
# %pip install spacy
%pip install sentence-transformers
# %pip install nltk


Collecting sentence-transformers
  Using cached sentence_transformers-5.2.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<6.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.57.6-py3-none-any.whl.metadata (43 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.10.0-cp313-cp313-win_amd64.whl.metadata (31 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Using cached huggingface_hub-1.3.3-py3-none-any.whl.metadata (13 kB)
  Using cached huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers<6.0.0,>=4.41.0->sentence-transformers)
  Using cached tokenizers-0.22.2-cp39-abi3-win_amd64.whl.metadata (7.4 kB)
Collecting safetensors>=0.4.3 (from transformers<6.0.0,>=4.41.0->sentence-transformers)
  Using cached safetensors-0.7.0-cp38-abi3-win_amd64.whl.metadata (4.2 kB)
Using cached sentence_transformers-5.2.0-py3-none-any.whl (493 kB)
Using cached transf



In [2]:
# ==============================
# PIPELINE COMPLETO: Scraping + Preprocesamiento + Identificación de Tema
# ==============================

import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# ------------------------------
# 1️⃣ Scraping y extracción de texto
# ------------------------------
url = "https://es.wikipedia.org/wiki/Python"

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Título del artículo
title = soup.find("h1", {"id": "firstHeading"}).text

# Contenido principal: solo párrafos
content_div = soup.find("div", {"id": "mw-content-text"})
paragraphs = content_div.find_all("p")
article_text = "\n".join([p.get_text().strip() for p in paragraphs if p.get_text().strip()])

print(f"--- Título ---\n{title}\n")
print(f"--- Texto del artículo (primeros 500 caracteres) ---\n{article_text[:500]}\n")

# ------------------------------
# 2️⃣ Preprocesamiento del texto
# ------------------------------

# 2a. NLTK
nltk.download("punkt")
nltk.download("stopwords")

tokens_nltk = word_tokenize(article_text.lower())
tokens_clean_nltk = [t for t in tokens_nltk if t.isalpha() and t not in stopwords.words("spanish")]
print(f"Tokens NLTK (primeros 20): {tokens_clean_nltk[:20]}")

# 2b. SpaCy
nlp = spacy.load("es_core_news_sm")
doc = nlp(article_text)
tokens_clean_spacy = [token.lemma_.lower() for token in doc if token.is_alpha and not token.is_stop]
print(f"Tokens SpaCy (primeros 20): {tokens_clean_spacy[:20]}")

# ------------------------------
# 3️⃣ Identificación del tema
# ------------------------------

# --- TF-IDF ---
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform([article_text])

# Palabras más relevantes por TF-IDF
tfidf_feature_names = vectorizer_tfidf.get_feature_names_out()
tfidf_scores = X_tfidf.toarray()[0]
top_words = sorted(zip(tfidf_feature_names, tfidf_scores), key=lambda x: x[1], reverse=True)[:10]
print(f"\nTop palabras TF-IDF: {top_words}")

# --- Embeddings HuggingFace ---
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased')
embedding = model.encode([article_text])
print(f"Embedding shape: {embedding.shape}")

# Si tuvieras más artículos, podrías calcular similitud:
# embeddings = model.encode(list_of_articles)
# similarity_matrix = cosine_similarity(embeddings)

print("\n--- Pipeline completado ---")


  soup = BeautifulSoup(response.text, "html.parser")


AttributeError: 'NoneType' object has no attribute 'text'