In [None]:
import os
import re
import time
import random
import glob
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

In [None]:
def scrape_meneame(max_pages=50, save_interval=50, last_scraped_date=None):
    """Scrape the Menéame front-page listing and persist the collected stories.

    Listing pages are requested one at a time starting at page 1. Every story
    found is turned into a dict; scraping ends when ``max_pages`` pages have
    been visited or when a story that is not newer than ``last_scraped_date``
    is reached.

    Args:
        max_pages: Maximum number of listing pages to request.
        save_interval: A checkpoint CSV with everything collected so far is
            written every ``save_interval`` pages. A falsy value disables
            checkpoints.
        last_scraped_date: Optional ``"%Y-%m-%d %H:%M:%S"`` string. The first
            story whose publication date is less than or equal to it stops the
            scrape. ``None`` disables this cut-off.

    Returns:
        None. Final results are written through ``save_data`` (numbered file in
        its default directory); checkpoints are written to the current working
        directory as ``meneame_scraped_temp_<page>.csv``.
    """
    base_url = "https://meneame.net"
    request_timeout = 30  # seconds; without it a stalled connection blocks forever
    results = []

    def scrape_page(page_number):
        """Download one listing page and return ``(entries, stop_reached)``."""
        url = f"{base_url}/?page={page_number}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
            'Accept-Language': 'es-ES,es;q=0.9',
            'Referer': 'https://www.google.com'
        }

        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as exc:
            # A network failure on one page must not discard what was already
            # collected, so it is reported and treated like an empty page.
            print(f"❌ Error en {url}: {exc}")
            return [], False

        if response.status_code != 200:
            print(f"❌ Error en {url}: {response.status_code}")
            return [], False

        soup = BeautifulSoup(response.text, "lxml")
        return extract_news(soup)

    def extract_news(soup):
        """Parse the stories of one page and return ``(entries, stop_reached)``.

        ``stop_reached`` is True when a story not newer than
        ``last_scraped_date`` was found; the entries parsed before it are still
        returned so they are not lost.
        """
        newswrap = soup.find(id="newswrap")
        if not newswrap:
            return [], False

        new_entries = []

        for news_summary in newswrap.find_all(class_="news-summary"):
            try:
                news_body = news_summary.find(class_="news-body")
                if not news_body:
                    continue

                news_id = int(news_body.get("data-link-id"))
                center_content = news_body.find_next(class_="center-content")
                title_link = center_content.find("h2").find("a")
                title = title_link.text.strip()
                source_link = title_link["href"]

                content_div = news_body.find("div", class_="news-content")
                content = content_div.text.strip() if content_div else ""

                news_submitted = center_content.find("div", class_="news-submitted")
                # The last timestamped <span> is taken as the publication time.
                published_timestamp = int(news_submitted.find_all("span", attrs={"data-ts": True})[-1].get("data-ts"))
                published_date = datetime.fromtimestamp(published_timestamp).strftime("%Y-%m-%d %H:%M:%S")

                # Both values share the same zero-padded format, so comparing
                # the strings orders them chronologically.
                if last_scraped_date and published_date <= last_scraped_date:
                    return new_entries, True

                user_link = news_submitted.find("a", href=re.compile("/user/.+/history"))
                user = user_link.text.strip() if user_link else "Desconocido"

                source_span = news_submitted.find("span", class_="showmytitle")
                source = source_span.text.strip() if source_span else "Desconocido"

                news_details = news_body.find_next(class_="news-details")
                comments = int(news_details.select_one("a.comments").get("data-comments-number"))
                positive_votes = int(news_details.select_one("span.positive-vote-number").text)
                anonymous_votes = int(news_details.select_one("span.anonymous-vote-number").text)
                negative_votes = int(news_details.select_one("span.negative-vote-number").text)
                karma = int(news_details.select_one("span.karma-number").text)
                category = news_details.select_one("a.subname").text.strip()

                clicks_span = news_body.find("span", id=f"clicks-number-{news_id}")
                clicks = int(clicks_span.text.strip()) if clicks_span else 0

                # NOTE(review): "ga-event" looks like a CSS class that slipped
                # into the id, which would make this lookup never match. The
                # original id is tried first and the plain id is a fallback;
                # confirm against the live markup.
                votes_a = news_body.find("a", id=f"a-votes-{news_id} ga-event")
                if votes_a is None:
                    votes_a = news_body.find("a", id=f"a-votes-{news_id}")
                meneos = int(votes_a.text.strip()) if votes_a else 0

                story_link = news_summary.find("a", href=re.compile("^/story/"))
                full_story_link = f"{base_url}{story_link['href']}" if story_link else "Desconocido"

                scraped_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

                new_entries.append({
                    "news_id": news_id, "title": title, "content": content, "full_story_link": full_story_link,
                    "meneos": meneos, "clicks": clicks, "karma": karma, "positive_votes": positive_votes,
                    "anonymous_votes": anonymous_votes, "negative_votes": negative_votes, "category": category,
                    "comments": comments, "published_date": published_date, "user": user, "source": source,
                    "source_link": source_link, "scraped_date": scraped_date
                })

            except Exception as e:
                # One malformed story should not abort the rest of the page.
                print(f"⚠️ Error procesando noticia: {e}")
                continue

        return new_entries, False

    for page in range(1, max_pages + 1):
        page_entries, stop_reached = scrape_page(page)
        results.extend(page_entries)

        if stop_reached:
            break

        if save_interval and results and page % save_interval == 0:
            # Checkpoints are written directly (not through save_data, whose
            # second argument is a directory) and outside the scraped-data
            # folder, so they are never mistaken for a finished scrape.
            checkpoint_path = f"meneame_scraped_temp_{page}.csv"
            pd.DataFrame(results).to_csv(checkpoint_path, index=False, encoding="utf-8")
            print(f"📁 Datos guardados en {checkpoint_path}")

        # Random pause between requests to avoid hammering the site.
        time.sleep(random.uniform(1, 2))

    if results:
        # save_data picks the next numbered file name and reports it itself.
        save_data(results)
        print("✅ Nuevas noticias guardadas")
    else:
        print("⚠️ No hay nuevas noticias.")

In [None]:
# df1 = pd.read_csv("../00.data/scraped/meneame_scraped_1.csv", encoding="utf-8")
# df2 = pd.read_csv("../00.data/scraped/meneame_scraped_2.csv", encoding="utf-8")
# df3 = pd.read_csv("../00.data/scraped/meneame_scraped_3.csv", encoding="utf-8")

# df = pd.concat([df1,df2,df3], ignore_index=True)

In [None]:
def get_next_scraped_filename(directory="../00.data/scraped", pattern="meneame_scraped_*.csv"):
    """
    Build the path of the next numbered file for storing scraped data.

    Files in ``directory`` matching ``pattern`` are inspected for a numeric
    suffix (``meneame_scraped_<n>.csv``); the result uses the highest number
    found plus one, or 1 when no numbered file exists.
    """
    numbered_name = re.compile(r"meneame_scraped_(\d+)\.csv")
    candidates = glob.glob(os.path.join(directory, pattern))

    # Paths that match the glob but carry no numeric suffix are ignored.
    hits = (numbered_name.search(path) for path in candidates)
    highest = max((int(hit.group(1)) for hit in hits if hit), default=0)

    return os.path.join(directory, f"meneame_scraped_{highest + 1}.csv")

def save_data(data, directory="../00.data/scraped"):
    """
    Write the scraped records to a new numbered CSV file inside ``directory``.

    The directory is created when missing, the file name comes from
    ``get_next_scraped_filename`` and the chosen path is printed.
    """
    # The folder has to exist before a file can be written into it.
    os.makedirs(directory, exist_ok=True)
    target_path = get_next_scraped_filename(directory)
    pd.DataFrame(data).to_csv(target_path, index=False, encoding="utf-8")
    print(f"📁 Datos guardados en {target_path}")

def get_last_scraped_date(directory="../00.data/scraped", pattern="meneame_scraped_*.csv"):
    """Return the most recent ``scraped_date`` stored in the saved CSV files.

    Args:
        directory: Folder searched for previously saved CSV files.
        pattern: Glob pattern selecting those files inside ``directory``.

    Returns:
        The largest non-missing ``scraped_date`` value (a string, as stored in
        the CSV), or ``None`` when no file matches, no file can be read, or no
        date is present.
    """
    date_columns = []
    for path in glob.glob(os.path.join(directory, pattern)):
        try:
            # Only the 'scraped_date' column is needed from each file.
            frame = pd.read_csv(path, usecols=["scraped_date"], encoding="utf-8")
        except (pd.errors.EmptyDataError, ValueError) as exc:
            # An empty file or one lacking the column is reported and skipped
            # instead of aborting the whole run.
            print(f"⚠️ No se pudo leer {path}: {exc}")
            continue
        date_columns.append(frame["scraped_date"])

    if not date_columns:
        return None

    # Missing values are dropped so the result is always a real date or None;
    # a NaN would be truthy and break the string comparison done by callers.
    dates = pd.concat(date_columns, ignore_index=True).dropna()
    return dates.max() if not dates.empty else None

def scrape_until_latest():
    """Run an incremental scrape bounded by the latest stored scrape date.

    The cut-off comes from ``get_last_scraped_date`` and is printed before
    ``scrape_meneame`` is started with at most 50 pages and a save interval
    of 5 pages.
    """
    cutoff = get_last_scraped_date()
    print(f"📅 Última fecha de scrapeo: {cutoff}")

    scrape_meneame(last_scraped_date=cutoff, save_interval=5, max_pages=50)

In [None]:
# Entry point of the notebook: looks up the latest stored scrape date and then
# starts the scraper, which performs HTTP requests and writes CSV files.
scrape_until_latest()

In [None]:
######################################