In [1]:
import os
import re
import time
import random
import glob
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

# Function to scrape Meneame.net
def scrape_meneame(max_pages=50, save_interval=50, last_scraped_date=None):
    """Scrape the Meneame.net listing pages and hand the collected stories to ``save_new_data``.

    Pages ``1..max_pages`` of ``https://meneame.net/?page=N`` are requested in
    order, with a random 1-2 second pause between pages. Each story found in
    the ``#newswrap`` container becomes one dict (id, title, votes, karma,
    dates, links, ...). A page that cannot be fetched is reported and skipped.

    Args:
        max_pages: Highest page number to request (numbering starts at 1).
        save_interval: Currently unused; kept so existing callers that pass it
            keep working.
        last_scraped_date: Optional ``"%Y-%m-%d %H:%M:%S"`` string. Scraping
            stops at the first story whose published date is less than or
            equal to it. Stories collected before that point (including those
            on the same page) are kept. ``None`` disables the cutoff.

    Returns:
        None. Collected stories are passed to ``save_new_data``; a message is
        printed either way.
    """
    base_url = "https://meneame.net"
    request_timeout = 30  # seconds; without a timeout a stalled connection blocks forever
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
        'Accept-Language': 'es-ES,es;q=0.9',
        'Referer': 'https://www.google.com'
    }
    results = []

    def scrape_page(page_number):
        """Fetch one listing page and return ``(entries, reached_known_news)``."""
        url = f"{base_url}/?page={page_number}"

        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as e:
            # Treat a network failure like a bad status code: report it and
            # skip the page, so stories collected so far are not lost.
            print(f"❌ Error en {url}: {e}")
            return [], False

        if response.status_code != 200:
            print(f"❌ Error en {url}: {response.status_code}")
            return [], False

        soup = BeautifulSoup(response.text, "lxml")
        return extract_news(soup)

    def extract_news(soup):
        """Parse every story on a page.

        Returns ``(entries, reached_known_news)``; the flag is True when a
        story at or before ``last_scraped_date`` was found, in which case
        ``entries`` holds only the stories that preceded it on the page.
        """
        newswrap = soup.find(id="newswrap")
        if not newswrap:
            return [], False

        news_summaries = newswrap.find_all(class_="news-summary")
        new_entries = []

        for news_summary in news_summaries:
            try:
                news_body = news_summary.find(class_="news-body")
                if not news_body:
                    continue

                news_id = int(news_body.get("data-link-id"))
                center_content = news_body.find_next(class_="center-content")
                title_link = center_content.find("h2").find("a")
                title = title_link.text.strip()
                source_link = title_link["href"]

                content_div = news_body.find("div", class_="news-content")
                content = content_div.text.strip() if content_div else ""

                news_submitted = center_content.find("div", class_="news-submitted")
                # The last span carrying a data-ts attribute is used as the publication time.
                published_timestamp = int(news_submitted.find_all("span", attrs={"data-ts": True})[-1].get("data-ts"))
                # fromtimestamp() without tz yields the machine's local time.
                published_date = datetime.fromtimestamp(published_timestamp).strftime("%Y-%m-%d %H:%M:%S")

                # Stop if we reach already scraped news, but keep the newer
                # stories already collected from this page. Both values share
                # the same zero-padded format, so string comparison is chronological.
                if last_scraped_date and published_date <= last_scraped_date:
                    return new_entries, True

                user_link = news_submitted.find("a", href=re.compile("/user/.+/history"))
                user = user_link.text.strip() if user_link else "Desconocido"

                source_span = news_submitted.find("span", class_="showmytitle")
                source = source_span.text.strip() if source_span else "Desconocido"

                news_details = news_body.find_next(class_="news-details")
                comments = int(news_details.select_one("a.comments").get("data-comments-number"))
                positive_votes = int(news_details.select_one("span.positive-vote-number").text)
                anonymous_votes = int(news_details.select_one("span.anonymous-vote-number").text)
                negative_votes = int(news_details.select_one("span.negative-vote-number").text)
                karma = int(news_details.select_one("span.karma-number").text)
                category = news_details.select_one("a.subname").text.strip()

                clicks_span = news_body.find("span", id=f"clicks-number-{news_id}")
                clicks = int(clicks_span.text.strip()) if clicks_span else 0

                # An HTML id cannot contain spaces, so the original lookup
                # ("a-votes-<id> ga-event") can hardly ever match. It is tried
                # first for compatibility, then the plain id is used.
                # NOTE(review): presumably "ga-event" is a class on that
                # anchor - confirm against the live markup.
                votes_a = (
                    news_body.find("a", id=f"a-votes-{news_id} ga-event")
                    or news_body.find("a", id=f"a-votes-{news_id}")
                )
                votes_text = votes_a.text.strip() if votes_a else ""
                meneos = int(votes_text) if votes_text.isdigit() else 0

                story_link = news_summary.find("a", href=re.compile("^/story/"))
                full_story_link = f"{base_url}{story_link['href']}" if story_link else "Desconocido"

                scraped_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

                new_entries.append({
                    "news_id": news_id, "title": title, "content": content, "full_story_link": full_story_link,
                    "meneos": meneos, "clicks": clicks, "karma": karma, "positive_votes": positive_votes,
                    "anonymous_votes": anonymous_votes, "negative_votes": negative_votes, "category": category,
                    "comments": comments, "published_date": published_date, "user": user, "source": source,
                    "source_link": source_link, "scraped_date": scraped_date
                })

            except Exception as e:
                # Best effort: a story with unexpected markup is reported and skipped.
                print(f"⚠️ Error procesando noticia: {e}")
                continue

        return new_entries, False

    for page in range(1, max_pages + 1):
        page_entries, reached_known_news = scrape_page(page)
        results.extend(page_entries)

        if reached_known_news:
            break

        # Random pause between requests to avoid hammering the site.
        time.sleep(random.uniform(1, 2))

    if results:
        save_new_data(results)
        print("✅ Nuevas noticias guardadas.")
    else:
        print("⚠️ No hay nuevas noticias.")

# Get the next available filename
def get_next_scraped_filename(directory="../00.data/scraped", pattern="meneame_scraped_*.csv"):
    """Return the path of the next numbered scrape file inside ``directory``.

    The directory is created when missing. Every path matching ``pattern`` is
    searched for ``meneame_scraped_<number>.csv``; the result uses the highest
    number found plus one, or 1 when no numbered file exists.
    """
    os.makedirs(directory, exist_ok=True)
    number_re = re.compile(r"meneame_scraped_(\d+)\.csv")

    highest = 0
    for path in glob.glob(os.path.join(directory, pattern)):
        found = number_re.search(path)
        if found is not None:
            highest = max(highest, int(found.group(1)))

    return os.path.join(directory, f"meneame_scraped_{highest + 1}.csv")

# Save only newly scraped data to a new file with the next number
def save_new_data(new_data, directory="../00.data/scraped"):
    """Write the stories not present in the latest scrape file to a new numbered CSV.

    Rows are matched against the most recent file in ``directory`` by
    ``news_id``. Comparing whole rows cannot work here: every row carries a
    fresh ``scraped_date`` and vote counters that change between runs, so no
    row would ever be recognised as already saved. When either side lacks a
    ``news_id`` column the whole-row comparison is used as a fallback.

    Args:
        new_data: List of story dicts (one per row).
        directory: Folder holding the ``meneame_scraped_*.csv`` files.

    Returns:
        None. Prints how many rows were written, or a notice when nothing new
        was found (in which case no file is created).
    """
    new_df = pd.DataFrame(new_data)
    if new_df.empty:
        print("⚠️ No new rows to save.")
        return

    has_id = "news_id" in new_df.columns
    if has_id:
        # The same story can show up on two consecutive pages if the listing
        # shifts while scraping; keep its first occurrence only.
        new_df = new_df.drop_duplicates(subset="news_id", keep="first")

    latest_file = get_latest_scraped_file(directory)
    new_rows = new_df

    if latest_file and os.path.exists(latest_file):
        existing_df = pd.read_csv(latest_file, encoding="utf-8")

        if has_id and "news_id" in existing_df.columns:
            already_saved = new_df["news_id"].isin(existing_df["news_id"])
        else:
            already_saved = new_df.apply(tuple, 1).isin(existing_df.apply(tuple, 1))

        new_rows = new_df[~already_saved]

        if new_rows.empty:
            print("⚠️ No new rows to save.")
            return

    new_filename = get_next_scraped_filename(directory)
    new_rows.to_csv(new_filename, index=False, encoding="utf-8")
    print(f"📁 {len(new_rows)} new rows saved in {new_filename}")

# Get the most recent scraped file
def get_latest_scraped_file(directory="../00.data/scraped", pattern="meneame_scraped_*.csv"):
    """Return the path matching ``pattern`` in ``directory`` with the newest modification time.

    Returns ``None`` when nothing matches.
    """
    candidates = glob.glob(os.path.join(directory, pattern))
    if not candidates:
        return None
    return max(candidates, key=os.path.getmtime)

# Get last scraped date
def get_last_scraped_date(directory="../00.data/scraped", pattern="meneame_scraped_*.csv"):
    """Return the largest ``scraped_date`` value stored in the latest scrape file.

    Args:
        directory: Folder holding the scrape CSV files.
        pattern: Glob pattern used to pick candidate files; it is forwarded to
            ``get_latest_scraped_file`` so a custom pattern is honoured.

    Returns:
        The maximum of the ``scraped_date`` column, or ``None`` when no file
        matches or the column holds no values.
    """
    latest_file = get_latest_scraped_file(directory, pattern)

    if not latest_file:
        return None

    df = pd.read_csv(latest_file, usecols=["scraped_date"], encoding="utf-8")
    # Drop missing cells so an all-empty column yields None rather than NaN.
    dates = df["scraped_date"].dropna()
    return dates.max() if not dates.empty else None

# Scrape until the latest date
def scrape_until_latest():
    """Run a scrape that stops at the date returned by ``get_last_scraped_date``.

    The cutoff is printed first, then ``scrape_meneame`` is invoked for up to
    50 pages with that cutoff.
    """
    cutoff = get_last_scraped_date()
    print(f"📅 Última fecha de scrapeo: {cutoff}")

    scrape_options = {
        "max_pages": 50,
        "save_interval": 5,
        "last_scraped_date": cutoff,
    }
    scrape_meneame(**scrape_options)


In [2]:
# Run the incremental scrape: reads the last stored scrape date, then scrapes up to that cutoff.
scrape_until_latest()

📅 Última fecha de scrapeo: 2025-03-04 14:12:57
📁 25 new rows saved in ../00.data/scraped/meneame_scraped_5.csv
✅ Nuevas noticias guardadas.


In [None]:
funciones.scrape(

In [None]:
######################################