In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import random
import os

def scrape_meneame_comments(url):
    """
    Scrape the comments of a single Meneame story page.

    Args:
        url: URL of the story page to download.

    Returns:
        A list with one dict per ``<div class="comment">`` found inside the
        ``<div class="threader zero">`` containers of the page. Each dict has
        the keys ``comment_id``, ``user``, ``type`` ("parent" or "child"),
        ``content``, ``votos`` and ``karma``; all values are strings.
        An empty list is returned when the request fails or the response
        status is not 200 (the problem is printed, not raised).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
        'Accept-Language': 'es-ES,es;q=0.9',
        'Referer': 'https://www.google.com'
    }

    # Only the request itself can raise RequestException, so keep the try
    # body down to that single call.
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Network error: {e}")
        return []

    if response.status_code != 200:
        print(f"Error fetching {url}: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "lxml")
    threader_zero = soup.find_all('div', class_='threader zero')

    comment_data = []

    for thread in threader_zero:
        for comment in thread.find_all('div', class_='comment'):
            # The id is taken from the second "-"-separated piece of the
            # data-id attribute. If the attribute has no "-", fall back to the
            # raw value instead of raising IndexError (which would make the
            # caller drop every comment of this story).
            data_id = comment.get("data-id")
            if data_id:
                id_parts = data_id.split("-")
                comment_id = id_parts[1] if len(id_parts) > 1 else data_id
            else:
                comment_id = "unknown"

            # Look the username link up once instead of twice.
            user_link = comment.find('a', class_='username')
            user = user_link.text.strip() if user_link else "Anonymous"

            text_div = comment.find("div", class_="comment-text")
            comment_text = text_div.get_text(strip=True) if text_div else ""

            # A comment nested under a "threader-childs" container is a reply.
            parent_div = comment.find_parent("div", class_="threader-childs")
            comment_type = "child" if parent_div else "parent"

            votos_link = comment.find("a", class_="votes-counter")
            votos = votos_link.get_text(strip=True) if votos_link else "0"

            # The karma counter is the span whose id starts with "vk-"; the
            # "K" characters are stripped from its text.
            karma_span = comment.find("span", class_="votes-counter", id=lambda x: x and x.startswith("vk-"))
            karma = karma_span.get_text(strip=True).replace("K", "") if karma_span else "0"

            comment_data.append({
                "comment_id": comment_id,
                "user": user,
                "type": comment_type,
                "content": comment_text,
                "votos": votos,
                "karma": karma
            })

    return comment_data

def _save_comments_csv(comments, save_dir, file_counter):
    """Write ``comments`` to ``comentarios_scraped_<file_counter>.csv`` in ``save_dir`` and return the path."""
    filename = os.path.join(save_dir, f"comentarios_scraped_{file_counter}.csv")
    pd.DataFrame(comments).to_csv(filename, index=False)
    return filename


def extract_news(num_pages=3000, chunk_size=100_000, start_page=100):
    """
    Crawl Meneame listing pages and save the comments of every story found.

    Listing pages ``start_page`` through ``num_pages`` (inclusive) are
    downloaded. For each story on a page the comments are scraped with
    ``scrape_meneame_comments`` and tagged with the story's ``news_id``.
    Comments are buffered in memory and written to
    ``../00.data/scraped/comments/comentarios_scraped_<n>.csv`` each time the
    buffer holds at least ``chunk_size`` rows. Whatever is still buffered when
    the function exits is written too, including when the crawl is interrupted
    (e.g. by KeyboardInterrupt), so partial work is not lost.

    Args:
        num_pages: Number of the last listing page to scrape (inclusive).
        chunk_size: Minimum number of buffered comments that triggers a CSV write.
        start_page: Number of the first listing page to scrape. Defaults to
            100, the value that used to be hard-coded in the loop.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    base_url = "https://www.meneame.net/?page="

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
        'Accept-Language': 'es-ES,es;q=0.9',
        'Referer': 'https://www.google.com'
    }

    all_comments = []
    file_counter = 1

    # Define the directory & ensure it exists
    save_dir = "../00.data/scraped/comments/"
    os.makedirs(save_dir, exist_ok=True)

    try:
        for page in range(start_page, num_pages + 1):
            url = f"{base_url}{page}"
            print(f"Scraping page {page}: {url}")

            try:
                response = requests.get(url, headers=headers, timeout=10)
            except requests.exceptions.RequestException as e:
                print(f"Network error: {e}")
                continue

            if response.status_code != 200:
                print(f"Error fetching {url}: {response.status_code}")
                continue

            soup = BeautifulSoup(response.text, "lxml")
            newswrap = soup.find(id="newswrap")
            if not newswrap:
                print(f"No news found on page {page}.")
                continue

            for news_summary in newswrap.find_all(class_="news-summary"):
                # Broad catch on purpose: one malformed story must not abort
                # the whole crawl. KeyboardInterrupt is not an Exception, so it
                # still propagates to the outer finally.
                try:
                    news_body = news_summary.find(class_="news-body")
                    if not news_body:
                        continue

                    news_id = int(news_body.get("data-link-id", "0"))

                    story_link = news_summary.find("a", href=re.compile("^/story/"))
                    if not story_link:
                        # Without a story URL there is nothing to download;
                        # skip instead of requesting a placeholder string.
                        print(f"No story link found for news {news_id}; skipping.")
                        continue
                    full_story_link = f"https://www.meneame.net{story_link['href']}"

                    # Scrape comments from the news article
                    comments = scrape_meneame_comments(full_story_link)

                    for comment in comments:
                        comment["news_id"] = news_id  # Attach news_id to each comment
                        all_comments.append(comment)

                    # Flush to disk only once the buffer reaches chunk_size rows
                    if len(all_comments) >= chunk_size:
                        filename = _save_comments_csv(all_comments, save_dir, file_counter)
                        print(f"Saved {filename} with {len(all_comments)} comments.")

                        all_comments = []  # Reset list for next batch
                        file_counter += 1  # Increase file counter

                except Exception as e:
                    print(f"Error processing news: {e}")
                    continue

            time.sleep(random.uniform(1, 3))  # Randomized delay to avoid bans
    finally:
        # Save any remaining comments, also when the crawl was interrupted
        if all_comments:
            filename = _save_comments_csv(all_comments, save_dir, file_counter)
            print(f" Saved {filename} with {len(all_comments)} comments.")

    print("Scraping completed!")

# Launch the crawl: listing pages up to 3000, flushing to CSV once at least
# 100,000 comments are buffered
extract_news(
    num_pages=3000,
    chunk_size=100_000,
)


Scraping page 100: https://www.meneame.net/?page=100
Scraping page 101: https://www.meneame.net/?page=101
Scraping page 102: https://www.meneame.net/?page=102
Scraping page 103: https://www.meneame.net/?page=103
Scraping page 104: https://www.meneame.net/?page=104
Scraping page 105: https://www.meneame.net/?page=105
Scraping page 106: https://www.meneame.net/?page=106
Scraping page 107: https://www.meneame.net/?page=107
Scraping page 108: https://www.meneame.net/?page=108
Scraping page 109: https://www.meneame.net/?page=109
Scraping page 110: https://www.meneame.net/?page=110
Scraping page 111: https://www.meneame.net/?page=111
Scraping page 112: https://www.meneame.net/?page=112
Scraping page 113: https://www.meneame.net/?page=113
Scraping page 114: https://www.meneame.net/?page=114
Scraping page 115: https://www.meneame.net/?page=115
Scraping page 116: https://www.meneame.net/?page=116
Scraping page 117: https://www.meneame.net/?page=117
Scraping page 118: https://www.meneame.net/?pa

KeyboardInterrupt: 

In [87]:
# Load the processed Meneame news CSV and show it as the cell output
df0 = pd.read_csv(
    "../00.data/preprocesado/meneame_procesado.csv"
)
df0

Unnamed: 0,news_id,title,content,full_story_link,meneos,clicks,karma,positive_votes,anonymous_votes,negative_votes,category,comments,published_date,user,source,source_link,scraped_date,provincia,comunidad
0,4038391,Canadá también se replantea la compra de cazas...,Los cambios en el tablero de las relaciones di...,https://meneame.net/story/canada-tambien-repla...,0,147.0,338,47,23,0,Política y Sociedad,20,2025-03-15 19:00:03,exducados,elespanol.com,https://www.elespanol.com/omicrono/defensa-y-e...,2025-03-15 19:12:42,,
1,4038374,Las mujeres tuareg construyen un tataram,Serie de fotografías mostrando la construcción...,https://meneame.net/story/mujeres-tuareg-const...,0,812.0,406,28,19,0,Entretenimiento y Cultura,8,2025-03-15 18:00:03,Oktarr,maximilien-bruggmann.ch,http://www.maximilien-bruggmann.ch/es/PhotoSea...,2025-03-15 19:12:42,,
2,4038341,Dos turistas alemanes fueron detenidos por sem...,"Los casos de Jessica Brösche, retenida durante...",https://meneame.net/story/dos-turistas-alemane...,0,981.0,393,62,65,0,Política y Sociedad,27,2025-03-15 17:35:03,Fartón_Valenciano,nytimes.com,https://www.nytimes.com/es/2025/03/14/espanol/...,2025-03-15 19:12:42,,
3,4038312,"Tesla se hunde en Europa, el 94% de los aleman...",De acuerdo con una reciente encuesta realizada...,https://meneame.net/story/tesla-hunde-europa-9...,0,424.0,387,106,121,0,Política y Sociedad,58,2025-03-15 17:00:02,Grahml,forococheselectricos.com,https://forococheselectricos.com/2025/03/tesla...,2025-03-15 19:12:42,,
4,4038332,El europarlamentario Marc Botenga pone en su l...,El europarlamentario Marc Botenga regaña al mi...,https://meneame.net/story/europarlamentario-ma...,0,1211.0,394,77,87,1,Política y Sociedad,14,2025-03-15 16:25:02,quercus_ilex,x.com,https://x.com/BotengaM/status/1900180061199725...,2025-03-15 19:12:42,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,4036030,"Musk defiende sacar a EEUU de la OTAN y ""dejar...","El asesor de la Presidencia de EEUU, el magnat...",https://meneame.net/story/musk-defiende-sacar-...,0,1417.0,412,154,208,1,Política y Sociedad,125,2025-03-09 18:15:03,exmarginalexquoque,europapress.es,https://www.europapress.es/internacional/notic...,2025-03-15 19:13:06,,
300,4035958,Cambiar de Chrome a Firefox para ganar privaci...,Como ganar privacidad y evitar anuncios con Fi...,https://meneame.net/story/cambiar-chrome-firef...,0,2522.0,385,154,187,0,Tecnología y Ciencia,127,2025-03-09 17:40:02,eugeniodl,genbeta.com,https://www.genbeta.com/a-fondo/google-me-ha-d...,2025-03-15 19:13:09,,
301,4036074,Grupo para crear un nuevo Menéame abierto,Enlace a la sala general del grupo de Telegram...,https://meneame.net/story/grupo-crear-nuevo-me...,0,3344.0,450,214,220,13,Entretenimiento y Cultura,259,2025-03-09 16:50:03,niel,t.me,https://t.me/nuevomeneame/1,2025-03-15 19:13:09,,
302,4036081,Israelíes acusados de violar a una adolescente...,"Los israelíes gritan: ""La británica [víctima] ...",https://meneame.net/story/israelies-acusados-v...,0,3110.0,499,224,363,2,Política y Sociedad,47,2025-03-09 16:50:02,Tkachenko,x.com,https://x.com/receipts_lol/status/189851131271...,2025-03-15 19:13:09,,


In [None]:
# Display only the comment-level columns.
# NOTE(review): `df` is not defined in the visible cells — presumably a
# DataFrame of scraped comments loaded elsewhere in the notebook; confirm.
df[
    ["comment_id", "user", "content", "votos", "karma", "news_id"]
]