In [81]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import random
import os

def scrape_meneame_comments(url: str) -> list:
    """
    Scrape the comments of a single Meneame story page.

    The page at ``url`` is downloaded and every ``div`` whose class attribute
    is exactly ``"threader zero"`` is searched for ``div.comment`` elements.

    Args:
        url: Absolute URL of the story page.

    Returns:
        A list with one dict per comment, with the string-valued keys
        ``comment_id``, ``user``, ``type`` ("parent" or "child"),
        ``content``, ``votos`` and ``karma``. An empty list is returned when
        the request fails or the server answers with a non-200 status; the
        failure is reported with ``print`` and never raised.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
        'Accept-Language': 'es-ES,es;q=0.9',
        'Referer': 'https://www.google.com'
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Network error: {e}")
        return []

    if response.status_code != 200:
        print(f"Error fetching {url}: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "lxml")

    comment_data = []
    for thread in soup.find_all('div', class_='threader zero'):
        for comment in thread.find_all('div', class_='comment'):
            comment_data.append(_parse_meneame_comment(comment))

    return comment_data


def _parse_meneame_comment_id(data_id):
    """Return the second hyphen-separated segment of ``data_id``, or "unknown"."""
    if not data_id:
        return "unknown"
    segments = data_id.split("-")
    # A value without a hyphen has no second segment. Falling back here avoids
    # an IndexError that would abort the scrape of the whole article.
    return segments[1] if len(segments) > 1 else "unknown"


def _parse_meneame_comment(comment):
    """Build the record dict for one ``div.comment`` element."""
    # Each element is looked up once and reused for both the test and the value.
    user_link = comment.find('a', class_='username')
    text_div = comment.find("div", class_="comment-text")
    votes_link = comment.find("a", class_="votes-counter")
    karma_span = comment.find("span", class_="votes-counter", id=lambda x: x and x.startswith("vk-"))

    # A comment nested inside a "threader-childs" container is a reply.
    is_reply = comment.find_parent("div", class_="threader-childs")

    return {
        "comment_id": _parse_meneame_comment_id(comment.get("data-id")),
        "user": user_link.text.strip() if user_link else "Anonymous",
        "type": "child" if is_reply else "parent",
        "content": text_div.get_text(strip=True) if text_div else "",
        "votos": votes_link.get_text(strip=True) if votes_link else "0",
        # The karma counter text carries a "K" marker that is stripped here.
        "karma": karma_span.get_text(strip=True).replace("K", "") if karma_span else "0",
    }

def extract_news(num_pages=3000, chunk_size=100_000, start_page=100):
    """
    Crawl Meneame listing pages and save the comments of every story found.

    Listing pages ``start_page`` through ``num_pages`` (inclusive) are
    fetched in order. For every story on a page the comments are scraped with
    ``scrape_meneame_comments`` and tagged with the story's ``news_id``.
    Comments are buffered in memory and written to
    ``../00.data/scraped/comments/comentarios_scraped_<n>.csv`` each time the
    buffer reaches ``chunk_size`` rows; whatever is left is written at the end.

    The file counter starts at 1 on every call, so files with the same names
    left by an earlier run in that directory are overwritten.

    Args:
        num_pages: Number of the last listing page to fetch.
        chunk_size: Minimum number of buffered comments that triggers a CSV
            write.
        start_page: Number of the first listing page to fetch. The default of
            100 matches the value that used to be hard-coded.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    base_url = "https://www.meneame.net/?page="

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
        'Accept-Language': 'es-ES,es;q=0.9',
        'Referer': 'https://www.google.com'
    }

    all_comments = []
    file_counter = 1

    # Define the directory & ensure it exists
    save_dir = "../00.data/scraped/comments/"
    os.makedirs(save_dir, exist_ok=True)

    # Compiled once; the pattern does not change between stories.
    story_href = re.compile("^/story/")

    try:
        for page in range(start_page, num_pages + 1):
            url = f"{base_url}{page}"
            print(f"Scraping page {page}: {url}")

            for news_summary in _fetch_news_summaries(url, headers, page):
                try:
                    news_body = news_summary.find(class_="news-body")
                    if not news_body:
                        continue

                    news_id = int(news_body.get("data-link-id", "0"))

                    story_link = news_summary.find("a", href=story_href)
                    if not story_link:
                        # Without a story URL there is nothing to request.
                        print(f"⚠️ No story link found for news {news_id}; skipping.")
                        continue
                    full_story_link = f"https://www.meneame.net{story_link['href']}"

                    # Scrape comments from the news article
                    comments = scrape_meneame_comments(full_story_link)

                    for comment in comments:
                        comment["news_id"] = news_id  # Attach news_id to each comment
                    all_comments.extend(comments)

                    # Only write to disk once the buffer holds chunk_size comments
                    if len(all_comments) >= chunk_size:
                        _save_comment_chunk(all_comments, save_dir, file_counter, "✅")
                        all_comments = []  # Reset list for next batch
                        file_counter += 1  # Increase file counter

                except Exception as e:
                    # Best effort: one broken story must not stop the crawl.
                    print(f"⚠️ Error processing news: {e}")
                    continue

            # Randomized delay to avoid bans. It also runs after a failed page,
            # so an erroring server is not hit again immediately.
            time.sleep(random.uniform(1, 3))
    finally:
        # Runs on normal completion and on interruption (e.g. KeyboardInterrupt),
        # so comments still in the buffer are not lost.
        if all_comments:
            _save_comment_chunk(all_comments, save_dir, file_counter, "")

    print("🎉 Scraping completed!")


def _fetch_news_summaries(url, headers, page):
    """Return the ``news-summary`` elements of one listing page, or [] on failure."""
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Network error: {e}")
        return []

    if response.status_code != 200:
        print(f"Error fetching {url}: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "lxml")
    newswrap = soup.find(id="newswrap")
    if not newswrap:
        print(f"⚠️ No news found on page {page}.")
        return []

    return newswrap.find_all(class_="news-summary")


def _save_comment_chunk(comments, save_dir, file_counter, marker):
    """Write ``comments`` to ``comentarios_scraped_<file_counter>.csv`` and report it."""
    df = pd.DataFrame(comments)
    filename = os.path.join(save_dir, f"comentarios_scraped_{file_counter}.csv")
    df.to_csv(filename, index=False)
    print(f"{marker} Saved {filename} with {len(comments)} comments.")

# Start the crawl: listing pages up to 3000, one CSV per 100,000 buffered comments
extract_news(chunk_size=100_000, num_pages=3000)


Scraping page 100: https://www.meneame.net/?page=100
Scraping page 101: https://www.meneame.net/?page=101
Scraping page 102: https://www.meneame.net/?page=102
Scraping page 103: https://www.meneame.net/?page=103
Scraping page 104: https://www.meneame.net/?page=104
Scraping page 105: https://www.meneame.net/?page=105
Scraping page 106: https://www.meneame.net/?page=106
Scraping page 107: https://www.meneame.net/?page=107
Scraping page 108: https://www.meneame.net/?page=108
Scraping page 109: https://www.meneame.net/?page=109
Scraping page 110: https://www.meneame.net/?page=110
Scraping page 111: https://www.meneame.net/?page=111
Scraping page 112: https://www.meneame.net/?page=112
Scraping page 113: https://www.meneame.net/?page=113
Scraping page 114: https://www.meneame.net/?page=114
Scraping page 115: https://www.meneame.net/?page=115
Scraping page 116: https://www.meneame.net/?page=116
Scraping page 117: https://www.meneame.net/?page=117
Scraping page 118: https://www.meneame.net/?pa

KeyboardInterrupt: 

In [78]:
# NOTE(review): `df` is not assigned in any cell visible here, and this cell's
# execution count (78) is lower than the scraping cell's (81). Presumably it was
# loaded from one of the saved CSVs — confirm, and add the loading cell so the
# notebook survives Restart & Run All.
df

Unnamed: 0,comment_id,user,type,content,votos,karma,news_id
0,42479167,ipanies,parent,"Miserables e hijoputas ha habido siempre, pero...",17,171,4037568
1,42479065,JackNorte,parent,Por sus actos les reconocereis.,8,84,4037568
2,42479151,tromperri,parent,Pues no lo entiendo… ¿esto no era culpa de el ...,7,75,4037568
3,42479082,jonolulu,parent,Lo de siempre: Solo les importan las victimas ...,5,62,4037568
4,42479066,Thornton,parent,¡Que aplauso a las víctinimas ni que leches! T...,5,59,4037568
...,...,...,...,...,...,...,...
10131,42462625,Katos,child,#66¿cómo?. ¿Pero qué dices?,0,10,4036537
10132,42462702,Nitros,child,"#66Ya, pero se hacen los locos y se creen que ...",4,6,4036537
10133,42462572,Natxelas_,parent,Pero a que son hombres? Ah?,0,6,4036537
10134,42462753,Battlestar,child,#79Dos hombres y una mujer,1,18,4036537


In [None]:
# Show only the listed columns, in this order, for every row of `df`.
df.loc[:, ["comment_id", "user", "content", "votos", "karma", "news_id"]]