# Reddit Scraper

Este notebook usa la **API oficial de Reddit con OAuth (PRAW)** para descargar **posts** y **comentarios** de un subreddit y los escribe en **data/raw/**, sin métricas ni preprocesamiento.

## Librerías

In [1]:
# Install the following dependencies from a terminal before running the notebook (this cell only contains comments, so running it installs nothing):
# ```bash
# pip install praw pandas tqdm python-dotenv
# ```

In [2]:
from dataclasses import dataclass, asdict
from typing import List, Optional, Tuple
from pathlib import Path
import os, json, re, time
from datetime import datetime, timezone
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
import praw

## Parámetros y Credenciales

In [3]:
# Load variables from a local .env file (if present) into the environment.
load_dotenv()
# Reddit API credentials.
# Before running this cell, define the following variables in a .env file:
# - `REDDIT_CLIENT_ID`
# - `REDDIT_CLIENT_SECRET`
# - `REDDIT_USER_AGENT`
# Or set them directly here:
# REDDIT_CLIENT_ID     = "your_client_id"
# REDDIT_CLIENT_SECRET = "your_client_secret"
# REDDIT_USER_AGENT    = "your_user_agent"
REDDIT_CLIENT_ID     = os.getenv("REDDIT_CLIENT_ID")
REDDIT_CLIENT_SECRET = os.getenv("REDDIT_CLIENT_SECRET")
REDDIT_USER_AGENT    = os.getenv("REDDIT_USER_AGENT")

# Parameters
SUBREDDIT                   = "USCIS"    # <-- target subreddit
SORT                        = "top"      # "hot" | "new" | "top" | "rising"
TIME_FILTER                 = "year"     # when SORT="top", use: "day"|"week"|"month"|"year"|"all"
MIN_POSTS                   = 20         # minimum number of posts to obtain
MAX_POSTS                   = 200        # maximum number of posts to obtain
MAX_COMMENTS_PER_POST       = 80         # cap on comments kept per post
DOWNLOAD_IMAGES             = False      # download images locally when the URL is an image
REQUEST_SLEEP_S             = 0.7        # pause between requests, to respect the rate limit
DATA_DIR                    = Path("data") # base data directory
RAW_DIR                     = DATA_DIR / "raw" # directory for unprocessed data

# Create the output directory if it does not exist yet.
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Verify credentials. The user agent is checked too: it is passed to
# praw.Reddit alongside the id/secret, so a missing value should fail here
# with a clear message rather than later inside the client.
_missing_credentials = [
    name
    for name, value in (
        ("REDDIT_CLIENT_ID", REDDIT_CLIENT_ID),
        ("REDDIT_CLIENT_SECRET", REDDIT_CLIENT_SECRET),
        ("REDDIT_USER_AGENT", REDDIT_USER_AGENT),
    )
    if not value
]
if _missing_credentials:
    raise RuntimeError(
        "Faltan credenciales: define "
        + ", ".join(_missing_credentials)
        + " en el entorno o .env"
    )

## Utilidades

In [4]:
def make_reddit() -> "praw.Reddit":
    """Build a PRAW Reddit client from the module-level credentials."""
    credentials = {
        "client_id": REDDIT_CLIENT_ID,
        "client_secret": REDDIT_CLIENT_SECRET,
        "user_agent": REDDIT_USER_AGENT,
    }
    return praw.Reddit(**credentials)


# Matches a jpg/jpeg/png/gif extension at the end of the URL, optionally
# followed by a query string; compiled once, case-insensitive.
_IMAGE_URL_RE = re.compile(r"\.(jpg|jpeg|png|gif)(?:\?.*)?$", flags=re.I)


def is_image_url(u: str) -> bool:
    """Tell whether a URL points at an image (jpg, jpeg, png or gif).

    A falsy value (None, empty string) is treated as "not an image".
    """
    candidate = u or ""
    return _IMAGE_URL_RE.search(candidate) is not None


def to_iso(ts_utc: Optional[float]) -> Optional[str]:
    """Convert a UTC epoch timestamp to an ISO 8601 string.

    Args:
        ts_utc: Seconds since the Unix epoch, or any value ``float()`` accepts.

    Returns:
        The timezone-aware (UTC) ISO 8601 representation, or None when the
        value is missing, not numeric, or outside the range datetime supports.
    """
    if ts_utc is None:
        return None
    try:
        return datetime.fromtimestamp(float(ts_utc), tz=timezone.utc).isoformat()
    except (TypeError, ValueError, OverflowError, OSError):
        # Only conversion failures mean "no usable timestamp"; any other
        # exception is a real bug and must not be silently mapped to None.
        return None


def rsleep():
    """Pause for REQUEST_SLEEP_S seconds; do nothing unless it is positive."""
    if not REQUEST_SLEEP_S > 0:
        return
    time.sleep(REQUEST_SLEEP_S)

## Esquema de salida cruda de Posts y Comentarios

In [5]:
@dataclass
class PostRow:
    """Raw record for a single Reddit post, as written to the posts output."""

    # Reddit id of the post.
    post_id: str
    # Post title, or None when the attribute is missing.
    title: Optional[str]
    # Author formatted as "u/<name>", or None when the post has no author.
    author: Optional[str]
    # Post score, or None when unavailable.
    score: Optional[int]
    # Comment count reported by the post, or None when unavailable.
    num_comments: Optional[int]
    # Creation time as an ISO 8601 UTC string (derived from created_utc).
    created: Optional[str]
    # Absolute URL: "https://www.reddit.com" followed by the post's permalink.
    permalink: str
    # Value of the post's is_self flag (False when missing).
    is_self: bool
    # Holds the post URL when it looks like an image; otherwise empty.
    image_urls: List[str]
    # Post body text; empty text is stored as None.
    selftext: Optional[str]
    # Subreddit name; falls back to the configured SUBREDDIT.
    subreddit: Optional[str]


@dataclass
class CommentRow:
    """Raw record for a single Reddit comment, linked to its post by post_id."""

    # Id of the post this comment was collected from.
    post_id: str
    # Reddit id of the comment ("" when the attribute is missing).
    comment_id: str
    # Author formatted as "u/<name>", or None when the comment has no author.
    author: Optional[str]
    # Creation time as an ISO 8601 UTC string (derived from created_utc).
    created: Optional[str]
    # Comment score, or None when unavailable.
    score: Optional[int]
    # Comment text ("" when missing).
    body: str

## Extracción

In [6]:
# Build the Reddit client and select the target subreddit.
reddit = make_reddit()
sub = reddit.subreddit(SUBREDDIT)


def _author_tag(item):
    """Return "u/<author>" for a post/comment, or None when it has no author."""
    author = getattr(item, "author", None)
    return f"u/{author}" if author else None


def _opt_int(item, name):
    """Return the attribute *name* of *item* as int, or None when missing/None."""
    value = getattr(item, name, None)
    return None if value is None else int(value)


# One listing builder per supported SORT value. Only "top" takes a time
# window; the others just take the post limit.
_listing_builders = {
    "hot": lambda: sub.hot(limit=MAX_POSTS),
    "new": lambda: sub.new(limit=MAX_POSTS),
    "rising": lambda: sub.rising(limit=MAX_POSTS),
    "top": lambda: sub.top(time_filter=TIME_FILTER, limit=MAX_POSTS),
}
if SORT not in _listing_builders:
    # Keep the best-effort fallback to "hot", but make it visible so a typo
    # in SORT does not silently produce a different dataset.
    print(f"[WARN] SORT={SORT!r} no reconocido; se usa 'hot'")
listing = _listing_builders.get(SORT, _listing_builders["hot"])()

# Accumulators for the scraped rows.
posts_rows: List[PostRow] = []
comments_rows: List[CommentRow] = []

# Walk the listing; each post yields one PostRow plus up to
# MAX_COMMENTS_PER_POST CommentRow entries.
for p in tqdm(listing, total=MAX_POSTS, desc=f"Posts r/{SUBREDDIT}"):
    rsleep()  # pause between posts (REQUEST_SLEEP_S)
    post_url = getattr(p, "url", "")
    sub_obj = getattr(p, "subreddit", None)

    pr = PostRow(
        post_id=p.id,
        title=getattr(p, "title", None),
        author=_author_tag(p),
        score=_opt_int(p, "score"),
        num_comments=_opt_int(p, "num_comments"),
        created=to_iso(getattr(p, "created_utc", None)),
        permalink=f"https://www.reddit.com{getattr(p, 'permalink', '')}",
        is_self=bool(getattr(p, "is_self", False)),
        image_urls=[post_url] if is_image_url(post_url) else [],
        selftext=(getattr(p, "selftext", None) or None),
        subreddit=str(sub_obj) if sub_obj else SUBREDDIT,
    )

    # Comments: drop the "load more" placeholders, flatten the tree and keep
    # only the first MAX_COMMENTS_PER_POST entries. A failure here is logged
    # and the post is still kept, just without comments.
    try:
        p.comments.replace_more(limit=0)
        com_list = p.comments.list()[:MAX_COMMENTS_PER_POST]
    except Exception as e:
        print(f"[WARN] Comentarios {e} en {pr.permalink}")
        com_list = []

    comments_rows.extend(
        CommentRow(
            post_id=pr.post_id,
            comment_id=getattr(c, "id", ""),
            author=_author_tag(c),
            created=to_iso(getattr(c, "created_utc", None)),
            score=_opt_int(c, "score"),
            body=getattr(c, "body", "") or "",
        )
        for c in com_list
    )

    posts_rows.append(pr)

print(f"Posts: {len(posts_rows)} | Comentarios: {len(comments_rows)}")
if len(posts_rows) < MIN_POSTS:
    # MIN_POSTS is the configured lower bound; report when the listing
    # returned fewer posts instead of ignoring the setting.
    print(f"[WARN] Solo se obtuvieron {len(posts_rows)} posts (MIN_POSTS={MIN_POSTS})")

Posts r/USCIS: 100%|██████████| 200/200 [05:57<00:00,  1.79s/it]

Posts: 200 | Comentarios: 15131





## Guardado (CSV + JSONL)

In [7]:
# Tabular views of the scraped rows (used for the CSV export).
posts_df = pd.DataFrame([asdict(p) for p in posts_rows])
comments_df = pd.DataFrame([asdict(c) for c in comments_rows])

# Output locations inside the raw-data directory.
posts_csv = RAW_DIR / "posts.csv"
comments_csv = RAW_DIR / "comments.csv"
posts_jsonl = RAW_DIR / "posts.jsonl"
comments_jsonl = RAW_DIR / "comments.jsonl"

# CSV export; "utf-8-sig" prepends a BOM to the file.
posts_df.to_csv(posts_csv, index=False, encoding="utf-8-sig")
comments_df.to_csv(comments_csv, index=False, encoding="utf-8-sig")


def _write_jsonl(path, rows):
    """Write one JSON object per line to *path*, one per dataclass row.

    Rows are serialized straight from the dataclasses rather than from the
    DataFrame: iterating DataFrame rows does not preserve dtypes, so an int
    column holding a None would be written as floats / ``NaN`` (not valid
    JSON) instead of ints / ``null``.
    """
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(asdict(row), ensure_ascii=False) + "\n" for row in rows
        )


_write_jsonl(posts_jsonl, posts_rows)
_write_jsonl(comments_jsonl, comments_rows)

print("Guardado en:", RAW_DIR)

Guardado en: data\raw


## Vista rápida

In [10]:
# Quick look: row counts, then the first rows of each table.
print(f"Posts: {len(posts_rows)} | Comentarios: {len(comments_rows)}")
for label, frame in (("Posts:", posts_df), ("Comentarios:", comments_df)):
    print(label)
    display(frame.head())

Posts: 200 | Comentarios: 15131
Posts:


Unnamed: 0,post_id,title,author,score,num_comments,created,permalink,is_self,image_urls,selftext,subreddit
0,1lcftjz,Got my mom her green card by enlisting in the ...,u/SuperiorT,4155,537,2025-06-16T00:52:02+00:00,https://www.reddit.com/r/USCIS/comments/1lcftj...,False,[],"Well that's a wrap. On June 12, 2025 my mom fi...",USCIS
1,1grmeq4,Today I became a US citizen,u/adepojus,3869,261,2024-11-15T02:45:34+00:00,https://www.reddit.com/r/USCIS/comments/1grmeq...,False,[https://i.redd.it/j2iwqcb0bz0e1.jpeg],I came into United States as an F-1 student in...,USCIS
2,1gkfbph,Today I became a US citizen,u/Asteroids19_9,3696,142,2024-11-05T19:40:39+00:00,https://www.reddit.com/r/USCIS/comments/1gkfbp...,False,[https://i.redd.it/ejtng9ozy4zd1.jpeg],I am a 19 year old student at college. It took...,USCIS
3,1glflxy,"So, what now? An immigration attorney perspect...",u/Honest-Grape-9352,2899,714,2024-11-07T02:01:16+00:00,https://www.reddit.com/r/USCIS/comments/1glflx...,True,[],"(Before I begin, I kindly ask that I not be DM...",USCIS
4,1ltlanr,Became a Citizen after 26 years!!,u/Ajax4557,2647,165,2025-07-07T04:44:32+00:00,https://www.reddit.com/r/USCIS/comments/1ltlan...,False,[https://i.redd.it/5d0ljp8jtdbf1.jpeg],,USCIS


Comentarios:


Unnamed: 0,post_id,comment_id,author,created,score,body
0,1lcftjz,my05pdb,u/DrummerHistorical493,2025-06-16T01:03:23+00:00,205,What a great son!
1,1lcftjz,my05mw5,u/Thedippyhoe,2025-06-16T01:02:57+00:00,211,Congratulations!!! Big hugs to your momma!\n\n...
2,1lcftjz,my0c8ez,u/WonderfulVariation93,2025-06-16T01:44:40+00:00,193,You are exempt from all future Mother’s Day gi...
3,1lcftjz,my08ip3,u/GeekNoy,2025-06-16T01:21:16+00:00,52,Congrats to your mom. You're an awesome son.
4,1lcftjz,my067yj,u/Greedy_Disaster_3130,2025-06-16T01:06:41+00:00,128,This is a great benefit offered to service mem...
