# Extracción cruda de Reddit (API oficial)

Extrae posts y comentarios desde la API oficial de Reddit (mediante PRAW) y guarda los resultados tal cual se obtienen, campo por campo.

In [1]:
import json
import os
import re
import time
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Optional

import pandas as pd
import praw
import requests
from tqdm import tqdm

In [2]:
# --- Extraction parameters: edit these to change what gets downloaded ---
SUBREDDIT         = "USCIS"   # subreddit name, without the "r/" prefix
SORT              = "top"     # "hot", "new", "rising" or "top"; any other value falls back to "hot"
TIME_FILTER       = "year"    # only consulted when SORT == "top"
MIN_POSTS         = 20        # NOTE(review): not referenced by any visible cell
MAX_POSTS         = 200       # passed as `limit` to the subreddit listing
MAX_COMMENTS_PER_POST = 80    # comments kept per post after flattening the comment tree
DOWNLOAD_IMAGES   = False     # NOTE(review): not referenced by any visible cell
REQUEST_SLEEP_S   = 0.7       # pause length used by sleep()
OUT_DIR           = Path("./reddit_api_output")  # destination for the CSV / JSONL exports

# Create the output directory once; exist_ok=True keeps the cell safe to re-run.
OUT_DIR.mkdir(parents=True, exist_ok=True)

@dataclass
class PostRow:
    """One Reddit submission flattened to the fields exported to posts.csv / posts.jsonl."""
    post_id: str                 # submission id as returned by the API
    title: Optional[str]         # submission title
    author: Optional[str]        # "u/<name>", or None when the API reports no author
    score: Optional[int]         # submission score, None when the API value is None
    num_comments: Optional[int]  # comment count reported by the API (not the number of rows collected)
    created: Optional[str]       # ISO-8601 UTC timestamp built by to_iso(); None if conversion failed
    permalink: str               # absolute URL: "https://www.reddit.com" + API permalink
    is_self: bool                # value of the submission's `is_self` flag
    image_urls: List[str]        # [url] when the submission URL ends in .jpg/.jpeg/.png/.gif, else []
    selftext: Optional[str]      # post body; empty bodies are stored as None
    subreddit: Optional[str]     # subreddit name; falls back to the configured SUBREDDIT

@dataclass
class CommentRow:
    """One Reddit comment flattened to the fields exported to comments.csv / comments.jsonl."""
    post_id: str            # id of the submission this comment belongs to (matches PostRow.post_id)
    comment_id: str         # comment id; "" when the object has no `id` attribute
    author: Optional[str]   # "u/<name>", or None when the API reports no author
    created: Optional[str]  # ISO-8601 UTC timestamp built by to_iso(); None if conversion failed
    score: Optional[int]    # comment score, None when missing
    body: str               # comment text; "" when missing or empty


In [3]:
def make_reddit():
    """Build a `praw.Reddit` client from credentials stored in the environment.

    Reads:
        REDDIT_CLIENT_ID      -- required, OAuth client id of the Reddit app.
        REDDIT_CLIENT_SECRET  -- required, OAuth client secret of the Reddit app.
        REDDIT_USER_AGENT     -- optional, overrides the default user agent.

    Returns:
        The `praw.Reddit` instance created with those values.

    Raises:
        RuntimeError: if a required variable is unset or empty.

    SECURITY: credentials must never be committed inside the notebook. Any
    client id / secret that was previously hard-coded here should be treated
    as compromised and regenerated in the Reddit app settings.
    """
    required = ("REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET")
    missing = [name for name in required if not os.environ.get(name)]
    if missing:
        raise RuntimeError(
            "Missing Reddit API credentials; set environment variable(s): "
            + ", ".join(missing)
        )

    return praw.Reddit(
        client_id=os.environ["REDDIT_CLIENT_ID"],
        client_secret=os.environ["REDDIT_CLIENT_SECRET"],
        # Keep the historical user agent as the default so requests look the same.
        user_agent=os.environ.get("REDDIT_USER_AGENT") or "Fickle_Finish_9750",
    )

def is_image_url(u: str) -> bool:
    """Tell whether `u` ends in a .jpg/.jpeg/.png/.gif extension (case-insensitive).

    A trailing query string (``?...``) after the extension is tolerated.
    Falsy input such as ``None`` or ``""`` yields False.
    """
    candidate = u if u else ""
    found = re.search(r"\.(jpg|jpeg|png|gif)(?:\?.*)?$", candidate, flags=re.I)
    return found is not None

def to_iso(ts_utc: Optional[float]) -> Optional[str]:
    """Convert a Unix timestamp (seconds since the epoch) to an ISO-8601 UTC string.

    Args:
        ts_utc: epoch seconds, or None when the source object had no timestamp.

    Returns:
        A string such as "2025-06-16T00:52:02+00:00", or None when `ts_utc`
        is None or cannot be converted (wrong type, NaN, out of range).
    """
    if ts_utc is None:
        return None
    try:
        return datetime.fromtimestamp(ts_utc, tz=timezone.utc).isoformat()
    except (TypeError, ValueError, OverflowError, OSError):
        # Best-effort: an unusable timestamp must not abort the extraction.
        return None

def sleep():
    """Block for REQUEST_SLEEP_S seconds (the pause configured for requests)."""
    pause_seconds = REQUEST_SLEEP_S
    time.sleep(pause_seconds)


In [4]:
# Create the API client and resolve the target subreddit.
reddit = make_reddit()
sub = reddit.subreddit(SUBREDDIT)

# One zero-argument builder per supported SORT value, so that only the
# selected listing call is actually made.
listing_builders = {
    "hot": lambda: sub.hot(limit=MAX_POSTS),
    "new": lambda: sub.new(limit=MAX_POSTS),
    "rising": lambda: sub.rising(limit=MAX_POSTS),
    "top": lambda: sub.top(time_filter=TIME_FILTER, limit=MAX_POSTS),
}

# Unrecognised SORT values use the "hot" listing.
it = listing_builders.get(SORT, listing_builders["hot"])()

# Accumulators for the flattened rows; later cells turn them into DataFrames.
posts_rows, comments_rows = [], []

for submission in tqdm(it, total=MAX_POSTS, desc="Posts"):
    # Only a direct image link on the submission itself is recorded.
    if is_image_url(getattr(submission, "url", "")):
        img_urls = [submission.url]
    else:
        img_urls = []

    if submission.author:
        post_author = f"u/{submission.author}"
    else:
        post_author = None

    post_score = None if submission.score is None else int(submission.score)
    post_num_comments = (
        None if submission.num_comments is None else int(submission.num_comments)
    )
    post_created = to_iso(getattr(submission, "created_utc", None))
    post_subreddit = str(submission.subreddit) if submission.subreddit else SUBREDDIT

    post_row = PostRow(
        post_id=submission.id,
        title=submission.title,
        author=post_author,
        score=post_score,
        num_comments=post_num_comments,
        created=post_created,
        permalink=f"https://www.reddit.com{submission.permalink}",
        is_self=bool(submission.is_self),
        image_urls=img_urls,
        selftext=(submission.selftext or None),
        subreddit=post_subreddit,
    )

    # Comment retrieval is best-effort: on any failure the post is kept
    # with no comments and a warning is printed.
    try:
        submission.comments.replace_more(limit=0)
        flat_comments = submission.comments.list()
        com_list = flat_comments[:MAX_COMMENTS_PER_POST]
    except Exception as err:
        print("[WARN] Comentarios", err, "en", post_row.permalink)
        com_list = []

    for comment in com_list:
        body_text = getattr(comment, "body", "") or ""
        raw_score = getattr(comment, "score", None)
        comment_author = f"u/{comment.author}" if comment.author else None
        comment_row = CommentRow(
            post_id=post_row.post_id,
            comment_id=getattr(comment, "id", ""),
            author=comment_author,
            created=to_iso(getattr(comment, "created_utc", None)),
            score=None if raw_score is None else int(raw_score),
            body=body_text,
        )
        comments_rows.append(comment_row)

    posts_rows.append(post_row)

print(f"Posts: {len(posts_rows)} | Comentarios: {len(comments_rows)}")


Posts: 100%|██████████| 200/200 [04:02<00:00,  1.21s/it]

Posts: 200 | Comentarios: 15093





In [5]:
# Tabular views of the collected rows; used for the CSV export and for
# inspection in the cells below.
posts_df = pd.DataFrame([asdict(p) for p in posts_rows])
comments_df = pd.DataFrame([asdict(c) for c in comments_rows])

posts_csv = OUT_DIR / "posts.csv"
comments_csv = OUT_DIR / "comments.csv"
posts_jsonl = OUT_DIR / "posts.jsonl"
comments_jsonl = OUT_DIR / "comments.jsonl"

# utf-8-sig prepends a BOM (presumably so spreadsheet tools detect UTF-8).
posts_df.to_csv(posts_csv, index=False, encoding="utf-8-sig")
comments_df.to_csv(comments_csv, index=False, encoding="utf-8-sig")

# JSONL is written straight from the dataclass rows rather than through
# DataFrame.iterrows(): a DataFrame turns None in a numeric column into NaN
# (serialised as the invalid JSON token `NaN`) and the ints into floats,
# whereas asdict() keeps the values exactly as collected.
for jsonl_path, rows in ((posts_jsonl, posts_rows), (comments_jsonl, comments_rows)):
    with open(jsonl_path, "w", encoding="utf-8") as f:
        for row in rows:
            f.write(json.dumps(asdict(row), ensure_ascii=False) + "\n")

posts_csv, comments_csv, posts_jsonl, comments_jsonl


(PosixPath('reddit_api_output/posts.csv'),
 PosixPath('reddit_api_output/comments.csv'),
 PosixPath('reddit_api_output/posts.jsonl'),
 PosixPath('reddit_api_output/comments.jsonl'))

In [6]:
# Sanity check: (rows, columns) of each table; columns should match the dataclass field counts.
posts_df.shape, comments_df.shape

((200, 11), (15093, 6))

In [8]:
# Preview the first rows of the posts table.
posts_df.head()

Unnamed: 0,post_id,title,author,score,num_comments,created,permalink,is_self,image_urls,selftext,subreddit
0,1lcftjz,Got my mom her green card by enlisting in the ...,u/SuperiorT,4157,537,2025-06-16T00:52:02+00:00,https://www.reddit.com/r/USCIS/comments/1lcftj...,False,[],"Well that's a wrap. On June 12, 2025 my mom fi...",USCIS
1,1grmeq4,Today I became a US citizen,u/adepojus,3872,261,2024-11-15T02:45:34+00:00,https://www.reddit.com/r/USCIS/comments/1grmeq...,False,[https://i.redd.it/j2iwqcb0bz0e1.jpeg],I came into United States as an F-1 student in...,USCIS
2,1gkfbph,Today I became a US citizen,u/Asteroids19_9,3694,142,2024-11-05T19:40:39+00:00,https://www.reddit.com/r/USCIS/comments/1gkfbp...,False,[https://i.redd.it/ejtng9ozy4zd1.jpeg],I am a 19 year old student at college. It took...,USCIS
3,1glflxy,"So, what now? An immigration attorney perspect...",u/Honest-Grape-9352,2898,715,2024-11-07T02:01:16+00:00,https://www.reddit.com/r/USCIS/comments/1glflx...,True,[],"(Before I begin, I kindly ask that I not be DM...",USCIS
4,1ltlanr,Became a Citizen after 26 years!!,u/Ajax4557,2648,165,2025-07-07T04:44:32+00:00,https://www.reddit.com/r/USCIS/comments/1ltlan...,False,[https://i.redd.it/5d0ljp8jtdbf1.jpeg],,USCIS


In [9]:
# Preview the first rows of the comments table.
comments_df.head()

Unnamed: 0,post_id,comment_id,author,created,score,body
0,1lcftjz,my05pdb,u/DrummerHistorical493,2025-06-16T01:03:23+00:00,206,What a great son!
1,1lcftjz,my05mw5,u/Thedippyhoe,2025-06-16T01:02:57+00:00,212,Congratulations!!! Big hugs to your momma!\n\n...
2,1lcftjz,my0c8ez,u/WonderfulVariation93,2025-06-16T01:44:40+00:00,188,You are exempt from all future Mother’s Day gi...
3,1lcftjz,my08ip3,u/GeekNoy,2025-06-16T01:21:16+00:00,51,Congrats to your mom. You're an awesome son.
4,1lcftjz,my067yj,u/Greedy_Disaster_3130,2025-06-16T01:06:41+00:00,128,This is a great benefit offered to service mem...
