# In [None]:
library(httr)
library(jsonlite)
library(dplyr)


# --- 1. SETUP ---
# Reddit API credentials for the OAuth2 "client_credentials" grant.
# They are read from the REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET environment
# variables when those are set; the literals are only a fallback so existing
# runs keep working.
# NOTE(review): a secret embedded in source should be treated as exposed --
# consider rotating it and dropping the fallback values.
client_id <- Sys.getenv("REDDIT_CLIENT_ID", unset = "*******************")
client_secret <- Sys.getenv(
  "REDDIT_CLIENT_SECRET",
  unset = "yiHZ**Nw0eQcG**************"
)
user_agent <- "ShaheenMiningProject/1.0"

# Get Access Token
token_response <- POST(
  url = "https://www.reddit.com/api/v1/access_token",
  authenticate(client_id, client_secret),
  body = list(grant_type = "client_credentials"),
  encode = "form",
  add_headers(`User-Agent` = user_agent)
)

# Fail fast on a rejected request: without a token every later API call would
# fail, and the scraping loop would just stop without saying why.
stop_for_status(token_response, task = "obtain a Reddit access token")

token <- content(token_response, as = "parsed")$access_token
if (is.null(token) || !nzchar(token)) {
  stop(
    "Reddit returned no access_token; check the client id and secret.",
    call. = FALSE
  )
}

# --- 2. MULTI-SUBREDDIT LOOP ---
# For each subreddit, page through its /new listing (up to 10 pages of up to
# 100 posts) and collect one data frame per subreddit in `all_data_list`.
target_subreddits <- c("jobs", "cscareerquestions", "Futurology", "antiwork",
                       "singularity", "technology", "ArtistHate", "DataScience")

all_data_list <- list() # To store results from all subreddits

# Return `default` when a field is missing from the parsed JSON (NULL),
# otherwise the field itself.
field_or <- function(value, default) {
  if (is.null(value)) default else value
}

for (sub in target_subreddits) {
  # sep = "" keeps the name attached to the "r/" prefix (cat() would otherwise
  # insert a space between its arguments).
  cat("\n--- Starting Subreddit: r/", sub, " ---\n", sep = "")

  after_id <- NULL
  sub_posts <- list()

  # We pull 10 pages (1000 posts) per subreddit
  for (i in 1:10) {
    cat("  Fetching page", i, "for", sub, "...\n")

    request_url <- paste0(
      "https://oauth.reddit.com/r/", sub, "/new.json?limit=100"
    )
    if (!is.null(after_id)) {
      request_url <- paste0(request_url, "&after=", after_id)
    }

    # A transport error (timeout, DNS failure, ...) should not abort the whole
    # run and throw away the pages already collected.
    raw_data <- tryCatch(
      GET(
        url = request_url,
        add_headers(
          Authorization = paste("bearer", token),
          `User-Agent` = user_agent
        )
      ),
      error = function(e) {
        warning(
          "Request for r/", sub, " page ", i, " failed: ", conditionMessage(e),
          call. = FALSE, immediate. = TRUE
        )
        NULL
      }
    )
    if (is.null(raw_data)) break

    # Stop paging this subreddit on an HTTP error, but report it.
    if (status_code(raw_data) != 200) {
      warning(
        "r/", sub, " page ", i, " returned HTTP ", status_code(raw_data),
        "; keeping the pages fetched so far.",
        call. = FALSE, immediate. = TRUE
      )
      break
    }

    parsed_json <- content(raw_data, "parsed")

    # Extract data: one single-row data frame per post, with explicit
    # defaults for fields that are absent from the response.
    current_page_posts <- lapply(parsed_json$data$children, function(x) {
      post <- x$data
      data.frame(
        subreddit = sub,
        id = field_or(post$id, NA_character_),
        title = field_or(post$title, ""),
        text = field_or(post$selftext, ""),
        score = field_or(post$score, NA_integer_),
        # created_utc is seconds since the epoch; tz = "UTC" makes the value
        # print and export in UTC instead of the session's local time zone.
        created_utc = as.POSIXct(
          field_or(post$created_utc, NA_real_),
          origin = "1970-01-01",
          tz = "UTC"
        ),
        stringsAsFactors = FALSE
      )
    })

    sub_posts[[i]] <- bind_rows(current_page_posts)

    # Update pagination token; no token means there is no further page.
    after_id <- parsed_json$data$after
    if (is.null(after_id) || !nzchar(after_id)) break

    Sys.sleep(1.5) # Be kind to the API
  }

  all_data_list[[sub]] <- bind_rows(sub_posts)
}








# --- 3. COMBINE AND CLEAN ---
# Stack the per-subreddit data frames into one.
df_all <- bind_rows(all_data_list)

# If no page was fetched successfully the combined frame has no columns, and
# distinct() below would fail with an unrelated "column not found" error.
if (!"id" %in% names(df_all)) {
  stop(
    "No posts were scraped; check the access token and the API responses.",
    call. = FALSE
  )
}

# Keep the first row for each post id (removes duplicates)
df_all <- distinct(df_all, id, .keep_all = TRUE)

# --- 4. ADVANCED FILTERING ---
# Expanded keywords for AI anxiety and job displacement
keywords <- c(
  "AI", "artificial intelligence", "chatgpt", "llm", "automation",
  "layoff", "job cut", "replaced", "redundant", "fired", "unemployment",
  "displacement", "career anxiety", "future of work", "hiring freeze",
  "upskilling", "obsolete", "income", "UBI"
)

# Build one case-insensitive alternation, anchored at word boundaries.
# Matching the keywords as bare substrings makes the filter nearly useless:
# "AI" hits "said", "paid", "again", and "UBI" hits "dubious".
#   * Short acronyms (<= 3 characters) must match as whole words, with an
#     optional plural "s" ("AI", "LLMs").
#   * Longer keywords must start a word but may carry a suffix, so "layoff"
#     still matches "layoffs" and "job cut" still matches "job cuts".
is_short <- nchar(keywords) <= 3
keyword_patterns <- ifelse(
  is_short,
  paste0("\\b", keywords, "s?\\b"),
  paste0("\\b", keywords)
)

pattern <- paste(keyword_patterns, collapse = "|")

# Keep the posts whose title and body, taken together, match the keyword
# pattern (case-insensitive).
df_filtered <- filter(
  df_all,
  grepl(pattern, paste(title, text), ignore.case = TRUE)
)

# --- 5. SAVE ---
# Write the filtered posts to a CSV file in the working directory (without
# row names), then print how many posts were scraped and how many were kept.
output_path <- "reddit_ai_anxiety_data.csv"
write.csv(df_filtered, file = output_path, row.names = FALSE)

cat(
  "\nDone!\nTotal raw posts scraped:", nrow(df_all),
  "\nRelevant posts after filtering:", nrow(df_filtered)
)