In [32]:
import pandas as pd
import praw
import json
from datetime import datetime
import os
from dotenv import load_dotenv

In [20]:
# Load variables from a .env file into the process environment.
# override=True lets values from the .env file replace variables that are
# already set in the environment. The call's return value is the cell output.
load_dotenv(override=True)

True

In [21]:
# Build the PRAW client from credentials kept in environment variables.
# Only the app-level settings (id, secret, user agent) are passed here.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET"),
    user_agent=os.environ.get("REDDIT_USER_AGENT"),
)

In [22]:
# Print the client's read_only flag as a quick check of how the session was set up
print(reddit.read_only)

True


In [38]:
# Pull the current "hot" listing from the combined subreddits and flatten
# each submission into a plain dict (one future DataFrame row per post).
def _to_record(post):
    """Return the fields of interest from a single submission as a dict."""
    poster = post.author
    if poster:
        author_name = poster.name
    else:
        # The author attribute is falsy here, so use a placeholder instead
        author_name = "[deleted]"

    return {
        "id": post.id,
        "title": post.title,
        "author": author_name,
        "subreddit": post.subreddit.display_name,
        "score": post.score,
        "num_comments": post.num_comments,
        "created_utc": post.created_utc,
        "url": post.url,
        "selftext": post.selftext,
    }


hot_posts = reddit.subreddit("ai+technology+politics").hot(limit=1000)

# Accumulate the records in listing order
data = []
for submission in hot_posts:
    data.append(_to_record(submission))

# Build the DataFrame once from the collected records
df = pd.DataFrame(data)

# Persist the raw extract. to_csv does not create missing directories, so make
# sure the target folder exists first; otherwise a fresh checkout would lose
# the whole fetch to an OSError at write time.
output_path = '../data/extracted_reddit.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# NOTE: with the default index=True the row index is written as an unnamed
# first column; kept as-is so existing readers of this file are unaffected.
df.to_csv(output_path)

# Preview the first rows
df.head()

Unnamed: 0,id,title,author,subreddit,score,num_comments,created_utc,url,selftext
0,1fm4tt7,Trump to women: Stop ‘thinking about abortion....,njdotcom,politics,6397,580,1726930000.0,https://www.nj.com/politics/2024/09/trump-to-w...,
1,1fm20ee,Vaporizing plastics recycles them into nothing...,AdSpecialist6598,technology,2282,354,1726922000.0,https://arstechnica.com/science/2024/09/vapori...,
2,1flzkzc,"Voter registration is spiking, particularly am...",Cute-Perception2335,politics,12442,543,1726913000.0,https://www.usatoday.com/story/news/politics/e...,
3,1fm1pb1,America sort of likes Kamala Harris – and that...,flintflower,politics,5761,572,1726921000.0,https://www.businessinsider.com/kamala-harris-...,
4,1fm0dyg,A dramatic rise in pregnant women dying in Tex...,redditofthebanned,politics,5609,320,1726916000.0,https://www.nbcnews.com/health/womens-health/t...,


In [43]:
# Inspect the frame's structure: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            20 non-null     object 
 1   title         20 non-null     object 
 2   author        20 non-null     object 
 3   subreddit     20 non-null     object 
 4   score         20 non-null     int64  
 5   num_comments  20 non-null     int64  
 6   created_utc   20 non-null     float64
 7   url           20 non-null     object 
 8   selftext      20 non-null     object 
dtypes: float64(1), int64(2), object(6)
memory usage: 1.5+ KB


In [44]:
# Interpret the numeric created_utc values as seconds since the epoch and
# store the resulting datetimes back in the same column.
created_at = pd.to_datetime(df['created_utc'], unit='s')
df['created_utc'] = created_at

# Preview the converted column alongside the rest of the frame
df.head()

Unnamed: 0,id,title,author,subreddit,score,num_comments,created_utc,url,selftext
0,1fm4tt7,Trump to women: Stop ‘thinking about abortion....,njdotcom,politics,6397,580,2024-09-21 14:54:23,https://www.nj.com/politics/2024/09/trump-to-w...,
1,1fm20ee,Vaporizing plastics recycles them into nothing...,AdSpecialist6598,technology,2282,354,2024-09-21 12:35:34,https://arstechnica.com/science/2024/09/vapori...,
2,1flzkzc,"Voter registration is spiking, particularly am...",Cute-Perception2335,politics,12442,543,2024-09-21 10:00:12,https://www.usatoday.com/story/news/politics/e...,
3,1fm1pb1,America sort of likes Kamala Harris – and that...,flintflower,politics,5761,572,2024-09-21 12:18:17,https://www.businessinsider.com/kamala-harris-...,
4,1fm0dyg,A dramatic rise in pregnant women dying in Tex...,redditofthebanned,politics,5609,320,2024-09-21 10:56:30,https://www.nbcnews.com/health/womens-health/t...,


In [42]:
# Number of collected submissions per subreddit
df['subreddit'].value_counts()

subreddit
politics      13
technology     7
Name: count, dtype: int64

In [41]:
# Show the full, untruncated URL of the row labelled 0
df.loc[0, 'url']

'https://www.nj.com/politics/2024/09/trump-to-women-youre-broke-and-depressed-but-i-can-make-you-happy.html'