In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('file_name.csv')

In [3]:
df.columns

Index(['Title', 'Political Lean', 'Score', 'Id', 'Subreddit', 'URL',
       'Num of Comments', 'Text', 'Date Created'],
      dtype='object')

In [4]:
df.Subreddit.unique()

array(['socialism', 'democrats', 'DemocraticSocialism', 'SocialDemocracy',
       'progressive', 'alltheleft', 'Liberal', 'feminisms', 'Communist',
       'RadicalFeminism', 'Libertarian', 'conservatives', 'Capitalism',
       'republicans', 'anarchocapitalism'], dtype=object)

# Scrape & Format Functions

In [14]:
import json
import os
import time

import requests

# Reddit API credentials are read from environment variables so that real
# secrets never have to be typed into (and saved with) the notebook. Each value
# falls back to an empty string when its variable is not set.
client_id = os.environ.get('REDDIT_CLIENT_ID', '')
client_secret = os.environ.get('REDDIT_CLIENT_SECRET', '')
# Sent as the User-Agent header of every request made below.
user_agent = 'personal_research_project_not_for_distribution.NLP:v1.0 (by /u/Practical_Year_8917)'
username = os.environ.get('REDDIT_USERNAME', '')
password = os.environ.get('REDDIT_PASSWORD', '')


def get_token(client_id, client_secret, user_agent, username, password):
    """Request an OAuth2 access token from Reddit using the password grant.

    Args:
        client_id: Reddit app client id, used as the HTTP basic-auth user.
        client_secret: Reddit app secret, used as the HTTP basic-auth password.
        user_agent: Value sent in the ``User-Agent`` header.
        username: Reddit account name sent in the form body.
        password: Reddit account password sent in the form body.

    Returns:
        The ``access_token`` value of the JSON response.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or is answered with an HTTP error status.
        KeyError: If the response carries no ``access_token``. The message
            includes the response's ``error`` field when there is one.
    """
    auth = requests.auth.HTTPBasicAuth(client_id, client_secret)
    data = {
        'grant_type': 'password',
        'username': username,
        'password': password
    }
    headers = {'User-Agent': user_agent}
    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.post('https://www.reddit.com/api/v1/access_token',
                             auth=auth, data=data, headers=headers,
                             timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()

    payload = response.json()
    token = payload.get('access_token') if isinstance(payload, dict) else None
    if token is None:
        # Report why authentication failed rather than a bare
        # KeyError('access_token').
        if isinstance(payload, dict):
            reason = payload.get('error', 'unknown error')
        else:
            reason = 'unexpected response format'
        raise KeyError(f"No access_token in Reddit response: {reason}")
    return token


# Function to get posts from a subreddit with error handling
def get_posts(subreddit, token, user_agent, limit=10, after=None):
    """Fetch one page of a subreddit's ``hot`` listing through the OAuth API.

    Args:
        subreddit: Subreddit name, interpolated into the request URL.
        token: Access token sent as ``Authorization: bearer <token>``.
        user_agent: Value sent in the ``User-Agent`` header.
        limit: Value of the ``limit`` query parameter.
        after: Pagination cursor. It is only sent when truthy, so ``None``
            (or an empty string) requests the first page.

    Returns:
        The ``data`` object of the JSON response. When the request fails or the
        response does not have the expected shape, an error is printed and
        ``{'children': [], 'after': None}`` is returned instead.
    """
    headers = {'Authorization': f'bearer {token}', 'User-Agent': user_agent}
    params = {'limit': limit}
    if after:
        params['after'] = after
    url = f'https://oauth.reddit.com/r/{subreddit}/hot'

    try:
        # Without a timeout a stalled connection would hang the scrape forever.
        response = requests.get(url, headers=headers, params=params, timeout=30)
        response.raise_for_status()
        time.sleep(0.6)  # Sleep to avoid hitting rate limits
        return response.json()['data']
    except requests.exceptions.RequestException as e:
        print(f"Error fetching posts for subreddit {subreddit}: {e}")
    except (KeyError, TypeError, ValueError) as e:
        # The body was not JSON or had no 'data' entry; treat it like a failed
        # request so callers still receive the empty-page shape.
        print(f"Unexpected response for subreddit {subreddit}: {e!r}")
    return {'children': [], 'after': None}

# Function to get comments from a post with error handling
def get_comments(post_id, token, user_agent):
    """Fetch the comment listing of a single post through the OAuth API.

    Args:
        post_id: Post id interpolated into the ``/comments/<id>`` URL.
        token: Access token sent as ``Authorization: bearer <token>``.
        user_agent: Value sent in the ``User-Agent`` header.

    Returns:
        The ``children`` list found under the second element of the JSON
        response. When the request fails or the response does not have that
        shape, an error is printed and an empty list is returned.
    """
    headers = {'Authorization': f'bearer {token}', 'User-Agent': user_agent}
    url = f'https://oauth.reddit.com/comments/{post_id}?limit=10'

    try:
        # Without a timeout a stalled connection would hang the scrape forever.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        time.sleep(0.6)  # Sleep to avoid hitting rate limits
        return response.json()[1]['data']['children']
    except requests.exceptions.RequestException as e:
        print(f"Error fetching comments for post {post_id}: {e}")
    except (IndexError, KeyError, TypeError, ValueError) as e:
        # The body was not JSON or lacked the [1]['data']['children'] path;
        # treat it like a failed request so one odd post cannot abort the run.
        print(f"Unexpected response for post {post_id}: {e!r}")
    return []


# Function to extract post and comment information and store it in a DataFrame
def extract_post_info(posts, token, user_agent):
    """Flatten posts and their comments into a DataFrame, one row per comment.

    The comments of every post are fetched with ``get_comments``. Each returned
    comment produces one row that repeats the post's fields and appends the
    comment's ``body`` (``None`` when the comment has no ``body`` key). A post
    for which no comments come back contributes no rows.

    Args:
        posts: Iterable of listing entries; each must have a ``data`` mapping.
        token: Access token, forwarded to ``get_comments``.
        user_agent: User-Agent string, forwarded to ``get_comments``.

    Returns:
        A DataFrame with the columns Title, Score, Id, Subreddit, URL,
        Num of Comments, Text, Date Created and Comment_Text.
    """
    columns = ['Title', 'Score', 'Id', 'Subreddit', 'URL', 'Num of Comments', 'Text', 'Date Created', 'Comment_Text']
    # Post keys in the same order as the first eight output columns.
    post_fields = ('title', 'score', 'id', 'subreddit', 'url',
                   'num_comments', 'selftext', 'created_utc')

    rows = []
    for entry in posts:
        info = entry['data']
        post_values = [info.get(field) for field in post_fields]
        for reply in get_comments(info.get('id'), token, user_agent):
            # .get() yields None for entries that carry no 'body'.
            rows.append(post_values + [reply['data'].get('body')])

    return pd.DataFrame(rows, columns=columns)


In [15]:
token = get_token(client_id, client_secret, user_agent, username, password)

In [16]:
token

'<redacted: Reddit OAuth access token>'

# Scraper Test

In [None]:
#politics, Democrats, socialism, progressive, GreenParty
#PoliticalDiscussion, NeutralPolitics, ChangeMyView, Ask_Politics, ModeratePolitics
#Republican, Conservative, Libertarian, Anarcho_Capitalism, The_Donald

In [40]:
# Smoke test: page through one subreddit's hot listing until at least
# `max_posts` posts are collected or the listing runs out.
subreddit = 'politics'
after = None  # pagination cursor taken from the previous page
post_count = 0
max_posts = 10  # Set the maximum number of posts you want to scrape
all_posts = []

while post_count < max_posts:
    posts_data = get_posts(subreddit, token, user_agent, limit=10, after=after)
    posts = posts_data['children']
    after = posts_data['after']
    if not posts:
        break

    all_posts.extend(posts)
    post_count += len(posts)

    if not after:
        # No cursor came back, so there is no next page. get_posts leaves out a
        # falsy `after`, which means another request would start over at the
        # first page and append duplicates until max_posts is reached.
        break

# Extract post and comment information and store it in a DataFrame
df = extract_post_info(all_posts, token, user_agent)
df['Date Created'] = pd.to_datetime(df['Date Created'], unit='s')  # Convert to datetime

In [41]:
df

Unnamed: 0,Title,Score,Id,Subreddit,URL,Num of Comments,Text,Date Created,Comment_Text
0,"/r/Politics' 2024 US Elections Live Thread, Pa...",25,1e9d7i4,politics,https://www.reddit.com/live/1db9knzhqzdfp/,327,,2024-07-22 12:36:02,"To sort this thread by 'best comments first', ..."
1,"/r/Politics' 2024 US Elections Live Thread, Pa...",25,1e9d7i4,politics,https://www.reddit.com/live/1db9knzhqzdfp/,327,,2024-07-22 12:36:02,There was an impressive marketing push / spend...
2,"/r/Politics' 2024 US Elections Live Thread, Pa...",25,1e9d7i4,politics,https://www.reddit.com/live/1db9knzhqzdfp/,327,,2024-07-22 12:36:02,https://x.com/mike_pence/status/18153883907846...
3,"/r/Politics' 2024 US Elections Live Thread, Pa...",25,1e9d7i4,politics,https://www.reddit.com/live/1db9knzhqzdfp/,327,,2024-07-22 12:36:02,Erie County PA resident here. The timeline of ...
4,"/r/Politics' 2024 US Elections Live Thread, Pa...",25,1e9d7i4,politics,https://www.reddit.com/live/1db9knzhqzdfp/,327,,2024-07-22 12:36:02,
...,...,...,...,...,...,...,...,...,...
77,All 50 Democratic party US state chairs back H...,16588,1e900lb,politics,https://www.reuters.com/world/us/all-50-democr...,1712,,2024-07-21 23:40:23,"\nAs a reminder, this subreddit [is for civil ..."
78,All 50 Democratic party US state chairs back H...,16588,1e900lb,politics,https://www.reuters.com/world/us/all-50-democr...,1712,,2024-07-21 23:40:23,I feel like Dem rank and file get the message:...
79,All 50 Democratic party US state chairs back H...,16588,1e900lb,politics,https://www.reuters.com/world/us/all-50-democr...,1712,,2024-07-21 23:40:23,"Damn, there is much less infighting than i tho..."
80,All 50 Democratic party US state chairs back H...,16588,1e900lb,politics,https://www.reuters.com/world/us/all-50-democr...,1712,,2024-07-21 23:40:23,There is FINALLY a sense of energy and hope ba...


# Production Scraper

In [30]:
# Candidate subreddits, five per row. Line breaks inside the brackets need no
# backslash continuation.
subreddit_list = [
    'politics', 'Democrats', 'socialism', 'progressive', 'GreenParty',
    'PoliticalDiscussion', 'NeutralPolitics', 'ChangeMyView', 'Ask_Politics', 'ModeratePolitics',
    'Republican', 'Conservative', 'Libertarian', 'Anarcho_Capitalism', 'The_Donald',
]

In [23]:
# Narrow the run to five subreddits. This reassignment replaces whatever list
# `subreddit_list` held before, so only these are scraped by the loop that
# iterates over it.
subreddit_list = ['PoliticalDiscussion', 'NeutralPolitics', 'ChangeMyView', 'Ask_Politics', 'ModeratePolitics']

In [24]:
len(subreddit_list)

5

In [25]:
# Collect hot posts for every subreddit in `subreddit_list` into one flat list.
max_posts = 200  # Set the maximum number of posts you want to scrape (per subreddit)
all_posts = []
for sub in subreddit_list:
    after = None  # pagination cursor; reset so each subreddit starts at page one
    post_count = 0
    print(sub)
    while post_count < max_posts:
        print(post_count)
        posts_data = get_posts(sub, token, user_agent, limit=10, after=after)
        posts = posts_data['children']
        after = posts_data['after']
        if not posts:
            break

        all_posts.extend(posts)
        post_count += len(posts)

        if not after:
            # No cursor came back, so this listing is exhausted. get_posts
            # leaves out a falsy `after`, which means another request would
            # start over at the first page and append duplicates until
            # max_posts is reached.
            break

PoliticalDiscussion
0
11
21
31
41
51
61
71
81
91
101
111
121
131
141
151
161
171
181
191
NeutralPolitics
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
ChangeMyView
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
Ask_Politics
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
ModeratePolitics
0
11
21
31
41
51
61
71
81
91
101
111
121
131
141
151
161
171
181
191


In [26]:
# Extract post and comment information and store it in a DataFrame.
# extract_post_info calls get_comments once per collected post (one HTTP
# request plus a 0.6 s pause each), so this cell is slow for large `all_posts`.
df = extract_post_info(all_posts, token, user_agent)
# 'Date Created' is filled from each post's `created_utc`; unit='s' parses the
# values as seconds since the epoch.
df['Date Created'] = pd.to_datetime(df['Date Created'], unit='s')  # Convert to datetime

In [27]:
df.head()

Unnamed: 0,Title,Score,Id,Subreddit,URL,Num of Comments,Text,Date Created,Comment_Text
0,Casual Questions Thread,27,1bwbuka,PoliticalDiscussion,https://www.reddit.com/r/PoliticalDiscussion/c...,2782,This is a place for the PoliticalDiscussion co...,2024-04-05 07:15:20,[A reminder for everyone](https://www.reddit.c...
1,Casual Questions Thread,27,1bwbuka,PoliticalDiscussion,https://www.reddit.com/r/PoliticalDiscussion/c...,2782,This is a place for the PoliticalDiscussion co...,2024-04-05 07:15:20,"I’m 30 years old. Growing up, while I obviousl..."
2,Casual Questions Thread,27,1bwbuka,PoliticalDiscussion,https://www.reddit.com/r/PoliticalDiscussion/c...,2782,This is a place for the PoliticalDiscussion co...,2024-04-05 07:15:20,[removed]
3,Casual Questions Thread,27,1bwbuka,PoliticalDiscussion,https://www.reddit.com/r/PoliticalDiscussion/c...,2782,This is a place for the PoliticalDiscussion co...,2024-04-05 07:15:20,How many of the proposals in P2025 become laws...
4,Casual Questions Thread,27,1bwbuka,PoliticalDiscussion,https://www.reddit.com/r/PoliticalDiscussion/c...,2782,This is a place for the PoliticalDiscussion co...,2024-04-05 07:15:20,


In [28]:
# Persist the scraped rows. `index` is not passed, so pandas' default applies
# and the row index is written as an unnamed first column of the file.
df.to_csv('reddit_scrape_aug18_center.csv')

In [25]:
# Print up to the first seven comments stored for one post. The selection is
# positional, so it works whatever index labels the matching rows carry and
# does not fail when the post has fewer than seven rows.
post_comments = df.loc[df.Title == 'WHAT JOE BIDEN HAS DONE', 'Comment_Text']
for comment_text in post_comments.head(7):
    print(comment_text)

So many accomplishments. These need to be more widely promoted/talked about

And with the new economic report coming out, that once again the economy is actually doing good and were (again) not going into a recession, their new take is now we're in a "Vibecession" lol. Biden and the Democrats just can't win against these "analysts" and mainstream media

Seems like every year since Biden got elected republicans and the media been trying to stoke fear in a recession happening right around the corner, yet every year seems like Democratic policies are working (and if anything, progress is being hindered by republicans and "neutral" people like Jerome Powell)
Campaign needs a nicely presented PDF of this. Mailer/brochure too
Just the recent $5 BILLION  package for infrastructure does so much...

$1 Billion to replace the Blatnik Bridge connecting WI - MN

$600 million to replace the I-5 Bridge between Vancouver, Washington, and Portland, Oregon, with an earthquake-resistant, multimodal brid

In [14]:
df.head(5)

Unnamed: 0,Title,Score,Id,Subreddit,URL,Num of Comments,Text,Date Created
0,MOD ANNOUNCEMENT: Regarding violent rhetoric,104,1e2s6b8,LateStageCapitalism,https://www.reddit.com/r/LateStageCapitalism/c...,18,"Hello, comrades. This is your mod team speakin...",2024-07-14 03:08:49
1,"Rule 6 ""no lesser evil"" rhetoric - is it accel...",441,1bt0nag,LateStageCapitalism,https://www.reddit.com/r/LateStageCapitalism/c...,2,Reposting the answer given to a user who was a...,2024-04-01 10:56:20
2,Why tho,948,1e6oswt,LateStageCapitalism,https://i.redd.it/ebs6gi1vycdd1.png,17,,2024-07-18 23:03:27
3,Man this shit is depressing.,5486,1e6a65i,LateStageCapitalism,https://i.redd.it/mvzo7rsdt9dd1.jpeg,106,,2024-07-18 12:27:20
4,Biden appears to forget the name of his own Se...,1788,1e6f7av,LateStageCapitalism,https://www.mediaite.com/tv/biden-appears-to-f...,179,,2024-07-18 16:10:21
