# Set up (re-run about every 2 hours, when the OAuth token expires)

In [None]:
import pandas as pd
import datetime

In [None]:
import requests

# The first HTTPBasicAuth argument is the app's 'personal use script' id and
# the second is its secret ('token'). The values below are placeholders; load
# real credentials from the environment instead of committing them.
auth = requests.auth.HTTPBasicAuth('user id', 'user sec key')

# here we pass our login method (password), username, and password
data = {'grant_type': 'password',
        'username': 'username',
        'password': 'password'}

# setup our header info, which gives reddit a brief description of our app
headers = {'User-Agent': 'MyBot/0.0.1'}

# send our request for an OAuth token
res = requests.post('https://www.reddit.com/api/v1/access_token',
                    auth=auth, data=data, headers=headers)
# fail fast on HTTP-level errors instead of hitting a confusing KeyError below
res.raise_for_status()

# convert response to JSON and pull access_token value
token_payload = res.json()
if 'access_token' not in token_payload:
    # NOTE(review): a rejected login may still come back as a JSON error body
    # without a token - surface that body rather than a bare KeyError.
    raise RuntimeError(f"Reddit did not return an access token: {token_payload}")
TOKEN = token_payload['access_token']

# add authorization to our headers dictionary
headers = {**headers, 'Authorization': f"bearer {TOKEN}"}

# while the token is valid (~2 hours) we just add headers=headers to our requests;
# this last expression is displayed as the cell output (expect <Response [200]>)
requests.get('https://oauth.reddit.com/api/v1/me', headers=headers)

<Response [200]>

In [None]:
# Fetch the current "hot" listing for r/python and tabulate it.
# This request carries no time range: the only parameter sent is `limit`.
# NOTE(review): Reddit's API docs list 100 as the maximum `limit`, and the
# displayed result has about 100 rows, so 500 appears to be clamped - confirm.
res = requests.get("https://oauth.reddit.com/r/python/hot", headers=headers, params={'limit':500})
res.raise_for_status()  # stop here on an expired token or other HTTP error

# Build every row first and create the frame once: DataFrame.append was removed
# in pandas 2.0 and re-copied the whole frame on each call.
#
# The timestamp is converted with pandas rather than `datetime.fromtimestamp`:
# the notebook's first cell does `import datetime` (the module), so that call
# raises AttributeError on a fresh kernel. utc=True also makes the trailing 'Z'
# truthful; a bare fromtimestamp() renders the machine's local time.
records = [
    {
        'subreddit': post['data']['subreddit'],
        'title': post['data']['title'],
        'selftext': post['data']['selftext'],
        'upvote_ratio': post['data']['upvote_ratio'],
        'ups': post['data']['ups'],
        'downs': post['data']['downs'],
        'score': post['data']['score'],
        'link_flair_css_class': post['data']['link_flair_css_class'],
        'created_utc': pd.to_datetime(
            post['data']['created_utc'], unit='s', utc=True
        ).strftime('%Y-%m-%dT%H:%M:%SZ'),
        'id': post['data']['id'],
        'kind': post['kind'],
    }
    for post in res.json()['data']['children']
]

df = pd.DataFrame(records)

In [None]:
# Display the posts collected into `df` (defined by the cell that queries the hot listing).
df

Unnamed: 0,subreddit,title,selftext,upvote_ratio,ups,downs,score,link_flair_css_class,created_utc,id,kind
0,Python,Sunday Daily Thread: What's everyone working o...,# Weekly Thread: What's Everyone Working On Th...,0.99,9,0,9,daily-thread,2023-12-17T00:00:09Z,18k47x7,t3
1,Python,Saturday Daily Thread: Resource Request and Sh...,# Weekly Thread: Resource Request and Sharing ...,1.00,1,0,1,daily-thread,2023-12-16T00:00:19Z,18je37p,t3
2,Python,Polars 0.20 released. Next release will be 1.0.,,0.98,322,0,322,news,2023-12-16T22:22:16Z,18k282p,t3
3,Python,Turn Your Python Project Into A Website in 6 M...,,0.62,7,0,7,tutorial,2023-12-17T14:18:34Z,18ki9wj,t3
4,Python,"Hi, this is a 2 minute survey about GUI librar...",,0.60,2,0,2,discussion,2023-12-17T19:59:28Z,18kpqc7,t3
...,...,...,...,...,...,...,...,...,...,...,...
97,Python,Working on updating the PyDev debugger to use ...,"Ok, so, I'm updating the PyDev debugger to use...",0.86,10,0,10,news,2023-12-09T10:48:42Z,18eb2aq,t3
98,Python,Balderdash LSTM : Python AI Game,"Howdy,\n\nBalderdash is a game where you try t...",1.00,2,0,2,intermediate-showcase,2023-12-09T15:01:55Z,18efddn,t3
99,Python,Built a Python script to get Spotify stats!,"Hey r/Python, \n\nI wrote a Python script whil...",0.89,24,0,24,beginner-showcase,2023-12-08T22:25:23Z,18dyf8o,t3
100,Python,Python 3.12.1 Released,,0.97,259,0,259,news,2023-12-08T02:18:46Z,18dc796,t3


# Test: how to set a start and end date


In [None]:
# Tried to use Pushshift, but it failed - it seems to require moderator access.
# Tried Reddit's before/after time parameters, but they appear to have been removed.

#Full


In [None]:
import requests
import pandas as pd
from datetime import datetime

# we use this function to convert responses to dataframes
def df_from_response(res):
    """Convert a Reddit listing response into a DataFrame with one row per post.

    Parameters
    ----------
    res : object exposing ``.json()``
        Response whose JSON body looks like
        ``{'data': {'children': [{'kind': ..., 'data': {...}}, ...]}}``.

    Returns
    -------
    pandas.DataFrame
        Columns ``subreddit, title, selftext, upvote_ratio, ups, downs, score,
        link_flair_css_class, created_utc, id, kind`` in that order.
        ``created_utc`` is a UTC string formatted ``%Y-%m-%dT%H:%M:%SZ``.
        An empty listing yields an empty frame that still has these columns.

    Raises
    ------
    KeyError
        If the body or one of its children lacks an expected field.
    """
    # local import: the enclosing cell only imports the `datetime` class
    from datetime import timezone

    columns = [
        'subreddit', 'title', 'selftext', 'upvote_ratio', 'ups', 'downs',
        'score', 'link_flair_css_class', 'created_utc', 'id', 'kind',
    ]

    records = []
    for child in res.json()['data']['children']:
        post = child['data']
        # tz-aware conversion: the trailing 'Z' promises UTC, whereas a bare
        # fromtimestamp() would render the machine's local time
        created = datetime.fromtimestamp(post['created_utc'], tz=timezone.utc)
        records.append({
            'subreddit': post['subreddit'],
            'title': post['title'],
            'selftext': post['selftext'],
            'upvote_ratio': post['upvote_ratio'],
            'ups': post['ups'],
            'downs': post['downs'],
            'score': post['score'],
            'link_flair_css_class': post['link_flair_css_class'],
            'created_utc': created.strftime('%Y-%m-%dT%H:%M:%SZ'),
            'id': post['id'],
            'kind': child['kind'],
        })

    # DataFrame.append was removed in pandas 2.0 (and copied the frame on every
    # call), so the frame is built once from the collected records
    return pd.DataFrame(records, columns=columns)

# authenticate API (the id/secret/username/password values are placeholders)
client_auth = requests.auth.HTTPBasicAuth('user id', 'user sec key')
auth_payload = {
    'grant_type': 'password',
    'username': 'username',
    'password': 'password'
}

headers = {'User-Agent': 'myBot/0.0.1'}

# send authentication request for OAuth token
res = requests.post('https://www.reddit.com/api/v1/access_token',
                    auth=client_auth, data=auth_payload, headers=headers)
res.raise_for_status()  # surface a failed login here, not as a KeyError below
# extract token from response and format correctly
TOKEN = f"bearer {res.json()['access_token']}"
# update API headers with authorization (bearer token)
headers = {**headers, 'Authorization': TOKEN}

# request parameters and per-page results
params = {'limit': 50}
batches = []

# fetch up to 10 pages of 50 posts each (at most 500 posts)
for i in range(10):
    # make request
    res = requests.get("https://oauth.reddit.com/r/python/new",
                       headers=headers,
                       params=params)
    res.raise_for_status()

    # get dataframe from response
    new_df = df_from_response(res)
    if new_df.empty:
        # listing exhausted: there is no last row to page from
        break

    # take the final row (oldest entry)
    row = new_df.iloc[-1]
    # create fullname
    fullname = row['kind'] + '_' + row['id']
    # add/update fullname in params so the next request continues after it
    params['after'] = fullname

    batches.append(new_df)

# DataFrame.append was removed in pandas 2.0; concatenate the pages once
data = pd.concat(batches, ignore_index=True) if batches else pd.DataFrame()

In [None]:
# Show the last rows of `data`, the frame assembled by the paging loop in the previous cell.
data.tail()

Unnamed: 0,subreddit,title,selftext,upvote_ratio,ups,downs,score,link_flair_css_class,created_utc,id,kind
495,Python,Monday Daily Thread: Project ideas!,# Weekly Thread: Project Ideas 💡\n\nWelcome to...,0.67,3,0,3,daily-thread,2023-11-13T00:00:10Z,17txnh4,t3
496,Python,Python Threading: 7-Day Crash Course,,0.91,169,0,169,tutorial,2023-11-12T22:45:47Z,17tw12h,t3
497,Python,🏈 Enhance Your Fantasy Football Strategy with ...,"Hey Dev Squad,\n\nReady to elevate your fantas...",0.56,2,0,2,beginner-showcase,2023-11-12T22:13:58Z,17tvbq7,t3
498,Python,pictriage: a lightweight image folder organizer,"I made pictriage, which is a fast way to visua...",0.82,12,0,12,intermediate-showcase,2023-11-12T20:13:30Z,17tsl0n,t3
499,Python,Document Your Python Code and Projects With Ch...,,0.45,0,0,0,tutorial,2023-11-12T18:23:55Z,17tq7av,t3


#..

In [None]:
import requests
import pandas as pd
from datetime import datetime
import time  # Import the time module


# we use this function to convert responses to dataframes
def df_from_response(res):
    """Convert a Reddit listing response into a DataFrame with one row per post.

    Parameters
    ----------
    res : object exposing ``.json()``
        Response whose JSON body looks like
        ``{'data': {'children': [{'kind': ..., 'data': {...}}, ...]}}``.

    Returns
    -------
    pandas.DataFrame
        Columns ``subreddit, title, selftext, upvote_ratio, ups, downs, score,
        link_flair_css_class, created_utc, id, kind`` in that order.
        ``created_utc`` is a UTC string formatted ``%Y-%m-%dT%H:%M:%SZ``.
        An empty listing yields an empty frame that still has these columns.

    Raises
    ------
    KeyError
        If the body or one of its children lacks an expected field.
    """
    # local import: the enclosing cell only imports the `datetime` class
    from datetime import timezone

    columns = [
        'subreddit', 'title', 'selftext', 'upvote_ratio', 'ups', 'downs',
        'score', 'link_flair_css_class', 'created_utc', 'id', 'kind',
    ]

    records = []
    for child in res.json()['data']['children']:
        post = child['data']
        # tz-aware conversion: the trailing 'Z' promises UTC, whereas a bare
        # fromtimestamp() would render the machine's local time
        created = datetime.fromtimestamp(post['created_utc'], tz=timezone.utc)
        records.append({
            'subreddit': post['subreddit'],
            'title': post['title'],
            'selftext': post['selftext'],
            'upvote_ratio': post['upvote_ratio'],
            'ups': post['ups'],
            'downs': post['downs'],
            'score': post['score'],
            'link_flair_css_class': post['link_flair_css_class'],
            'created_utc': created.strftime('%Y-%m-%dT%H:%M:%SZ'),
            'id': post['id'],
            'kind': child['kind'],
        })

    # DataFrame.append was removed in pandas 2.0 (and copied the frame on every
    # call), so the frame is built once from the collected records
    return pd.DataFrame(records, columns=columns)

# authenticate API (the id/secret/username/password values are placeholders)
client_auth = requests.auth.HTTPBasicAuth('user id', 'user sec key')
auth_payload = {
    'grant_type': 'password',
    'username': 'username',
    'password': 'password'
}

headers = {'User-Agent': 'myBot/0.0.1'}

# send authentication request for OAuth token
res = requests.post('https://www.reddit.com/api/v1/access_token',
                    auth=client_auth, data=auth_payload, headers=headers)
res.raise_for_status()  # surface a failed login here, not as a KeyError below
# extract token from response and format correctly
TOKEN = f"bearer {res.json()['access_token']}"
# update API headers with authorization (bearer token)
headers = {**headers, 'Authorization': TOKEN}

# request parameters and per-page results
params = {'limit': 50}
batches = []

# Set the start and finish dates
start_date = datetime(2023, 10, 1)  # replace with your start date
# midnight at the start of the 29th, so posts later that day are excluded
end_date = datetime(2023, 10, 29)  # replace with your end date
reddit = 'python'
timestamp_format = '%Y-%m-%dT%H:%M:%SZ'

# fetch up to 15 pages of 50 posts each (at most 750 posts), newest first
for i in range(15):
    # make request
    res = requests.get(f"https://oauth.reddit.com/r/{reddit}/new",
                       headers=headers,
                       params=params)
    res.raise_for_status()

    # get dataframe from response
    new_df = df_from_response(res)
    if new_df.empty:
        # listing exhausted: there is no last row to inspect or page from
        break

    # Keep the page BEFORE deciding whether to stop: the page that crosses
    # start_date still holds in-range posts, and the filter below trims the rest.
    batches.append(new_df)

    # take the final row (oldest entry)
    row = new_df.iloc[-1]
    last_post_date = datetime.strptime(row['created_utc'], timestamp_format)
    if last_post_date < start_date:
        break  # everything further down the listing is older than start_date

    # create fullname and continue after it on the next request
    fullname = row['kind'] + '_' + row['id']
    params['after'] = fullname

if batches:
    # DataFrame.append was removed in pandas 2.0; concatenate the pages once
    data = pd.concat(batches, ignore_index=True)
    # Filter posts within the specified date range
    in_range = data['created_utc'].apply(
        lambda x: start_date <= datetime.strptime(x, timestamp_format) <= end_date
    )
    filtered_data = data[in_range]
else:
    data = pd.DataFrame()
    filtered_data = data

# Persist the date-filtered posts. The original wrote the unfiltered `data`,
# leaving `filtered_data` computed but unused.
filtered_data.to_csv('reddit_posts.csv', index=False)

In [None]:
# The maximum is 1000 posts from any listing URL, such as top / new.

#testing praw


In [None]:
import praw
import pandas as pd
from datetime import datetime

# local import: this cell only imports the `datetime` class
from datetime import timezone

# Reddit API credentials (placeholder values)
reddit = praw.Reddit(client_id='user id',
                     client_secret='user sec key',
                     username='username',
                     password='password',
                     user_agent='myBot/0.0.1')

# Subreddit and date range (naive datetimes, compared against UTC post times)
subreddit_name = 'python'
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 3, 1)

# Collect rows in a list; DataFrame.append was removed in pandas 2.0
records = []

# Fetch posts using PRAW. The "new" listing is walked newest-first, so posts
# newer than end_date come first and must be skipped, not treated as the end.
for submission in reddit.subreddit(subreddit_name).new(limit=None):
    # naive UTC datetime, comparable with start_date / end_date
    created = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc).replace(tzinfo=None)

    if created > end_date:
        continue  # not yet down to the requested window
    if created < start_date:
        break  # past the window: everything after this is older still

    records.append({
        'subreddit': submission.subreddit.display_name,
        'title': submission.title,
        'selftext': submission.selftext,
        'upvote_ratio': submission.upvote_ratio,
        'ups': submission.ups,
        'downs': submission.downs,
        'score': submission.score,
        'link_flair_css_class': submission.link_flair_css_class,
        'created_utc': created.strftime('%Y-%m-%dT%H:%M:%SZ'),
        'id': submission.id
    })

# Now, 'data' contains only the posts within the specified date range.
# NOTE(review): the notebook records that a listing returns at most ~1000
# posts, so a window older than that may still come back empty - confirm.
data = pd.DataFrame(records)

# Now, 'data' contains only the posts within the specified date range.


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



In [None]:
import praw
import pandas as pd
from datetime import datetime

def fetch_posts(subreddit, start_date, end_date):
    """Collect posts from a subreddit's "new" listing within a date window.

    Parameters
    ----------
    subreddit : str
        Subreddit name, passed to ``reddit.subreddit(...)``.
    start_date, end_date : datetime
        Naive datetimes bounding the window (inclusive), compared against each
        post's creation time expressed in UTC.

    Returns
    -------
    pandas.DataFrame
        One row per in-range post with ``subreddit, title, selftext,
        upvote_ratio, ups, downs, score, link_flair_css_class, created_utc, id``.
        ``created_utc`` is a string formatted ``%Y-%m-%dT%H:%M:%SZ``.
        The frame is empty (no columns) when nothing falls inside the window.
    """
    # local import: the enclosing cell only imports the `datetime` class
    from datetime import timezone

    # credentials are placeholder values
    reddit = praw.Reddit(
        client_id='user id',
        client_secret='user sec key',
        username='username',
        password='password',
        user_agent='myBot/0.0.1'
    )

    records = []

    # The listing is walked newest-first. The original broke out on the first
    # out-of-range post, which is the very first (newest) one whenever end_date
    # lies in the past, so it always returned an empty frame.
    for submission in reddit.subreddit(subreddit).new(limit=None):
        # naive UTC datetime, comparable with start_date / end_date
        created = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc).replace(tzinfo=None)

        if created > end_date:
            continue  # newer than the window: keep walking down the listing
        if created < start_date:
            break  # older than the window: nothing further can match

        records.append({
            'subreddit': submission.subreddit.display_name,
            'title': submission.title,
            'selftext': submission.selftext,
            'upvote_ratio': submission.upvote_ratio,
            'ups': submission.ups,
            'downs': submission.downs,
            'score': submission.score,
            'link_flair_css_class': submission.link_flair_css_class,
            'created_utc': created.strftime('%Y-%m-%dT%H:%M:%SZ'),
            'id': submission.id
        })

    # DataFrame.append was removed in pandas 2.0; build the frame once
    return pd.DataFrame(records)

# Set the start and finish dates
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 3, 1)
reddit_subreddit = 'python'

# Fetch posts
data = fetch_posts(reddit_subreddit, start_date, end_date)

# Print the first few rows of the data
print(data.head())


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Empty DataFrame
Columns: []
Index: []


#another one

In [None]:
# Basic-auth pair for the token request (placeholder id / secret).
client_auth = requests.auth.HTTPBasicAuth('user id', 'user sec key')
# Form fields for the password grant (placeholder account values).
data = dict(
    grant_type='password',
    username='username',
    password='password',
)

In [None]:
import requests
import pandas as pd
from datetime import datetime, timedelta

def df_from_response(res):
    """Convert a Reddit listing response into a DataFrame with one row per post.

    Parameters
    ----------
    res : object exposing ``.json()``
        Response whose JSON body looks like
        ``{'data': {'children': [{'kind': ..., 'data': {...}}, ...]}}``.

    Returns
    -------
    pandas.DataFrame
        Columns ``subreddit, title, selftext, upvote_ratio, ups, downs, score,
        link_flair_css_class, created_utc, id, kind`` in that order.
        ``created_utc`` is a UTC string formatted ``%Y-%m-%dT%H:%M:%SZ``.
        An empty listing yields an empty frame that still has these columns.

    Raises
    ------
    KeyError
        If the body or one of its children lacks an expected field.
    """
    # local import: the enclosing cell does not import `timezone`
    from datetime import timezone

    columns = [
        'subreddit', 'title', 'selftext', 'upvote_ratio', 'ups', 'downs',
        'score', 'link_flair_css_class', 'created_utc', 'id', 'kind',
    ]

    records = []
    for child in res.json()['data']['children']:
        post = child['data']
        # tz-aware conversion: the trailing 'Z' promises UTC, whereas a bare
        # fromtimestamp() would render the machine's local time
        created = datetime.fromtimestamp(post['created_utc'], tz=timezone.utc)
        records.append({
            'subreddit': post['subreddit'],
            'title': post['title'],
            'selftext': post['selftext'],
            'upvote_ratio': post['upvote_ratio'],
            'ups': post['ups'],
            'downs': post['downs'],
            'score': post['score'],
            'link_flair_css_class': post['link_flair_css_class'],
            'created_utc': created.strftime('%Y-%m-%dT%H:%M:%SZ'),
            'id': post['id'],
            'kind': child['kind'],
        })

    # DataFrame.append was removed in pandas 2.0 (and copied the frame on every
    # call), so the frame is built once from the collected records
    return pd.DataFrame(records, columns=columns)

# local import: this cell's import line does not bring in `timezone`
from datetime import timezone

# Client and User credentials (placeholder values)
client_id = 'user id'
client_secret = 'user sec key'
username = 'username'
password = 'password'

# User Agent
headers = {'User-Agent': 'myBot/0.0.1'}

# OAuth token retrieval
client_auth = requests.auth.HTTPBasicAuth(client_id, client_secret)

# Setting up the request data
data = {
    'grant_type': 'password',
    'username': username,
    'password': password
}

# Send authentication request for OAuth token
res = requests.post('https://www.reddit.com/api/v1/access_token', auth=client_auth, data=data, headers=headers)
res.raise_for_status()

# Extract token from response and update the headers
TOKEN = f"bearer {res.json()['access_token']}"
headers['Authorization'] = TOKEN

# Parameters for request
params = {'limit': 100}

# Time range setup. Both bounds stay fixed for the final filter. The original
# decremented end_date inside its slicing loop until it fell below start_date,
# so `.between(start_date, end_date)` could never match a row.
reddit = 'python'
end_date = datetime.now(timezone.utc).replace(tzinfo=None)  # naive "now" in UTC
start_date = datetime(2023, 1, 1)
timestamp_format = '%Y-%m-%dT%H:%M:%SZ'

# Walk the listing once, newest first, using only the `after` cursor.
# The time-slice loop is gone: Reddit documents `before`/`after` as item
# fullnames, so the original float timestamp in `before` could not select a
# time window, and the stale `after` value leaked from one slice to the next.
batches = []
for i in range(10):  # 10 pages x 100 posts
    res = requests.get(f"https://oauth.reddit.com/r/{reddit}/new", headers=headers, params=params)
    res.raise_for_status()

    new_slice = df_from_response(res)
    if new_slice.empty:
        break  # listing exhausted

    batches.append(new_slice)

    # the last row of a page is its oldest post
    oldest = datetime.strptime(new_slice['created_utc'].iloc[-1], timestamp_format)
    if oldest < start_date:
        break  # everything further down is older than the window

    params['after'] = new_slice['kind'].iloc[-1] + '_' + new_slice['id'].iloc[-1]

if batches:
    # DataFrame.append was removed in pandas 2.0; concatenate the pages once
    all_data = pd.concat(batches, ignore_index=True)
    created = all_data['created_utc'].apply(
        lambda x: datetime.strptime(x, timestamp_format)
    )
    filtered_data = all_data[created.between(start_date, end_date)]
else:
    all_data = pd.DataFrame()
    filtered_data = all_data

print(f"Total posts retrieved: {len(filtered_data)}")

  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.ap

Total posts retrieved: 0


  all_data = all_data.append(slice_data, ignore_index=True)


# Another one


In [None]:
import requests
import pandas as pd
from datetime import datetime

def get_pushshift_data(subreddit, before_timestamp):
    """Fetch one batch of submissions from the Pushshift search endpoint.

    Parameters
    ----------
    subreddit : str
        Subreddit name sent as the ``subreddit`` query parameter.
    before_timestamp : int
        Value sent as the ``before`` query parameter.

    Returns
    -------
    list
        The ``'data'`` entry of the JSON response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Pass the query through `params` so values are URL-encoded by requests
    # instead of being pasted into the URL verbatim.
    response = requests.get(
        "https://api.pushshift.io/reddit/search/submission/",
        params={'subreddit': subreddit, 'size': 100, 'before': before_timestamp},
        timeout=30,  # do not let one stalled request hang the paging loop
    )
    response.raise_for_status()  # This will raise an exception if there is an issue with the request
    return response.json()['data']

# local import: this cell only imports the `datetime` class
from datetime import timezone

# Initial setup (naive datetimes, treated as UTC throughout)
subreddit = 'python'
end_date = datetime(2023, 1, 1)
start_date = datetime(2022, 1, 1)  # Example start date: January 1st, 2022

# Convert end date to a UTC timestamp. A naive datetime's .timestamp() assumes
# the machine's local zone, which would disagree with the UTC conversions used
# below, so the UTC zone is attached explicitly first.
before_timestamp = int(end_date.replace(tzinfo=timezone.utc).timestamp())

post_columns = ['subreddit', 'title', 'selftext', 'created_utc', 'score', 'num_comments', 'id']
all_posts = []

# Fetch posts from Pushshift in batches until you reach the start_date
while True:
    cursor_time = datetime.fromtimestamp(before_timestamp, tz=timezone.utc).replace(tzinfo=None)
    print(f"Fetching posts before: {cursor_time}")
    posts_batch = get_pushshift_data(subreddit, before_timestamp)

    # If there are no more posts returned by the API, exit the loop
    if not posts_batch:
        break

    # Extract relevant data from each post
    all_posts.extend({name: post.get(name) for name in post_columns} for post in posts_batch)

    # Move the cursor to the earliest post in this batch. min() is used rather
    # than the last element so paging does not rely on the batch's sort order.
    before_timestamp = min(post['created_utc'] for post in posts_batch)

    # Break the loop if we have reached the start date
    earliest = datetime.fromtimestamp(before_timestamp, tz=timezone.utc).replace(tzinfo=None)
    if earliest <= start_date:
        break

# Convert to DataFrame; explicit columns keep 'created_utc' present even when
# nothing was fetched, so the lines below do not raise KeyError
posts_df = pd.DataFrame(all_posts, columns=post_columns)

# Convert UTC timestamps to datetime objects
posts_df['created_utc'] = pd.to_datetime(posts_df['created_utc'], unit='s')

# Optionally, filter the DataFrame to the precise start and end dates
filtered_posts_df = posts_df[
    (posts_df['created_utc'] >= start_date) & (posts_df['created_utc'] <= end_date)
]

print(f"Total posts retrieved: {len(filtered_posts_df)}")

# analysis


In [None]:
# Inspect the raw JSON of the first child in the listing held by `res`
# (depends on `res` left in the kernel by an earlier cell).
res.json()['data']['children'][0]

{'kind': 't3',
 'data': {'approved_at_utc': None,
  'subreddit': 'Python',
  'selftext': 'Hi,\n\nI’ve been working on a lightweight log collector which will run on any vps, is extremely easy to set up, and works with your current libraries.\n\n&amp;#x200B;\n\n&amp;#x200B;\n\n[ui view](https://preview.redd.it/0issvvq983xb1.png?width=2780&amp;format=png&amp;auto=webp&amp;s=87131c227176735e5ef2cd12d148d9202743cd44)\n\nMost log platforms either\n\n* are expensive (datadog, newrelic)\n* require you to use opentelemetry, which is difficult to set up and not well documented\n* require you to host a clickhouse instance\n\nFor someone who has projects with less than 5,000 users, I failed to find a simple log platform which worked with the log libraries I already used.\n\nErlog is a simple python script which accepts logs from an HTTP endpoint, and optionally watches log files and inserts JSON formatted logs into a local duckdb database. You can then query logs using natural syntax.\n\nThere is 