In [1]:
# Imports
from google.colab import files
import requests
import csv
import datetime
from tqdm import tqdm
import time
import random

In [None]:
# Date window to scrape; the scraping loop iterates day by day from
# start_date up to and including end_date.
start_date = datetime.datetime(year=2023, month=5, day=1)
end_date = datetime.datetime(year=2023, month=5, day=11)

# Subreddits the scraping loop can switch to, in this order, when a day's
# query returns no posts.
subreddits = [
    'bitcoinmarkets', 'cryptocurrency', 'bitcoin',
    'altcoin', 'cryptomarkets', 'cryptotrading',
    'bitcoinbeginners', 'cryptotechnology', 'bitcoinmining',
    'cryptocurrencytrading', 'cryptocurrencyinvesting', 'cryptocurrencies',
    'bitcoinstocks', 'cryptoinvestor', 'bitcoinserious',
    'cryptocurrencynews', 'bitcointrading', 'cryptosecurity',
    'bitcoinminingpool', 'cryptohardware', 'all',
]

# Position in `subreddits` used by the scraping loop's fallback logic.
random_subreddit_index = 0
def select_subreddit_by_index(index):
    """Look up a name in the module-level ``subreddits`` list by position.

    Args:
        index: Zero-based position in ``subreddits``.

    Returns:
        The subreddit name at ``index``, or ``None`` when ``index`` is
        negative or at/after the end of the list.
    """
    # Guard first: negative values are rejected rather than wrapping around
    # the way normal Python list indexing would.
    if not (0 <= index < len(subreddits)):
        return None
    return subreddits[index]


def _post_to_row(post):
    """Flatten one submission dict from the API response into a CSV row.

    The returned list follows the CSV header order: post_id, title, selftext,
    url, author, score, publish_date, num_of_comments, permalink, flair.

    A missing flair becomes 'NaN'. A missing selftext, or one equal to
    '[removed]' / '[deleted]', becomes ''. Any other missing key raises
    KeyError.
    """
    selftext = post.get('selftext', '')
    # Removed/deleted bodies carry a placeholder instead of real text.
    if selftext in ('[removed]', '[deleted]'):
        selftext = ''
    return [
        post['id'],
        post['title'],
        selftext,
        post['url'],
        post['author'],
        post['score'],
        # NOTE(review): fromtimestamp() converts using the machine's local
        # timezone; confirm that is intended for a field named created_utc.
        datetime.datetime.fromtimestamp(post['created_utc']),
        post['num_comments'],
        post['permalink'],
        post.get('link_flair_text', 'NaN'),
    ]


# Scrape one day at a time from start_date through end_date (inclusive) and
# write every returned submission to reddit_bitcoin_posts.csv.
#
# Each day is first queried against the 'bitcoin' subreddit. If that returns no
# posts, the entries of `subreddits` are tried one by one, in order, for the
# same day. If every fallback is empty too, the day is recorded as having zero
# posts and the loop moves on.
#
# Explicit UTF-8 so post text is written the same way on every platform instead
# of depending on the locale's default encoding.
with open('reddit_bitcoin_posts.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file, quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['post_id', 'title', 'selftext', 'url', 'author', 'score', 'publish_date', 'num_of_comments',
                     'permalink', 'flair'])

    SECONDS_PER_DAY = 86400
    REQUEST_TIMEOUT_SECONDS = 30

    # Start from a clean fallback state so re-running this cell after an
    # interrupted run does not resume from a stale index.
    random_subreddit_index = 0
    current_subreddit = 'bitcoin'
    date = 0

    while date <= (end_date - start_date).days:
        day_start = start_date + datetime.timedelta(date)
        date_str = day_start.strftime("%Y-%m-%d")
        timestamp = int(day_start.timestamp())

        url = f"https://api.pushshift.io/reddit/search/submission/?q=bitcoin btc&after={timestamp}&before=" \
              f"{timestamp + SECONDS_PER_DAY}&size=1000&subreddit={current_subreddit}"
        print(url)

        # Repeat the same request until the API answers with HTTP 200.
        # Network errors and timeouts are treated like a failed status code
        # instead of aborting the whole scrape.
        while True:
            try:
                response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
            except requests.RequestException as e:
                print(f"Request for {date_str} failed ({e}). Retrying in 1 minute...")
                time.sleep(60)
                continue
            if response.status_code == 200:
                break
            print(f"Failed to fetch data from the API for {date_str}. Status code: {response.status_code}. Retrying in 1 minute...")
            time.sleep(60)

        data = response.json()['data']

        if len(data) == 0 and random_subreddit_index < len(subreddits):
            # Pick the fallback BEFORE advancing the index so the subreddit
            # named in the message is the one actually queried next, and so the
            # index never runs past the end of the list.
            current_subreddit = select_subreddit_by_index(random_subreddit_index)
            print(f"No Reddit posts found for {date_str}. Trying with a different subreddit: {current_subreddit}")
            random_subreddit_index += 1
            continue  # Same date, next subreddit

        # Either posts were found or every fallback was exhausted: the next
        # date starts again from the default subreddit.
        random_subreddit_index = 0
        current_subreddit = 'bitcoin'

        written = 0
        for post in data:
            row = _post_to_row(post)
            try:
                writer.writerow(row)
                written += 1
            except Exception as e:
                # Best effort: skip the row, but say which one and why.
                print(f"Something went wrong writing post {row[0]} to the CSV file ({e}), skipping this entry..")

        print(f"Fetched and wrote {written} Reddit posts for {date_str} to the CSV file.")
        date += 1  # Advance to the next date


In [None]:
# Send the CSV produced by the scraping cell to the local machine
# through the google.colab `files` helper.
output_csv = "reddit_bitcoin_posts.csv"
files.download(output_csv)