In [1]:
%%capture
!pip install praw tweepy pandas requests beautifulsoup4 python-dotenv textblob tqdm

In [2]:
import json
import os
import re
import time
from datetime import datetime

import pandas as pd
import praw
import tweepy
from dotenv import load_dotenv
from langdetect import detect, DetectorFactory
from textblob import TextBlob
from tqdm import tqdm

DetectorFactory.seed = 0  # To make detection deterministic

### Fetch Credentials

In [3]:
# Populate the process environment via python-dotenv before reading from it
# (presumably from a local .env file -- confirm where it lives for this project).
load_dotenv()

# Reddit API credentials; os.getenv yields None for any variable that is not set.
reddit_client_id, reddit_client_secret, reddit_user_agent = map(
    os.getenv,
    ("REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET", "REDDIT_USER_AGENT"),
)

### Crawling Configurations

In [4]:
# Number of submissions requested per subreddit by the crawl cell
DEFAULT_REDDIT_LIMIT = 100

# Subreddits visited by the crawl, in this order
SUBREDDITS = [
    "CryptoCurrency",
    "CryptoMarkets",
    "binance",
    "CoinBase",
    "Crypto_com",
    "kucoin",
    "BitcoinBeginners",
    "CryptoScams",
    "Kraken",
    "Bybit",
    "OKX",
    "CryptoTechnology",
    "Ethereum",
]

# Where the crawled records are written
REDDIT_OUTPUT_CSV = "../data/reddit_crypto_data.csv"

# Exchange label -> lowercase keywords that identify it in a post or comment.
# Dict order matters: the first exchange with a matching keyword is the label used.
CRYPTO_EXCHANGES = {
    "binance": [
        "binance", "bnb", "binance us", "binance app", "binance exchange",
    ],
    "coinbase": [
        "coinbase", "coinbase pro", "coinbase wallet", "cb wallet",
    ],
    "kraken": ["kraken", "kraken exchange", "kraken pro"],
    "okx": ["okx", "okex"],
    "kucoin": ["kucoin", "kucoin exchange"],
    "crypto.com": ["crypto.com", "cro", "crypto.com app", "cdc"],
    "bybit": ["bybit", "bybit app"],
}

### Sentiment Analysis Method
- We want a balanced number of positive, negative, and neutral samples.

In [5]:
def get_sentiment(text, neutral_threshold=0.0):
    """Classify text as 'positive', 'neutral' or 'negative' using TextBlob polarity.

    Args:
        text: The text to classify. ``None``, empty, or whitespace-only input
            is labelled 'neutral' without invoking TextBlob.
        neutral_threshold: Half-width of the neutral band. Polarity above this
            value is 'positive', below its negation is 'negative', and anything
            in between is 'neutral'. The default of 0.0 labels only an exactly
            zero polarity as neutral; a larger value widens the neutral class.

    Returns:
        One of the strings 'positive', 'neutral' or 'negative'.
    """
    # Nothing to analyse: avoid passing None to TextBlob, which expects a string.
    if not text or not text.strip():
        return 'neutral'

    polarity = TextBlob(text).sentiment.polarity
    if polarity > neutral_threshold:
        return 'positive'
    if polarity < -neutral_threshold:
        return 'negative'
    return 'neutral'

### Reddit Crawler Setup

In [6]:
class RedditCrawler:
    """Crawl Reddit submissions and comments related to crypto exchange services.

    Every collected submission or comment is stored as a dict in ``self.data``.
    Records accumulate across successive ``crawl_subreddit`` calls and can be
    exported with ``save_to_csv`` or ``save_to_json``.
    """

    def __init__(self, client_id, client_secret, user_agent):
        """Initialize the Reddit crawler with API credentials."""
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent
        )
        self.data = []

    @staticmethod
    def _is_english(text):
        """Return True when langdetect classifies ``text`` as English."""
        try:
            return detect(text) == 'en'
        except Exception:
            # Detection can fail on text without usable features; treat such
            # text as non-English. Unlike a bare ``except``, this does not
            # swallow KeyboardInterrupt during a long crawl.
            return False

    @staticmethod
    def _format_utc(timestamp):
        """Format a Unix timestamp as 'YYYY-MM-DD HH:MM:SS' in UTC."""
        # gmtime keeps the value in UTC, matching the 'created_utc' field name,
        # instead of converting to the local timezone of the machine.
        return time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(timestamp))

    def crawl_subreddit(self, subreddit_name, limit=1000, search_query=None):
        """Collect English submissions and comments from one subreddit.

        Args:
            subreddit_name: Name of the subreddit, without the ``r/`` prefix.
            limit: Maximum number of submissions to request.
            search_query: When given, submissions come from a subreddit search
                for this query; otherwise the all-time top submissions are used.

        Matching records are appended to ``self.data``; nothing is returned.
        """
        subreddit = self.reddit.subreddit(subreddit_name)
        if search_query:
            submissions = subreddit.search(search_query, limit=limit)
        else:
            submissions = subreddit.top(time_filter="all", limit=limit)

        for submission in tqdm(submissions, desc=f"Crawling r/{subreddit_name}"):
            selftext = submission.selftext
            # Keep the submission only if it has a body of more than 20
            # characters written in English. Filtering first avoids running
            # sentiment and exchange detection on posts that get discarded.
            if selftext and len(selftext) > 20 and self._is_english(selftext):
                post_text = submission.title + " " + selftext
                self.data.append({
                    'id': submission.id,
                    'title': submission.title,
                    'text': selftext,
                    'score': submission.score,
                    'created_utc': self._format_utc(submission.created_utc),
                    'author': str(submission.author),
                    'num_comments': submission.num_comments,
                    'subreddit': subreddit_name,
                    'permalink': f"https://reddit.com{submission.permalink}",
                    'type': 'submission',
                    'platform': self._detect_crypto_exchange(post_text),
                    'sentiment': get_sentiment(post_text)
                })

            # Comments are collected even when the submission itself was skipped.
            submission.comments.replace_more(limit=5)
            for comment in submission.comments.list():
                body = comment.body
                # Skip comments shorter than 20 characters or not in English.
                if not body or len(body) < 20 or not self._is_english(body):
                    continue

                self.data.append({
                    'id': comment.id,
                    'text': body,
                    'score': comment.score,
                    'created_utc': self._format_utc(comment.created_utc),
                    'author': str(comment.author),
                    'parent_id': comment.parent_id,
                    'subreddit': subreddit_name,
                    'permalink': f"https://reddit.com{submission.permalink}{comment.id}/",
                    'type': 'comment',
                    'platform': self._detect_crypto_exchange(body),
                    'sentiment': get_sentiment(body)
                })

    def _detect_crypto_exchange(self, text, exchanges=None):
        """Return the first exchange whose keyword occurs in ``text``.

        Args:
            text: Text to inspect; matching is case-insensitive.
            exchanges: Mapping of exchange label to a list of keywords.
                Defaults to the module-level ``CRYPTO_EXCHANGES``.

        Returns:
            The label of the first exchange (in mapping order) with a matching
            keyword, or 'general' when nothing matches or ``text`` is empty.
        """
        if not text:
            return 'general'
        if exchanges is None:
            exchanges = CRYPTO_EXCHANGES

        text = text.lower()
        for exchange, keywords in exchanges.items():
            for keyword in keywords:
                # Match whole keywords only, so short tickers such as 'bnb' or
                # 'cro' do not fire inside words like 'airbnb' or 'micro'.
                pattern = r'(?<!\w)' + re.escape(keyword.lower()) + r'(?!\w)'
                if re.search(pattern, text):
                    return exchange
        return 'general'

    def save_to_csv(self, filename):
        """Write the collected records to ``filename`` as CSV, de-duplicated by id.

        The parent directory is created when missing. Returns the de-duplicated
        DataFrame that was written (empty when nothing has been collected).
        """
        df = pd.DataFrame(self.data)
        # An empty crawl yields a frame without an 'id' column; skip the
        # de-duplication in that case instead of raising KeyError.
        if 'id' in df.columns:
            df = df.drop_duplicates(subset=['id'])

        out_dir = os.path.dirname(filename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        df.to_csv(filename, index=False)
        print(f"Saved {len(df)} records to {filename}")
        return df

    def save_to_json(self, filename):
        """Write all collected records (without de-duplication) to a JSON file."""
        out_dir = os.path.dirname(filename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.data, f)
        print(f"Saved {len(self.data)} records to {filename}")

### Execute Crawling Process

In [7]:
# Build the crawler from the Reddit credentials read from the environment
# (arguments are positional: client id, client secret, user agent).
reddit_crawler = RedditCrawler(reddit_client_id, reddit_client_secret, reddit_user_agent)

In [8]:
# Crawl every configured subreddit, requesting up to DEFAULT_REDDIT_LIMIT
# submissions from each. No search query is passed, so the crawler uses each
# subreddit's all-time top submissions. Records accumulate inside
# `reddit_crawler`. The recorded run below took roughly an hour in total.
for subreddit in SUBREDDITS:
    reddit_crawler.crawl_subreddit(subreddit, limit=DEFAULT_REDDIT_LIMIT)

# Write the accumulated records to CSV; the returned frame is de-duplicated by id.
reddit_df = reddit_crawler.save_to_csv(REDDIT_OUTPUT_CSV)

# Display the saved frame for a quick visual inspection
reddit_df

Crawling r/CryptoCurrency: 100it [16:50, 10.10s/it]
Crawling r/CryptoMarkets: 100it [03:53,  2.33s/it]
Crawling r/binance: 100it [03:54,  2.34s/it]
Crawling r/CoinBase: 100it [06:49,  4.09s/it]
Crawling r/Crypto_com: 100it [04:16,  2.56s/it]
Crawling r/kucoin: 100it [02:06,  1.26s/it]
Crawling r/BitcoinBeginners: 100it [03:52,  2.32s/it]
Crawling r/CryptoScams: 100it [02:32,  1.52s/it]
Crawling r/Kraken: 100it [01:12,  1.38it/s]
Crawling r/Bybit: 100it [01:52,  1.12s/it]
Crawling r/OKX: 100it [01:05,  1.54it/s]
Crawling r/CryptoTechnology: 100it [02:07,  1.28s/it]
Crawling r/Ethereum: 100it [07:52,  4.72s/it]


Saved 188590 records to ../data/reddit_crypto_data.csv


Unnamed: 0,id,title,text,score,created_utc,author,num_comments,subreddit,permalink,type,platform,sentiment,parent_id
0,n7rl2y,You hear about the kid who put in $500 into a ...,You hear about the kid who put in $500 into a ...,53912,2021-05-08 23:28:35,jonbristow,4452.0,CryptoCurrency,https://reddit.com/r/CryptoCurrency/comments/n...,submission,general,negative,
1,gxfsm80,,My buddy (who's a successful business person w...,642,2021-05-09 07:10:28,DopeMeme_Deficiency,,CryptoCurrency,https://reddit.com/r/CryptoCurrency/comments/n...,comment,general,positive,t3_n7rl2y
2,gxeifve,,You also don’t hear about me turning my $20 in...,17076,2021-05-09 00:43:56,louisgrc11,,CryptoCurrency,https://reddit.com/r/CryptoCurrency/comments/n...,comment,general,neutral,t3_n7rl2y
3,gxea1f7,,Survivorship bias is real and it's everywhere,1472,2021-05-08 23:35:03,cremebruleejuulpod,,CryptoCurrency,https://reddit.com/r/CryptoCurrency/comments/n...,comment,general,positive,t3_n7rl2y
4,gxefbuj,,I'm one of those guys who fomo'd in 2017 and e...,678,2021-05-09 00:18:13,foreignGER,,CryptoCurrency,https://reddit.com/r/CryptoCurrency/comments/n...,comment,general,positive,t3_n7rl2y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
188585,hiwxffp,,If anyone wondering why u\\tryM3B1tch called m...,0,2021-11-02 02:39:27,WarrenMuppet007,,Ethereum,https://reddit.com/r/ethereum/comments/qkaa7r/...,comment,general,positive,t1_hiwvs8g
188586,hiyu47e,,Check out Hop Protocol. You can bridge between...,2,2021-11-02 10:54:22,,,Ethereum,https://reddit.com/r/ethereum/comments/qkaa7r/...,comment,general,positive,t1_hixsz9x
188587,hiz5rau,,"i read through their white paper, and while it...",2,2021-11-02 12:43:46,3umel,,Ethereum,https://reddit.com/r/ethereum/comments/qkaa7r/...,comment,general,positive,t1_hiyu47e
188588,hiwy2gy,,"Lol, you tried to dig up my post history to in...",1,2021-11-02 02:43:46,WarrenMuppet007,,Ethereum,https://reddit.com/r/ethereum/comments/qkaa7r/...,comment,general,positive,t1_hiwxqro
