In [8]:
%%capture
!pip install praw tweepy pandas requests beautifulsoup4 python-dotenv textblob tqdm

In [9]:
import json
import os
import time
from datetime import datetime, timezone

import pandas as pd
import praw
import tweepy
from dotenv import load_dotenv
from langdetect import detect, DetectorFactory
from textblob import TextBlob
from tqdm import tqdm

# Pin langdetect's seed so that detect() gives the same answer for the same
# text on every run (detection is otherwise not deterministic).
DetectorFactory.seed = 0

### Fetch Credentials

In [13]:
# SECURITY: never hardcode API credentials in a notebook. REDDIT_CLIENT_ID and
# REDDIT_CLIENT_SECRET must come from the process environment or from a local,
# git-ignored .env file. The values that used to be written in this cell have
# been exposed and should be revoked and regenerated.
#
# The user agent is not a secret, so a default is provided; setdefault keeps
# any value that is already present in the environment.
os.environ.setdefault("REDDIT_USER_AGENT", "OpinionSearchBot/0.1")

In [14]:
# Pull variables from a local .env file (if there is one) into os.environ.
load_dotenv()

# Retrieve credentials
reddit_client_id = os.getenv("REDDIT_CLIENT_ID")
reddit_client_secret = os.getenv("REDDIT_CLIENT_SECRET")
reddit_user_agent = os.getenv("REDDIT_USER_AGENT")

# Fail fast with a readable message instead of handing None (or an empty
# string) to praw and getting an obscure error once the crawl starts.
_required_credentials = {
    "REDDIT_CLIENT_ID": reddit_client_id,
    "REDDIT_CLIENT_SECRET": reddit_client_secret,
    "REDDIT_USER_AGENT": reddit_user_agent,
}
_missing_credentials = [name for name, value in _required_credentials.items() if not value]
if _missing_credentials:
    raise RuntimeError(
        "Missing Reddit credentials: "
        + ", ".join(_missing_credentials)
        + ". Set them in the environment or in a .env file."
    )

### Crawling Configurations

In [15]:
# Number of submissions requested from each subreddit by the crawl loop.
DEFAULT_REDDIT_LIMIT = 800

# Subreddits to crawl, in the order they are visited.
SUBREDDITS = [
    "CryptoCurrency", "CryptoMarkets", "binance", "CoinBase", "Crypto_com",
    "kucoin", "BitcoinBeginners", "CryptoScams", "Kraken", "Bybit", "OKX",
    "CryptoTechnology", "Ethereum",
]

# Destination of the crawled records (path is relative to the notebook).
REDDIT_OUTPUT_CSV = "../data/reddit_crypto_data.csv"

# Exchange label -> lowercase keywords that identify it in a text.
# Insertion order matters: the crawler returns the first exchange that has a
# keyword occurring in the text.
CRYPTO_EXCHANGES = {
    "binance": [
        "binance",
        "bnb",
        "binance us",
        "binance app",
        "binance exchange",
    ],
    "coinbase": [
        "coinbase",
        "coinbase pro",
        "coinbase wallet",
        "cb wallet",
    ],
    "kraken": ["kraken", "kraken exchange", "kraken pro"],
    "okx": ["okx", "okex"],
    "kucoin": ["kucoin", "kucoin exchange"],
    "crypto.com": ["crypto.com", "crypto.com app"],
    "bybit": ["bybit", "bybit app"],
}

### Sentiment Analysis Method
- We want a balanced number of positive, negative, and neutral examples in the collected data.

In [16]:
def get_sentiment(text):
    """Label ``text`` by the sign of its TextBlob polarity.

    Returns 'neutral' when the polarity is exactly 0, 'positive' when it is
    greater than 0 and 'negative' otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity == 0:
        return 'neutral'
    return 'positive' if polarity > 0 else 'negative'

### Reddit Crawler Setup

In [17]:
class RedditCrawler:
    """Collect Reddit submissions and comments that mention a known crypto exchange.

    Accepted items are appended to ``self.data`` as flat dicts (one per
    submission or comment) and can be written out with ``save_to_csv`` or
    ``save_to_json``.
    """

    def __init__(self, client_id, client_secret, user_agent):
        """Build the PRAW client from the given credentials and start with no records."""
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent
        )
        self.data = []

    def crawl_subreddit(self, subreddit_name, limit=2000, search_query=None, replace_more_limit=5):
        """Append matching submissions and comments from one subreddit to ``self.data``.

        Args:
            subreddit_name: Subreddit to crawl, without the ``r/`` prefix.
            limit: Maximum number of submissions to request.
            search_query: When given, crawl the results of searching the
                subreddit for this query; otherwise crawl its all-time top
                submissions.
            replace_more_limit: Passed as ``limit`` to PRAW's
                ``comments.replace_more`` for every accepted submission.
        """
        subreddit = self.reddit.subreddit(subreddit_name)
        if search_query:
            submissions = subreddit.search(search_query, limit=limit)
        else:
            submissions = subreddit.top(time_filter="all", limit=limit)

        for submission in tqdm(submissions, desc=f"Crawling r/{subreddit_name}"):
            post_text = f"{submission.title} {submission.selftext}".strip()
            platform = self._detect_crypto_exchange(post_text)

            # A rejected submission is skipped together with all of its comments.
            if not self._is_valid_english_opinion(post_text, platform):
                continue

            self.data.append(self._submission_record(submission, subreddit_name, platform))

            # Process comments
            submission.comments.replace_more(limit=replace_more_limit)
            for comment in submission.comments.list():
                comment_text = comment.body.strip()
                # Each comment is classified on its own text, independently of its post.
                platform = self._detect_crypto_exchange(comment_text)

                if not self._is_valid_english_opinion(comment_text, platform):
                    continue

                self.data.append(
                    self._comment_record(submission, comment, comment_text, subreddit_name, platform)
                )

    @staticmethod
    def _format_utc(epoch_seconds):
        """Render an epoch timestamp as a ``YYYY-MM-DD HH:MM:SS`` string in UTC."""
        # An explicit tz is required: without it fromtimestamp() converts to the
        # machine's local time, which would contradict the ``created_utc`` key.
        return datetime.fromtimestamp(epoch_seconds, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

    def _submission_record(self, submission, subreddit_name, platform):
        """Build the flat record stored for an accepted submission."""
        return {
            'id': submission.id,
            'title': submission.title,
            'text': submission.selftext,
            'score': submission.score,
            'created_utc': self._format_utc(submission.created_utc),
            'author': str(submission.author),
            'num_comments': submission.num_comments,
            'subreddit': subreddit_name,
            'permalink': f"https://reddit.com{submission.permalink}",
            'type': 'submission',
            'platform': platform,
        }

    def _comment_record(self, submission, comment, comment_text, subreddit_name, platform):
        """Build the flat record stored for an accepted comment of ``submission``."""
        return {
            'id': comment.id,
            'text': comment_text,
            'score': comment.score,
            'created_utc': self._format_utc(comment.created_utc),
            'author': str(comment.author),
            'parent_id': comment.parent_id,
            'subreddit': subreddit_name,
            'permalink': f"https://reddit.com{submission.permalink}{comment.id}/",
            'type': 'comment',
            'platform': platform,
        }

    def _detect_crypto_exchange(self, text):
        """Return the first exchange in ``CRYPTO_EXCHANGES`` with a keyword found in ``text``.

        The text is lowercased and each keyword is tested with a plain
        substring check, so a short keyword also matches inside a longer word.
        Returns ``'general'`` when no keyword occurs.
        """
        lowered = text.lower()
        for exchange, keywords in CRYPTO_EXCHANGES.items():
            if any(keyword in lowered for keyword in keywords):
                return exchange
        return 'general'

    def _is_valid_english_opinion(self, text, platform):
        """Filter out short, non-English, or general-opinion content."""
        if not text or len(text) < 20:
            return False
        if platform == 'general':
            return False
        try:
            return detect(text) == 'en'
        except Exception:
            # Any detection failure means "reject this text". Unlike a bare
            # ``except:``, this still lets KeyboardInterrupt / SystemExit
            # through, so a long crawl can be interrupted.
            return False

    @staticmethod
    def _ensure_parent_dir(filename):
        """Create the directory that will contain ``filename`` if it is missing."""
        parent = os.path.dirname(filename)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def save_to_csv(self, filename):
        """Write the records, de-duplicated by ``id``, to ``filename`` and return the DataFrame."""
        df = pd.DataFrame(self.data)
        # With no records there is no 'id' column, and drop_duplicates would raise KeyError.
        if 'id' in df.columns:
            df = df.drop_duplicates(subset=['id'])
        self._ensure_parent_dir(filename)
        df.to_csv(filename, index=False)
        print(f"Saved {len(df)} records to {filename}")
        return df

    def save_to_json(self, filename):
        """Dump every collected record (no de-duplication) to ``filename`` as JSON."""
        self._ensure_parent_dir(filename)
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.data, f)
        print(f"Saved {len(self.data)} records to {filename}")

### Execute Crawling Process

In [18]:
# Build the crawler from the credentials read from the environment
# (arguments are, in order: client id, client secret, user agent).
reddit_crawler = RedditCrawler(reddit_client_id, reddit_client_secret, reddit_user_agent)

In [19]:
# Crawl Reddit
try:
    for subreddit in SUBREDDITS:
        reddit_crawler.crawl_subreddit(subreddit, limit=DEFAULT_REDDIT_LIMIT)
finally:
    # Save Reddit Data
    # The full crawl takes more than an hour, so whatever has been collected
    # is written out even when a crawl is interrupted or raises; the original
    # exception still propagates after the save.
    reddit_df = reddit_crawler.save_to_csv(REDDIT_OUTPUT_CSV)

# Optionally Display Data to Inspect
reddit_df

Crawling r/CryptoCurrency: 800it [10:28,  1.27it/s]
Crawling r/CryptoMarkets: 800it [01:02, 12.84it/s]
Crawling r/binance: 800it [05:43,  2.33it/s]
Crawling r/CoinBase: 800it [20:52,  1.57s/it]
Crawling r/Crypto_com: 800it [06:26,  2.07it/s]
Crawling r/kucoin: 800it [03:12,  4.16it/s]
Crawling r/BitcoinBeginners: 800it [03:14,  4.10it/s]
Crawling r/CryptoScams: 800it [02:16,  5.88it/s]
Crawling r/Kraken: 800it [06:01,  2.21it/s]
Crawling r/Bybit: 800it [07:45,  1.72it/s]
Crawling r/OKX: 800it [07:22,  1.81it/s]
Crawling r/CryptoTechnology: 800it [00:50, 15.75it/s]
Crawling r/Ethereum: 800it [02:20,  5.69it/s]


Saved 58341 records to ../data/reddit_crypto_data.csv


Unnamed: 0,id,title,text,score,created_utc,author,num_comments,subreddit,permalink,type,platform,parent_id
0,n9cby0,Not every new coin is a shitcoin: How to spot ...,"A few days ago, I made a post titled *""Rugpull...",20342,2021-05-11 02:29:24,hazelvelvet,2467.0,CryptoCurrency,https://reddit.com/r/CryptoCurrency/comments/n...,submission,binance,
1,gxomc6o,,Great post. I have a few questions. \n\n1) How...,11,2021-05-11 10:15:20,hoti0101,,CryptoCurrency,https://reddit.com/r/CryptoCurrency/comments/n...,comment,binance,t3_n9cby0
2,gxnksae,,> Start trading on the BSC (Binance) Smart Cha...,58,2021-05-11 05:05:51,fakesteez,,CryptoCurrency,https://reddit.com/r/CryptoCurrency/comments/n...,comment,binance,t3_n9cby0
3,gxsnol3,,"okay so i bought into 3 coins, around $10 tota...",4,2021-05-12 08:13:54,still_alive11,,CryptoCurrency,https://reddit.com/r/CryptoCurrency/comments/n...,comment,binance,t3_n9cby0
4,gxnazsz,,Isn’t the minimum purchase set to $15 on binance?,7,2021-05-11 03:55:45,BadAssPleb,,CryptoCurrency,https://reddit.com/r/CryptoCurrency/comments/n...,comment,binance,t3_n9cby0
...,...,...,...,...,...,...,...,...,...,...,...,...
58336,guhqwvf,,"Wait, there's an actual Ethereum app? Do I nee...",2,2021-04-14 22:46:45,HallofLogos,,Ethereum,https://reddit.com/r/ethereum/comments/mqr22z/...,comment,coinbase,t3_mqr22z
58337,guhneao,,I see the From and To on [Etherscan.io](https:...,1,2021-04-14 22:19:55,nightwillalwayswin,,Ethereum,https://reddit.com/r/ethereum/comments/mqr22z/...,comment,coinbase,t1_guhm1zp
58338,gui4vqg,,ive been all in on crypto for years and i stil...,20,2021-04-15 00:28:07,,,Ethereum,https://reddit.com/r/ethereum/comments/mqr22z/...,comment,coinbase,t1_gui4j8p
58339,guiqmeu,,I can't even tell you how happy I am to hear t...,14,2021-04-15 03:06:30,lukejames,,Ethereum,https://reddit.com/r/ethereum/comments/mqr22z/...,comment,coinbase,t1_gui9zsq
