In [5]:
import asyncio
import nest_asyncio
import praw
import pandas as pd
from datetime import datetime
from google.colab import userdata

# Install praw if it's not already installed
try:
    import praw
except ModuleNotFoundError:
    print("praw module not found. Installing...")
    !pip install praw
    import praw
try:
    import asyncpraw
except ModuleNotFoundError:
    print("asyncpraw module not found. Installing...")
    !pip install asyncpraw
    import asyncpraw

# Get Reddit API credentials from Colab secrets.
# Per the google.colab.userdata module, a missing secret is reported with
# SecretNotFoundError and a secret this notebook has not been granted access to
# with NotebookAccessError; neither is a KeyError, so both are caught explicitly
# and re-raised as the KeyError (with the helpful message) this cell advertises.
try:
    client_id = userdata.get('client_id')
    client_secret = userdata.get('secret_key')
    user_agent = userdata.get('user_agent')
except (KeyError, userdata.SecretNotFoundError, userdata.NotebookAccessError) as e:
    # Chain the original exception so the underlying cause stays in the traceback.
    raise KeyError(f"Missing Reddit API credential: {e}. Please ensure it's set in Colab secrets.") from e

# Initialize Reddit API client
async def create_async_reddit_instance():
    """
    Build an asyncpraw Reddit client from the module-level credentials.

    Returns:
        The object returned by ``asyncpraw.Reddit`` when called with the
        ``client_id``, ``client_secret`` and ``user_agent`` globals.
    """
    credentials = {
        'client_id': client_id,
        'client_secret': client_secret,
        'user_agent': user_agent,
    }
    return asyncpraw.Reddit(**credentials)

# Define subreddit
# Name of the subreddit to scrape (written here without an "r/" prefix). main()
# passes it to scrape_reddit_data() and uses it as the prefix of the CSV file name.
subreddit_name = "uberdrivers"


# --- Function to Scrape Posts and Comments ---
def _naive_utc(timestamp):
    """Convert a POSIX timestamp into a naive ``datetime`` expressed in UTC."""
    # Local import: the file only imports the `datetime` class, not `timezone`.
    from datetime import timezone

    # Same value as the deprecated datetime.utcfromtimestamp(): build an aware
    # UTC datetime, then drop tzinfo so callers can still tz_localize('UTC').
    return datetime.fromtimestamp(timestamp, tz=timezone.utc).replace(tzinfo=None)


def _comment_record(comment):
    """Flatten one comment object into a dict of the fields that are kept."""
    return {
        'comment_id': comment.id,
        'comment_author': str(comment.author),
        'comment_text': comment.body,
        'comment_created_utc': _naive_utc(comment.created_utc),
        'comment_upvotes': comment.score,
        # Parent id may be absent on the object; fall back to None in that case.
        'parent_id': getattr(comment, 'parent_id', None),
    }


async def scrape_reddit_data(subreddit_name, limit=500):
    """
    Scrapes Reddit posts and their comments from a given subreddit using asyncpraw.

    Each post becomes one row. The ``comments`` column holds a list of dicts
    (one per comment yielded by iterating ``submission.comments``), with
    ``MoreComments`` placeholders skipped. All creation times are naive
    ``datetime`` values expressed in UTC.

    NOTE(review): iterating ``submission.comments`` directly appears to yield
    only top-level comments; confirm whether replies should be collected too.

    Args:
        subreddit_name (str): The name of the subreddit to scrape.
        limit (int): The number of posts to retrieve.

    Returns:
        pandas.DataFrame: A DataFrame containing the scraped data, with columns
        post_id, post_title, post_author, post_text, post_created_utc,
        post_url, post_upvotes, post_num_comments and comments.

    Raises:
        Any exception raised while talking to Reddit propagates to the caller;
        the client is closed first.
    """
    reddit = await create_async_reddit_instance()
    posts_data = []
    try:
        subreddit = await reddit.subreddit(subreddit_name)

        async for submission in subreddit.new(limit=limit):
            await submission.load()  # Load the submission before accessing comments
            post_info = {
                'post_id': submission.id,
                'post_title': submission.title,
                'post_author': str(submission.author),
                'post_text': submission.selftext,
                'post_created_utc': _naive_utc(submission.created_utc),
                'post_url': submission.url,
                'post_upvotes': submission.score,
                'post_num_comments': submission.num_comments,
                'comments': [],
            }

            async for comment in submission.comments:
                if isinstance(comment, asyncpraw.models.MoreComments):
                    continue
                post_info['comments'].append(_comment_record(comment))

            posts_data.append(post_info)
    finally:
        # Always release the client, even when a request fails part-way through;
        # otherwise the underlying session is left open.
        await reddit.close()
    return pd.DataFrame(posts_data)



# --- Run Scraper and Store Data ---
async def main():
    """
    Scrape the configured subreddit, localize its timestamps, and save it as CSV.

    Fetches up to 500 of the newest posts of ``subreddit_name`` through
    ``scrape_reddit_data``, converts post and comment creation times from UTC
    to America/Los_Angeles, prints the first rows, and writes the frame to a
    timestamped CSV file in the current working directory. When the scrape
    returns no posts, a message is printed and no file is written.

    Returns:
        None
    """
    local_tz = 'America/Los_Angeles'

    print("Scraping data...")
    df = await scrape_reddit_data(subreddit_name, limit=500)
    print("Scraping complete.")

    if df.empty:
        # An empty frame has no 'post_created_utc' column, so the conversion
        # below would fail with a KeyError; stop here instead.
        print("No posts were returned; nothing to save.")
        return

    # Convert post_created_utc to local time
    df['post_created_utc'] = df['post_created_utc'].dt.tz_localize('UTC').dt.tz_convert(local_tz)
    # Convert comment created utc to local time.
    # The scraped values are naive datetimes already expressed in UTC, so they
    # are localized directly. Going through .timestamp() would interpret the
    # naive value as the machine's local time and shift it on non-UTC hosts.
    for comments in df['comments']:
        for comment in comments:
            comment['comment_created_utc'] = (
                pd.to_datetime(comment['comment_created_utc']).tz_localize('UTC').tz_convert(local_tz)
            )

    # --- Display and Save Data ---
    print(df.head())

    # Save data to a CSV file
    file_name = f"{subreddit_name}_reddit_data_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv"
    df.to_csv(file_name, index=False)
    print(f"Data saved to {file_name}")


if __name__ == "__main__":
    # Apply the nest_asyncio patch before starting the scrape; presumably this
    # is what lets asyncio.run() work inside the notebook's already-running
    # event loop — confirm against the hosting environment.
    nest_asyncio.apply()
    scrape_job = main()
    asyncio.run(scrape_job)

Scraping data...


  df = await scrape_reddit_data(subreddit_name, limit=500)


Scraping complete.
   post_id                                         post_title  \
0  1ildz22                            7 hours with Uber black   
1  1ildvf3                   Share disappeared - anyone else?   
2  1ildp4e                     Two years of offers like this.   
3  1ildf2n   Tried uber share for the first time, what a scam   
4  1ilcder  Pax should not be allowed to change the trip w...   

           post_author                                          post_text  \
0          Brandon2058  Not to bad $50+ per hour just doing straight U...   
1      ___Your___Mom__  Not complaining, happy it's gone. Everyday whe...   
2  No-Examination-4742                                                      
3    misterstealurbaby            5 dollars for 15km and almost one hour.   
4            TrollBond  It was a nice warm afternoon and I had only 20...   

           post_created_utc  \
0 2025-02-09 04:47:12-08:00   
1 2025-02-09 04:41:05-08:00   
2 2025-02-09 04:30:10-08:00   
3 2

In [3]:
!pip install praw

Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Downloading praw-7.8.1-py3-none-any.whl (189 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.3/189.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: update_checker, prawcore, praw
Successfully installed praw-7.8.1 prawcore-2.4.0 update_checker-0.18.0
