In [None]:
import os
import sys
import numpy as np
import pandas as pd
import json
from pprint import pprint
import requests
import time
import random
!pip install langdetect

from langdetect import detect

from datetime import datetime, date, timedelta
import string
from scipy import stats

from pymongo import MongoClient

# MongoDB connection string. It is read from the MONGO_HOST environment
# variable when that is set, so credentials do not have to be saved in the
# notebook; otherwise fill in the fallback value below.
MONGO_HOST = os.environ.get('MONGO_HOST', '')

try:
    client = MongoClient(MONGO_HOST)
    db = client.database # Use the database named "database"
except Exception as e:
    print(e)
    # Fail fast: if the client cannot be built, `db` is never defined and the
    # insert step at the end of the notebook would only fail with a NameError
    # after the (long) scraping loop has already run.
    raise

In [None]:
def date2timestamp(date):
    '''
    Convert a "YYYY/MM/DD" date string (e.g. "2011/12/01") to a Unix timestamp.

    The string is parsed with the "%Y/%m/%d" format and the resulting naive
    datetime is converted with time.mktime, i.e. midnight is interpreted in
    the local timezone of the machine. The result is truncated to an int.
    '''
    parsed = datetime.strptime(date, "%Y/%m/%d")
    seconds = time.mktime(parsed.timetuple())
    return int(seconds)

def next_day(date):
    """Return the day following `date`; input and output are "YYYY/MM/DD" strings."""
    day_format = "%Y/%m/%d"
    following = datetime.strptime(date, day_format) + timedelta(days=1)
    return following.strftime(day_format)

def list_of_days(start_date, end_date):
    """Return every day from start_date to end_date (inclusive) as strings.

    Each day is rendered with str() and has '-' replaced by '/', so
    datetime.date inputs produce "YYYY/MM/DD" strings. The list is empty
    when end_date precedes start_date.
    """
    span = (end_date - start_date).days
    return [
        str(start_date + timedelta(days=offset)).replace('-', '/')
        for offset in range(span + 1)
    ]

def scrape_reddit(subreddit, date_start, date_end, size = 1000):
    """ Collects the text posts of a given subreddit through the Pushshift API

    Args:
        subreddit: name of the subreddit from which the posts will be collected
        date_start: start date of the collection, as a "YYYY/MM/DD" string
        date_end: end date of the collection, as a "YYYY/MM/DD" string
        size: number of posts requested from the API (sent as the `size`
            query parameter)

    Returns:
        DataFrame with one row per kept post. The columns 'subreddit',
        'author', 'date', 'post', 'title' and 'upvote_ratio' are always
        present (NaN when the API did not return that field), followed by
        every other field returned by the API. Posts without a 'selftext',
        or whose 'selftext' is empty, '[removed]' or contains '[deleted]',
        are dropped.

    Raises:
        Exception: whatever the second attempt raises (network error, HTTP
            error status, invalid JSON, missing 'data' key) when the request
            fails twice in a row.
    """
    base_columns = ['subreddit', 'author', 'date', 'post', "title", "upvote_ratio"]
    request_timeout = 60  # seconds; without a timeout a stalled connection hangs forever
    retry_delay = 30      # seconds to wait before the single retry

    start = date2timestamp(date_start)
    end = date2timestamp(date_end)
    # use the pushshift api to extract out data
    url = 'https://api.pushshift.io/reddit/search/submission/?subreddit={}&sort=desc&sort_type=created_utc&after={}&before={}&size={}'.format(subreddit,start, end, size)
    print(url)

    def fetch_posts():
        # One request attempt: any failure (transport, HTTP status, JSON
        # decoding, unexpected payload) surfaces as an exception.
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()
        return response.json()['data']

    try:
        posts = fetch_posts()
    except Exception as err:
        # `except Exception` (not a bare except) so that Ctrl-C / kernel
        # interrupt is not turned into a 30 second sleep and a retry.
        print('Request failed ({!r}); retrying in {}s'.format(err, retry_delay))
        time.sleep(retry_delay)
        posts = fetch_posts()

    # Collect the kept posts in a list and build the frame once:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    records = []
    for post in posts:
        text = post.get('selftext')
        if not isinstance(text, str):
            # missing or null selftext: nothing to keep
            continue
        if text == "" or text == '[removed]' or '[deleted]' in text:
            continue
        records.append(post)

    if not records:
        return pd.DataFrame(columns=base_columns)

    df = pd.DataFrame(records)
    extra_columns = [col for col in df.columns if col not in base_columns]
    # Base columns first, then every other API field in order of appearance.
    return df.reindex(columns=base_columns + extra_columns)


In [None]:
# List of subreddits posts will be collected from
subreddits = ["covidlonghaulers","Longhaulers","LongCovid","LongHaulersRecovery",
             "longcovidsolutions", "LongCovidActivism", "Long_Covid","LongCovidFighters",
             "LongHaulerKids", "covidlonga"]

# Time period and size for scraping the subreddits
days = list_of_days(date(2019,7,20), date(2022,8,17))
size = 1000

# Scraping stops once this many posts have been collected in total
# (the limit applies across all subreddits, not per subreddit)
MAX_TOTAL_POSTS = 50000

# Final dataframe containing all of the collected posts from the different subreddits
subreddit_df = pd.DataFrame()

collected_frames = []   # one frame per scraped (subreddit, day), over all subreddits
total_posts = 0         # running number of collected posts

# Looping through the predefined list of subreddits to scrape and save their posts
for subreddit in subreddits:
    days_local = list(days)
    subreddit_frames = []   # frames collected for the current subreddit only
    while total_posts < MAX_TOTAL_POSTS and days_local:
        # Days are visited in random order, each one at most once per subreddit
        idx = random.randint(0, len(days_local)-1)
        date_start = days_local.pop(idx)
        date_end = next_day(date_start)
        df = scrape_reddit(subreddit, date_start, date_end, size = size)
        subreddit_frames.append(df)
        total_posts += df.shape[0]
        time.sleep(0.5)
        print(subreddit)
        print(total_posts)

    if not subreddit_frames:
        continue

    # Concatenate once per subreddit instead of once per request
    new_posts = pd.concat(subreddit_frames)
    collected_frames.extend(subreddit_frames)
    subreddit_df = pd.concat(collected_frames)

    # saving to the database: only the posts of the current subreddit are
    # inserted, so earlier subreddits are not written again, and the call is
    # skipped when nothing was collected because insert_many rejects an
    # empty list
    if not new_posts.empty:
        db.Reddit_Long_Covid.insert_many(new_posts.to_dict('records'))