In [5]:
import pandas as pd
import numpy as np
import requests
import json
import matplotlib.pyplot as plt
import seaborn as sns
import time
from datetime import datetime, timezone, timedelta

In [6]:
def get_recent_posts(base_time, num_posts, avoid_distinguished=True, attached=None):
    if attached:
        posts = attached
    else:
        posts = []
    after = None
    params = {'t': 'timestamp'}
    
    # Chuyển đổi thời gian cơ sở thành Unix timestamp
    base_timestamp = int(base_time.timestamp())
    params['before'] = base_timestamp
    
    while len(posts) < num_posts:
        if after is not None:
            params['after'] = after
        res = requests.get('https://reddit.com/.json', params=params, headers={'User-agent': 'Dodge Bot 0.1'})
        if res.status_code != 200:
            print('API request failed.')
            return None
        the_json = res.json()
        
        if avoid_distinguished:
            page = [child for child in the_json['data']['children'] if not child['data']['stickied'] 
                    and not child['data']['archived'] 
                    and not child['data']['distinguished']
                    and not child['data']['domain'].startswith('i.redd.it') 
                    and not child['data']['domain'].startswith('v.redd.it') 
                    and not child['data']['url'].startswith('https://www.reddit.com/gallery')]
                    
        else:
            page = [child for child in the_json['data']['children'] if not child['data']['domain'].startswith('i.redd.it')
                     and not child['data']['domain'].startswith('v.redd.it') 
                     and not child['data']['url'].startswith('https://www.reddit.com/gallery')]
        
        posts.extend(page)
        after = the_json['data']['after']
        time.sleep(1)
        if len(posts) >= num_posts:
            break
        if after is None:
            break
    return posts[:num_posts]


In [7]:
def get_posts_by_day(base_time, num_days, avoid_distinguished=True, max_posts_per_day=None):
    all_posts = []

    for i in range(num_days):
        current_day = base_time - timedelta(days=i)
        base_time_day_start = datetime(current_day.year, current_day.month, current_day.day, tzinfo=timezone.utc)
        base_time_day_end = base_time_day_start + timedelta(days=1)

        posts = get_recent_posts(base_time_day_end, num_posts=max_posts_per_day, avoid_distinguished=avoid_distinguished)

        all_posts.extend(posts)

    return all_posts

base_time = datetime.now(timezone.utc)
num_days = 10

all_posts = get_posts_by_day(base_time, num_days, max_posts_per_day=50000)

print(len(all_posts))


11307


In [8]:
def get_subreddit_subscribers(subreddit):
    url = f'https://www.reddit.com/r/{subreddit}/about.json'
    headers = {'User-agent': 'Dodge Bot 0.1'}  
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        data = response.json()
        subscribers = data['data']['subscribers']
        return subscribers
    else:
        print(f"Failed to retrieve data for subreddit {subreddit}.")
        return None

In [9]:
subreddit_names = []
for post in all_posts:
    subreddit_name = post['data']['subreddit_name_prefixed'].split('r/')[-1]
    subreddit_names.append(subreddit_name)

subscribers = []
for subreddit_name in subreddit_names:
    subscribers_count = get_subreddit_subscribers(subreddit_name)
    subscribers.append(subscribers_count)


In [10]:
data = []

for post, subscriber_count in zip(all_posts, subscribers):
    post_data = {
        'created_utc': post['data']['created_utc'],
        'title': post['data']['title'],
        'selftext': post['data']['selftext'],
        'subreddit': post['data']['subreddit'],
        'num_comments': post['data']['num_comments'],
        'domain': post['data']['domain'],
        'ups': post['data']['ups'],
        'url': post['data']['url'],
        'base_time': base_time,
        'subscribers': subscriber_count
    }
    data.append(post_data)

df = pd.DataFrame(data)

df

Unnamed: 0,created_utc,title,selftext,subreddit,num_comments,domain,ups,url,base_time,subscribers
0,1.715423e+09,Official EBU Press Release: Joost Klein will n...,[https://eurovision.tv/mediacentre/release/sta...,eurovision,12594,self.eurovision,4560,https://www.reddit.com/r/eurovision/comments/1...,2024-05-11 16:55:28.100461+00:00,224296
1,1.715432e+09,Update: AITAH for wanting to leave my wife bec...,I made a post 3 months ago but it was removed ...,AITAH,4361,self.AITAH,1142,https://www.reddit.com/r/AITAH/comments/1cpgzs...,2024-05-11 16:55:28.100461+00:00,1610478
2,1.715381e+09,What have you seen inside someone’s home that ...,,AskReddit,3056,self.AskReddit,10303,https://www.reddit.com/r/AskReddit/comments/1c...,2024-05-11 16:55:28.100461+00:00,46301468
3,1.715411e+09,Fnatic vs. Team Liquid / MSI 2024 - Lower Brac...,###MSI 2024 \n[Official page](...,leagueoflegends,2492,self.leagueoflegends,3850,https://www.reddit.com/r/leagueoflegends/comme...,2024-05-11 16:55:28.100461+00:00,7270506
4,1.715415e+09,Eurovision thrown into ‘unprecedented’ chaos a...,,europe,2079,news.com.au,7101,https://www.news.com.au/entertainment/tv/eurov...,2024-05-11 16:55:28.100461+00:00,6418573
...,...,...,...,...,...,...,...,...,...,...
11302,1.715440e+09,There was never a real chance that all of that...,Bare minimum the prescriptions had to be real....,KendrickLamar,15,self.KendrickLamar,99,https://www.reddit.com/r/KendrickLamar/comment...,2024-05-11 16:55:28.100461+00:00,723038
11303,1.715445e+09,Jesus Christ if you don’t know what’s going on...,Stop asking “HEY CAN SOME TELL ME WHATS HAPPEN...,KendrickLamar,10,self.KendrickLamar,76,https://www.reddit.com/r/KendrickLamar/comment...,2024-05-11 16:55:28.100461+00:00,723038
11304,1.715443e+09,I’ve never seen so many people care about dome...,The “but KDot a woman beater” line I keep seei...,KendrickLamar,31,self.KendrickLamar,76,https://www.reddit.com/r/KendrickLamar/comment...,2024-05-11 16:55:28.100461+00:00,723037
11305,1.715412e+09,Mods need to pin a main post with dedicated up...,Lots of people are gonna get on throughout the...,KendrickLamar,26,self.KendrickLamar,345,https://www.reddit.com/r/KendrickLamar/comment...,2024-05-11 16:55:28.100461+00:00,723038


In [15]:
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client['data_mining']  
collection = db['reddit_data']  

records = df.to_dict(orient='records')

collection.insert_many(records)

InsertManyResult([ObjectId('6640827a9f4a31d5698cce68'), ObjectId('6640827a9f4a31d5698cce69'), ObjectId('6640827a9f4a31d5698cce6a'), ObjectId('6640827a9f4a31d5698cce6b'), ObjectId('6640827a9f4a31d5698cce6c'), ObjectId('6640827a9f4a31d5698cce6d'), ObjectId('6640827a9f4a31d5698cce6e'), ObjectId('6640827a9f4a31d5698cce6f'), ObjectId('6640827a9f4a31d5698cce70'), ObjectId('6640827a9f4a31d5698cce71'), ObjectId('6640827a9f4a31d5698cce72'), ObjectId('6640827a9f4a31d5698cce73'), ObjectId('6640827a9f4a31d5698cce74'), ObjectId('6640827a9f4a31d5698cce75'), ObjectId('6640827a9f4a31d5698cce76'), ObjectId('6640827a9f4a31d5698cce77'), ObjectId('6640827a9f4a31d5698cce78'), ObjectId('6640827a9f4a31d5698cce79'), ObjectId('6640827a9f4a31d5698cce7a'), ObjectId('6640827a9f4a31d5698cce7b'), ObjectId('6640827a9f4a31d5698cce7c'), ObjectId('6640827a9f4a31d5698cce7d'), ObjectId('6640827a9f4a31d5698cce7e'), ObjectId('6640827a9f4a31d5698cce7f'), ObjectId('6640827a9f4a31d5698cce80'), ObjectId('6640827a9f4a31d5698cce