In [1]:
import pandas as pd
import numpy as np
import requests
import json
import matplotlib.pyplot as plt
import seaborn as sns
import time
from datetime import datetime, timezone, timedelta

In [2]:
def get_recent_posts(base_time, num_posts, avoid_distinguished=True, attached=None):
    if attached:
        posts = attached
    else:
        posts = []
    after = None
    params = {'t': 'timestamp'}
    
    # Chuyển đổi thời gian cơ sở thành Unix timestamp
    base_timestamp = int(base_time.timestamp())
    params['before'] = base_timestamp
    
    while len(posts) < num_posts:
        if after is not None:
            params['after'] = after
        res = requests.get('https://reddit.com/.json', params=params, headers={'User-agent': 'Dodge Bot 0.1'})
        if res.status_code != 200:
            print('API request failed.')
            return None
        the_json = res.json()
        
        if avoid_distinguished:
            page = [child for child in the_json['data']['children'] if not child['data']['stickied'] 
                    and not child['data']['archived'] 
                    and not child['data']['distinguished']
                    and not child['data']['domain'].startswith('i.redd.it') 
                    and not child['data']['domain'].startswith('v.redd.it') 
                    and not child['data']['url'].startswith('https://www.reddit.com/gallery')]
                    
        else:
            page = [child for child in the_json['data']['children'] if not child['data']['domain'].startswith('i.redd.it')
                     and not child['data']['domain'].startswith('v.redd.it') 
                     and not child['data']['url'].startswith('https://www.reddit.com/gallery')]
        
        posts.extend(page)
        after = the_json['data']['after']
        time.sleep(1)
        if len(posts) >= num_posts:
            break
        if after is None:
            break
    return posts[:num_posts]


In [3]:
def get_posts_by_day(base_time, num_days, avoid_distinguished=True, max_posts_per_day=None):
    all_posts = []

    for i in range(num_days):
        current_day = base_time - timedelta(days=i)
        base_time_day_start = datetime(current_day.year, current_day.month, current_day.day, tzinfo=timezone.utc)
        base_time_day_end = base_time_day_start + timedelta(days=1)

        posts = get_recent_posts(base_time_day_end, num_posts=max_posts_per_day, avoid_distinguished=avoid_distinguished)

        all_posts.extend(posts)

    return all_posts

base_time = datetime.now(timezone.utc)
num_days = 3  # Lấy trong 3 ngày gần đây, bạn có thể thay đổi số ngày tùy ý

all_posts = get_posts_by_day(base_time, num_days, max_posts_per_day=50000)

print(len(all_posts))


3177


In [4]:
def get_subreddit_subscribers(subreddit):
    url = f'https://www.reddit.com/r/{subreddit}/about.json'
    headers = {'User-agent': 'Dodge Bot 0.1'}  
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        data = response.json()
        subscribers = data['data']['subscribers']
        return subscribers
    else:
        print(f"Failed to retrieve data for subreddit {subreddit}.")
        return None

In [5]:
subreddit_names = []
for post in all_posts:
    subreddit_name = post['data']['subreddit_name_prefixed'].split('r/')[-1]
    subreddit_names.append(subreddit_name)

subscribers = []
for subreddit_name in subreddit_names:
    subscribers_count = get_subreddit_subscribers(subreddit_name)
    subscribers.append(subscribers_count)




In [6]:
data = []

for post, subscriber_count in zip(all_posts, subscribers):
    post_data = {
        'created_utc': post['data']['created_utc'],
        'title': post['data']['title'],
        'selftext': post['data']['selftext'],
        'subreddit': post['data']['subreddit'],
        'num_comments': post['data']['num_comments'],
        'domain': post['data']['domain'],
        'ups': post['data']['ups'],
        'url': post['data']['url'],
        'base_time': base_time,
        'subscribers': subscriber_count
    }
    data.append(post_data)

df = pd.DataFrame(data)

df

Unnamed: 0,created_utc,title,selftext,subreddit,num_comments,domain,ups,url,base_time,subscribers
0,1.715380e+09,"Cancer Survivors of Reddit, what was the sympt...",,AskReddit,3765,self.AskReddit,5885,https://www.reddit.com/r/AskReddit/comments/1c...,2024-05-11 11:44:09.704697+00:00,46297399
1,1.715358e+09,What did you not appreciate until you had it?,"You've probably heard the saying, ""You don't a...",ask,3502,self.ask,3137,https://www.reddit.com/r/ask/comments/1cotf3w/...,2024-05-11 11:44:09.704697+00:00,868813
2,1.715394e+09,Saved/dreamed my whole life of buying a brand ...,GM of gwatney chevrolet in Arkansas took my c8...,mildlyinfuriating,2505,self.mildlyinfuriating,34106,https://www.reddit.com/r/mildlyinfuriating/com...,2024-05-11 11:44:09.704697+00:00,7533389
3,1.715362e+09,Sony just banned Ghost of Tsushima from being ...,You thought it was just helldivers eh?\n\nnon-...,gaming,2428,self.gaming,13567,https://www.reddit.com/r/gaming/comments/1couw...,2024-05-11 11:44:09.704697+00:00,40982380
4,1.715358e+09,Bumble founder says your dating 'AI concierge'...,,technology,2195,fortune.com,8857,https://fortune.com/2024/05/10/bumbles-whitney...,2024-05-11 11:44:09.704697+00:00,16272171
...,...,...,...,...,...,...,...,...,...,...
3172,1.715408e+09,thank you kdot for doing this on a friday night,no sleep tonight. actually i am going to bed. ...,KendrickLamar,13,self.KendrickLamar,96,https://www.reddit.com/r/KendrickLamar/comment...,2024-05-11 11:44:09.704697+00:00,719260
3173,1.715411e+09,Drake suing Kendrick is part of the plan,So for those that need context drake is suppos...,KendrickLamar,24,self.KendrickLamar,82,https://www.reddit.com/r/KendrickLamar/comment...,2024-05-11 11:44:09.704697+00:00,719259
3174,1.715409e+09,I don’t want dot to get into beef again,my heart can’t take it dog I can’t afford to l...,KendrickLamar,14,self.KendrickLamar,95,https://www.reddit.com/r/KendrickLamar/comment...,2024-05-11 11:44:09.704697+00:00,719260
3175,1.715411e+09,(Uk Person here) waking up this shit is a fuck...,All of this is fucking crazy,KendrickLamar,32,self.KendrickLamar,82,https://www.reddit.com/r/KendrickLamar/comment...,2024-05-11 11:44:09.704697+00:00,719261


In [8]:
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client['data_mining']  # Tên cơ sở dữ liệu
collection = db['reddit_data']  # Tên collection

# Chuyển DataFrame thành dạng dictionary (nếu cần)
records = df.to_dict(orient='records')

# Chèn dữ liệu vào collection
collection.insert_many(records)

InsertManyResult([ObjectId('663f6d9ad68fbb9ba8203b78'), ObjectId('663f6d9ad68fbb9ba8203b79'), ObjectId('663f6d9ad68fbb9ba8203b7a'), ObjectId('663f6d9ad68fbb9ba8203b7b'), ObjectId('663f6d9ad68fbb9ba8203b7c'), ObjectId('663f6d9ad68fbb9ba8203b7d'), ObjectId('663f6d9ad68fbb9ba8203b7e'), ObjectId('663f6d9ad68fbb9ba8203b7f'), ObjectId('663f6d9ad68fbb9ba8203b80'), ObjectId('663f6d9ad68fbb9ba8203b81'), ObjectId('663f6d9ad68fbb9ba8203b82'), ObjectId('663f6d9ad68fbb9ba8203b83'), ObjectId('663f6d9ad68fbb9ba8203b84'), ObjectId('663f6d9ad68fbb9ba8203b85'), ObjectId('663f6d9ad68fbb9ba8203b86'), ObjectId('663f6d9ad68fbb9ba8203b87'), ObjectId('663f6d9ad68fbb9ba8203b88'), ObjectId('663f6d9ad68fbb9ba8203b89'), ObjectId('663f6d9ad68fbb9ba8203b8a'), ObjectId('663f6d9ad68fbb9ba8203b8b'), ObjectId('663f6d9ad68fbb9ba8203b8c'), ObjectId('663f6d9ad68fbb9ba8203b8d'), ObjectId('663f6d9ad68fbb9ba8203b8e'), ObjectId('663f6d9ad68fbb9ba8203b8f'), ObjectId('663f6d9ad68fbb9ba8203b90'), ObjectId('663f6d9ad68fbb9ba8203b