## Imports
___

In [1]:
import pandas as pd
import numpy as np
import requests
import time

In [2]:
# Pushshift endpoint for searching Reddit submissions; every request in this
# notebook is sent to this URL.
base_url = 'https://api.pushshift.io/reddit/search/submission'

## Sub 1: r/askscience
___

In [3]:
# Empty placeholder frame so the collection loop's row-count condition has
# something to test; it is replaced by the first page of results.
posts_sci = pd.DataFrame()

In [4]:
# Current Unix time in whole seconds. The collection loop sends this as the
# `before` parameter of its first request.
last = int(time.time())

In [5]:
# Collect at least 1000 r/askscience submissions from Pushshift, 100 per request.
#
# `last` is sent as the `before` parameter; after each page it is set to the
# created_utc of the last row collected so the next request continues from
# there (assumes the API returns posts newest-first -- TODO confirm).
#
# The frame is grown page by page on purpose: if the cell fails part-way, the
# pages already fetched stay in `posts_sci` and re-running the cell resumes
# from `last` instead of starting over.
columns = ['title', 'selftext', 'subreddit', 'created_utc']
max_failures = 5  # consecutive failed requests tolerated before giving up
failures = 0

while posts_sci.shape[0] < 1000:
    params = {
        'subreddit': 'askscience',
        'size': 100,
        'before': last,
    }
    try:
        # requests applies no timeout by default, so a stalled connection
        # would otherwise hang the cell indefinitely.
        res_sci = requests.get(base_url, params=params, timeout=30)
        succeeded = res_sci.status_code == 200
    except requests.RequestException:
        succeeded = False

    if not succeeded:
        print('a problem occured')
        failures += 1
        if failures >= max_failures:
            # Previously a persistent error retried forever with no pause.
            raise RuntimeError(
                f'giving up after {max_failures} consecutive failed requests'
            )
        time.sleep(7)  # back off before retrying instead of hammering the API
        continue
    failures = 0

    data = res_sci.json()['data']
    if not data:
        # No more posts before `last`: selecting columns from an empty frame
        # would raise KeyError, and retrying would never make progress.
        print('no more posts returned; stopping early')
        break

    posts_sci2 = pd.DataFrame(data)[columns]
    if posts_sci.shape[0] == 0:
        # First page: replace the empty placeholder frame.
        posts_sci = posts_sci2
    else:
        posts_sci = pd.concat([posts_sci, posts_sci2], ignore_index=True)

    last = int(posts_sci['created_utc'].values[-1])
    time.sleep(7)  # pause between requests to stay polite to the API

In [6]:
# Preview the frame with fully identical rows removed. The result is only
# displayed, not assigned, so posts_sci itself still contains the duplicates.
posts_sci.drop_duplicates()

Unnamed: 0,title,selftext,subreddit,created_utc
0,Can a nuclear bomb set off another nuke?,[removed],askscience,1646168184
1,"God forbid, Russia decides to go for the n nuc...",[removed],askscience,1646168000
2,Why can the common cold coexist with Covid?,[removed],askscience,1646167864
3,What fuel burns for the longest?,[removed],askscience,1646167761
4,Electromagnetic force,[removed],askscience,1646167434
...,...,...,...,...
995,Why do we have the tendency to speak unfiltere...,[removed],askscience,1645735569
996,In terms of science what good has come out of ...,[removed],askscience,1645735128
997,Is it possible to encode a woman's DNA into an...,[removed],askscience,1645734797
998,Has the perception of taste and/or smell been ...,[removed],askscience,1645734420


In [7]:
# Dimensions of the collected frame, to confirm the row target was reached.
posts_sci.shape

1000

In [8]:
# created_utc of the last row collected (presumably the oldest post fetched,
# if pages arrive newest-first -- TODO confirm).
posts_sci['created_utc'].values[-1]

1645733680

In [9]:
# Column name -> dtype mapping, to check how each field was parsed.
dict(posts_sci.dtypes)

{'title': dtype('O'),
 'selftext': dtype('O'),
 'subreddit': dtype('O'),
 'created_utc': dtype('int64')}

In [10]:
# Count rows whose title repeats an earlier row's title (True) versus rows
# that are the first occurrence of their title (False).
posts_sci.duplicated(subset='title').value_counts()

False    975
True      25
dtype: int64

## Sub 2: r/unpopularopinion
___

In [12]:
# Empty placeholder frame for the second subreddit; the collection loop
# replaces it with the first page of results.
posts_unpop = pd.DataFrame()
# Reset the paging cutoff to the current Unix time (whole seconds) so the
# second collection starts from now rather than where the first one ended.
last = int(time.time())

In [13]:
# Collect at least 1000 r/unpopularopinion submissions from Pushshift, 100 per
# request.
#
# `last` is sent as the `before` parameter; after each page it is set to the
# created_utc of the last row collected so the next request continues from
# there (assumes the API returns posts newest-first -- TODO confirm).
#
# The frame is grown page by page on purpose: if the cell fails part-way, the
# pages already fetched stay in `posts_unpop` and re-running the cell resumes
# from `last` instead of starting over.
columns = ['title', 'selftext', 'subreddit', 'created_utc']
max_failures = 5  # consecutive failed requests tolerated before giving up
failures = 0

while posts_unpop.shape[0] < 1000:
    params = {
        'subreddit': 'unpopularopinion',
        'size': 100,
        'before': last,
    }
    try:
        # requests applies no timeout by default, so a stalled connection
        # would otherwise hang the cell indefinitely.
        res_unpop = requests.get(base_url, params=params, timeout=30)
        succeeded = res_unpop.status_code == 200
    except requests.RequestException:
        succeeded = False

    if not succeeded:
        print('a problem occured')
        failures += 1
        if failures >= max_failures:
            # Previously a persistent error retried forever with no pause.
            raise RuntimeError(
                f'giving up after {max_failures} consecutive failed requests'
            )
        time.sleep(7)  # back off before retrying instead of hammering the API
        continue
    failures = 0

    data = res_unpop.json()['data']
    if not data:
        # No more posts before `last`: selecting columns from an empty frame
        # would raise KeyError, and retrying would never make progress.
        print('no more posts returned; stopping early')
        break

    posts_unpop2 = pd.DataFrame(data)[columns]
    if posts_unpop.shape[0] == 0:
        # First page: replace the empty placeholder frame.
        posts_unpop = posts_unpop2
    else:
        posts_unpop = pd.concat([posts_unpop, posts_unpop2], ignore_index=True)

    last = int(posts_unpop['created_utc'].values[-1])
    time.sleep(7)  # pause between requests to stay polite to the API

In [14]:
# Preview the frame with fully identical rows removed. The result is only
# displayed, not assigned, so posts_unpop itself still contains the duplicates.
posts_unpop.drop_duplicates()

Unnamed: 0,title,selftext,subreddit,created_utc
0,Americans are extremly hypocrite with their vi...,[removed],unpopularopinion,1646168165
1,We give the n-word too much power,[removed],unpopularopinion,1646167946
2,Hairy chests are irresistibly attractive,"Go ahead and pop that second button, then go a...",unpopularopinion,1646167880
3,An Apology.,[removed],unpopularopinion,1646167723
4,It's not right to ban Russian athletes and art...,[removed],unpopularopinion,1646167622
...,...,...,...,...
1093,It’s bizarre and petty to have a day of the we...,It’s fine if you’re least favorite day is the ...,unpopularopinion,1646070677
1094,Russia's invasion of Ukraine isn't any worse t...,[removed],unpopularopinion,1646070461
1095,Porn is not a bad way of being introduced to sex,"First of all, I think people underestimate jus...",unpopularopinion,1646070383
1096,I'm not pro-war.,[removed],unpopularopinion,1646070328


In [17]:
# Count rows whose title repeats an earlier row's title (True) versus rows
# that are the first occurrence of their title (False).
posts_unpop.duplicated(subset='title').value_counts()

False    1061
True       37
dtype: int64

In [15]:
# Display the collected frame as-is; pandas abbreviates the rendering of long
# frames to the first and last few rows.
posts_unpop

Unnamed: 0,title,selftext,subreddit,created_utc
0,Americans are extremly hypocrite with their vi...,[removed],unpopularopinion,1646168165
1,We give the n-word too much power,[removed],unpopularopinion,1646167946
2,Hairy chests are irresistibly attractive,"Go ahead and pop that second button, then go a...",unpopularopinion,1646167880
3,An Apology.,[removed],unpopularopinion,1646167723
4,It's not right to ban Russian athletes and art...,[removed],unpopularopinion,1646167622
...,...,...,...,...
1093,It’s bizarre and petty to have a day of the we...,It’s fine if you’re least favorite day is the ...,unpopularopinion,1646070677
1094,Russia's invasion of Ukraine isn't any worse t...,[removed],unpopularopinion,1646070461
1095,Porn is not a bad way of being introduced to sex,"First of all, I think people underestimate jus...",unpopularopinion,1646070383
1096,I'm not pro-war.,[removed],unpopularopinion,1646070328


In [22]:
# Stack the two subreddit frames into one. ignore_index=True gives the combined
# frame a fresh 0..n-1 index; without it the row labels of both inputs are kept
# and repeat (0, 1, 2, ... once per subreddit), which makes label-based lookups
# such as posts_df.loc[0] return two rows.
posts_df = pd.concat([posts_sci, posts_unpop], ignore_index=True)

In [23]:
# Dimensions of the combined frame (rows from both subreddits, 4 columns).
posts_df.shape

(2098, 4)

In [27]:
# Shape after removing fully identical rows, to see how many would be dropped.
# Nothing is assigned here; posts_df is unchanged by this cell.
posts_df.drop_duplicates().shape

(2095, 4)

In [28]:
# Keep only the first occurrence of each fully identical row (all four columns
# equal). Rows that share a title but differ elsewhere are retained.
posts_df = posts_df.drop_duplicates()

### Export to CSV:

In [33]:
# Write the combined posts to CSV without the index column. The Datasets/
# directory must already exist relative to the working directory; this call
# does not create it.
posts_df.to_csv('Datasets/reddit_posts_scrape.csv', index=False)