In [26]:
import requests
import pandas as pd
from tqdm.notebook import tqdm
from pprint import pprint
import time
import os
import datetime

In [27]:
def download_posts(subreddit,post_file,iterations = 25,retry_wait = 60):
    """Download posts from a subreddit in batches, checkpointing them to a csv.

    Each iteration asks the Pushshift submission-search endpoint for up to 100
    posts created before the ``created_utc`` of the last row collected so far
    (this walks backwards in time, assuming the API returns newest posts
    first -- TODO confirm). If ``post_file`` already exists and has rows, the
    download resumes from the ``created_utc`` of its last row; otherwise it
    starts from the current time.

    subreddit  -- the subreddit to search
    post_file  -- the filepath to store/append data to
    iterations -- maximum number of requests to make (up to 100 posts each);
                  a failed request still uses up one iteration
    retry_wait -- seconds to pause after a failed request before moving on

    Returns None. The collected posts are written to ``post_file`` after every
    successful batch; the file is left untouched when a request fails.
    """

    url = 'https://api.pushshift.io/reddit/search/submission'

    df = pd.DataFrame()
    before = int(time.time()) #default starting point: the current time
    if os.path.exists(post_file): #resume from existing postfile
        try:
            df = pd.read_csv(post_file)
        except pd.errors.EmptyDataError:
            pass #a blank file holds nothing to resume from
        if not df.empty:
            before = df['created_utc'].iloc[-1]

    params = {
        'subreddit': subreddit,
        'size': 100,
        'before': before,
    }

    for _ in tqdm(range(iterations)):
        try:
            #timeout keeps a stalled connection from hanging the loop forever
            res = requests.get(url, params=params, timeout=30)
            res.raise_for_status()
            data = res.json()['data']
        except (requests.RequestException, ValueError, KeyError) as err:
            #network failure, HTTP error status, non-JSON body or missing 'data'.
            #Only these are retried, so KeyboardInterrupt and programming
            #errors surface instead of being silently slept through.
            print(f'Request failed ({err!r}); waiting {retry_wait}s')
            time.sleep(retry_wait)
            continue

        if len(data)==0:
            print('End of subreddit')
            break

        batch = pd.DataFrame(data)
        #DataFrame.append was removed in pandas 2.0; concat aligns columns the same way
        df = batch if df.empty else pd.concat([df, batch], ignore_index=True)
        params['before'] = df['created_utc'].iloc[-1]

        out_dir = os.path.dirname(post_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        df.to_csv(post_file,index=False) #checkpoint after every successful batch




In [28]:
# Run a single download iteration for r/ChicksWithGuns into its csv.
subreddit = 'ChicksWithGuns'
postfile = f'Subreddit Posts/{subreddit}.csv'
download_posts(subreddit, post_file=postfile, iterations=1)

  0%|          | 0/1 [00:00<?, ?it/s]

End of subreddit


In [35]:
# Continue the r/ChicksWithGuns download for up to 200 more iterations.
subreddit = 'ChicksWithGuns'
postfile = f'Subreddit Posts/{subreddit}.csv'
download_posts(subreddit, post_file=postfile, iterations=200)

  0%|          | 0/200 [00:00<?, ?it/s]

End of subreddit


In [29]:
#may need to check for duplicate images

In [None]:
## Checking the endings of URLs for picture file types

In [30]:
# Distinct trailing tokens of the post URLs: keep the last '/'-separated
# segment, then whatever follows its final '.' (the whole segment when it
# contains no dot).
(
    pd.read_csv('Subreddit Posts/ChicksWithGuns.csv')['url']
    .str.split('/')
    .apply(lambda segments: segments[-1])
    .str.split('.')
    .apply(lambda pieces: pieces[-1])
    .unique()
)

array(['jpg', 'ugdxmr', 'hf0al127njv81', 'uag7r4', 'ua5xu6', 'u89h9s',
       'u81o3v', 'u6ashl', 'u59e13', 'u50e5p', 'u4yfu9', 'tzbcht',
       'tyl9cg', 'twu9a3', 'twlszs', 'tw9kjm', 'tvey9e', 'tvatlq',
       'tv6g9o', 'utilize-existing-vehicle-organizations', 'trnfqi',
       'tpbnjy', 'tp89jm', 'tnfjxq', 'tne0kd', 'GuwSTaO', 'pUmZVM2',
       'thvo3w', 'thkv4q', 'tfls2i', 'tcgsxd', 'tc8a0s', 'tbpxq9', 'png',
       'tazini', 'tafrkl', 't6wf33', 't6u4o6', 'eL7ywAn', 't584sa',
       't58339', 't4nrsq', 'NxxfGzn', 'sz03x4', 'sychsw', 'sycgvh',
       'sxnjdj', 'sxkdht', 'b1i9cmh', 'sw6w7e', 'sw6upv', 'xsxkM2z',
       'sv46ni', 'sv3m6g', 'aBtNXRV', 'sumka7', 'Tctqfqf', '', 'pZW4LnS',
       'ssuldv', 'sryvc5', 'sr1uz7', '4TLb9s9', 'slg29u', 'skfmno',
       'sg08j8', 'seqckw', 'scle8s', 'sblh10', 'say669', 's5ovma',
       's5i4ot', 's3ffd8', 's3fazs', 's3e7wy', 's3e20g', 's1wq3a',
       's1iq4l', 's10f3s', 'rutueq', 'rutsmp', 'rtobu0', 'rs2tlr',
       'rs058i', 'rq3p0x', 'rpevsm'

In [33]:
# Download r/Idiotswithguns posts into its csv (up to 200 iterations).
subreddit = 'Idiotswithguns'
postfile = f'Subreddit Posts/{subreddit}.csv'
download_posts(subreddit, post_file=postfile, iterations=200)

  0%|          | 0/200 [00:00<?, ?it/s]

End of subreddit


In [36]:
# Continue the r/Idiotswithguns download for up to 200 more iterations.
subreddit = 'Idiotswithguns'
postfile = f'Subreddit Posts/{subreddit}.csv'
download_posts(subreddit, post_file=postfile, iterations=200)

  0%|          | 0/200 [00:00<?, ?it/s]

End of subreddit


In [34]:
# Download r/TheWayWeWere posts into its csv (up to 200 iterations).
subreddit = 'TheWayWeWere'
postfile = f'Subreddit Posts/{subreddit}.csv'
download_posts(subreddit, post_file=postfile, iterations=200)

  0%|          | 0/200 [00:00<?, ?it/s]

In [38]:
# Download r/portraits posts into its csv (up to 300 iterations).
subreddit = 'portraits'
postfile = f'Subreddit Posts/{subreddit}.csv'
download_posts(subreddit, post_file=postfile, iterations=300)

  download_posts(subreddit,postfile, 300)


  0%|          | 0/300 [00:00<?, ?it/s]

End of subreddit
