# Getting Reddit Data

There are two options for extracting data from Reddit:

* The `requests` library, which will allow us to interface directly with the Reddit API.

* The PRAW library, which is a wrapper library that adds an extra layer of abstraction in accessing the Reddit API.

Here we will cover the first option, using the `requests` library to interface directly with the API.

The final extraction script will look like this:

In [14]:
personal_use_script = "QspGOI_RriHL41-UVHstyQ"
secret = "B3c0Ci61lHdHcv2hogNswxpCn4p31g"

In [15]:
import requests
import pandas as pd


class Reddit:
    def __init__(self, client_id, secret_token, username, password):
        # first create authentication object
        auth = requests.auth.HTTPBasicAuth(client_id, secret_token)
        # build login dictionary
        login = {'grant_type': 'password',
                 'username': username,
                 'password': password}
        # setup header info (incl description of API)
        headers = {'User-Agent': 'MyBot/0.0.1'}
        # send request for OAuth token
        res = requests.post(f'https://www.reddit.com/api/v1/access_token',
                            auth=auth, data=login, headers=headers)
        # pull auth bearer token from response
        print(res.json())
        token = res.json()['access_token']
        # add authorization to headers dictionary
        headers['Authorization'] = f'bearer {token}'
        # add headers dict to internal attributes
        self.headers = headers
        # and api
        self.api = 'https://oauth.reddit.com'

    def get_new(self, subreddit, iters):
        # initialize dataframe to store data
        df = pd.DataFrame()
        # initialize parameters dictionary
        params = {'limit': 100}
        # iterate through several times to make sure we get all the data available
        for i in range(iters):
            # make request
            res = requests.get(f'{self.api}/r/{subreddit}/new',
                               headers=self.headers,
                               params=params)
            # check that we returned something (if not we reached end)
            if len(res.json()['data']['children']) == 0:
                print('No more found')
                return df
            # iterate through each thread recieved
            for thread in res.json()['data']['children']:
                # add info to dataframe
                df = df.append({
                    'id': thread['data']['name'],
                    'created_utc': int(thread['data']['created_utc']),
                    'subreddit': thread['data']['subreddit'],
                    'title': thread['data']['title'],
                    'selftext': thread['data']['selftext'],
                    'upvote_ratio': thread['data']['upvote_ratio'],
                    'ups': thread['data']['ups'],
                    'downs': thread['data']['downs'],
                    'score': thread['data']['score']
                }, ignore_index=True)
            # get earliest ID
            earliest = df['id'].iloc[len(df)-1]
            # add earliest ID to params
            params['after'] = earliest
        return df

In [16]:
SUB = 'investing'

In [17]:
CLIENT_ID = personal_use_script
SECRET_TOKEN = secret

In [22]:
USER = 'Swedgetarian'
with open("ww.txt", "r") as fp:
    PWD = fp.read().rstrip("\n")

In [23]:
reddit = Reddit(CLIENT_ID, SECRET_TOKEN, USER, PWD)

{'access_token': '1905573387406-sbCthkJPZRhwSze1Pdm0q9Syf7PhfQ', 'token_type': 'bearer', 'expires_in': 86400, 'scope': '*'}


In [24]:
data = reddit.get_new(SUB, 20)

No more found


In [26]:
data = data.replace({'|': ''}, regex=True)

In [27]:
data.to_csv(f'./data/reddit_{SUB}.csv', sep='|', index=False)