# Reddit API

Reddit makes its APIs available for public use. They provide data on query activity that tells us which topics people are interested in getting information on. The focus of this project is to build a recommendation engine that can segment subreddits into different groups using Topic Modeling; therefore we perform convenience sampling to retrieve the information we want, since it's inexpensive and easy. 

In [1]:
import requests
import pandas as pd
from datetime import datetime
import csv

### Authentication

In [2]:
def authentication(client_id, secret_key):
    """Wrap the application credentials in an HTTP Basic auth object.

    Args:
        client_id: Client ID of the Reddit application.
        secret_key: Secret key of the Reddit application.

    Returns:
        A ``requests.auth.HTTPBasicAuth`` built from the two values.
    """
    return requests.auth.HTTPBasicAuth(client_id, secret_key)

def authRequest(auth):
    """Obtain an OAuth token and return the headers for authorised requests.

    The account name is read from ``username.txt`` and the password from
    ``pwd.txt`` in the working directory. Both are sent with the
    ``password`` grant type to Reddit's ``access_token`` endpoint.

    Args:
        auth: Value forwarded as ``auth=`` to ``requests.post``.

    Returns:
        dict: Headers with ``User-Agent`` and a ``bearer`` ``Authorization``
        entry carrying the returned token.

    Raises:
        OSError: If either credential file cannot be opened.
        requests.HTTPError: If the token endpoint answers with an error status.
        KeyError: If the response body has no ``access_token`` field.
    """
    # Create a file of username.txt and pwd.txt
    # Editors usually end a file with a newline; strip it so it is not sent
    # as part of the credential.
    with open('username.txt', 'r') as f:
        username = f.read().strip()
    with open('pwd.txt', 'r') as f:
        pwd = f.read().strip()

    data = {
        'grant_type': 'password',
        'username': username,
        'password': pwd,
    }

    headers = {'User-Agent': 'MyAPI/0.0.1'}

    res = requests.post('https://www.reddit.com/api/v1/access_token', auth=auth, data=data, headers=headers)
    # Surface HTTP-level failures here instead of as an opaque KeyError below.
    res.raise_for_status()
    token = res.json()['access_token']
    headers['Authorization'] = f'bearer {token}'

    return headers

In [3]:
# Placeholders: replace with the credentials of your own Reddit application.
client_id = '<YOUR CLIENT ID>'
secret_key = '<YOUR SECRET KEY>'

# API request in JSON form
auth = authentication(client_id, secret_key)
# `headers` stays at module level: the fetch functions below read it as a global.
headers = authRequest(auth)

### Submissions
Collect the 100 most popular subreddits and get 200 submissions from the <b>past year</b> for each subreddit, as the latest submissions don't have sufficient comments.

In [4]:
def df_from_subreddits_response(res):
    """Collect the ``name`` field of every child in a listing response.

    Args:
        res: Response-like object whose ``json()`` returns a dict shaped as
            ``{'data': {'children': [{'data': {'name': ...}}, ...]}}``.

    Returns:
        pandas.DataFrame: One ``name`` column with one row per child, in
        response order. An empty listing gives a zero-row frame that still
        has the ``name`` column.

    Raises:
        KeyError: If the payload lacks ``data``/``children`` or a child
            lacks ``data``/``name``.
    """
    children = res.json()['data']['children']
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call; build the column once instead.
    return pd.DataFrame({'name': [child['data']['name'] for child in children]})

def df_from_listings_response(res):
    """Collect the ``name`` field of every post in a listing response.

    Args:
        res: Response-like object whose ``json()`` returns a dict shaped as
            ``{'data': {'children': [{'data': {'name': ...}}, ...]}}``.

    Returns:
        pandas.DataFrame: One ``name`` column with one row per post, in
        response order. An empty listing gives a zero-row frame that still
        has the ``name`` column.

    Raises:
        KeyError: If the payload lacks ``data``/``children`` or a post
            lacks ``data``/``name``.
    """
    posts = res.json()['data']['children']
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call; build the column once instead.
    return pd.DataFrame({'name': [post['data']['name'] for post in posts]})


def get_controversial_past_year_submissions(subreddit, n):
    """Fetch a subreddit's controversial posts of the past year into the CSV.

    Pages through ``<subreddit>/controversial`` with ``t=year`` and appends
    every page to ``submissions.csv`` via ``append_to_csv``. Requests are
    authorised with the module-level ``headers``.

    Args:
        subreddit: Path fragment placed after ``https://oauth.reddit.com/``
            (for example ``r/python``).
        n: Desired number of submissions. One page of up to 100 items is
            requested per full hundred, and at least one page is always
            requested.

    Returns:
        list: The decoded JSON listing of every page fetched, in order.

    Raises:
        Exception: If any request returns a status other than 200.
    """
    response_list = []
    # Reddit listings return at most 100 items per request.
    params = {'limit': '100', 't': 'year'}
    pages = max(1, n // 100)

    for _ in range(pages):
        response = requests.get(
            "https://oauth.reddit.com/" + subreddit + "/controversial",
            headers=headers,
            params=params,
        )

        if response.status_code != 200:
            print(response.status_code)
            raise Exception(
                "Unsuccessful Trial: request returned an error: {} {}".format(
                    response.status_code, response.text
                )
            )

        listing = response.json()
        append_to_csv(listing, 'submissions.csv')
        response_list.append(listing)

        # Continue from the listing's own cursor. Using the first item of the
        # page as the cursor would make the next page overlap this one.
        after = listing['data'].get('after')
        if not after:
            # No cursor means there are no further pages.
            break
        params['after'] = after

    return response_list

def get_popular_subreddits(n):
    """Return the prefixed display names of popular subreddits.

    Pages through ``/subreddits/popular`` using the module-level ``headers``.

    Args:
        n: Desired number of subreddits. One page of up to 100 items is
            requested per full hundred, and at least one page is always
            requested.

    Returns:
        list: The ``display_name_prefixed`` value of every subreddit
        returned, in response order.

    Raises:
        Exception: If any request returns a status other than 200.
    """
    response_list = []
    # Reddit listings return at most 100 items per request.
    params = {'limit': '100'}
    pages = max(1, n // 100)

    for _ in range(pages):
        response = requests.get(
            "https://oauth.reddit.com/subreddits/popular",
            headers=headers,
            params=params,
        )

        if response.status_code != 200:
            print(response.status_code)
            raise Exception(
                "Unsuccessful Trial: request returned an error: {} {}".format(
                    response.status_code, response.text
                )
            )

        listing = response.json()
        for item in listing['data']['children']:
            data = item['data']
            # Entries without the field are skipped rather than recorded as None.
            if 'display_name_prefixed' in data:
                response_list.append(data['display_name_prefixed'])

        # Continue from the listing's own cursor. Using the first item of the
        # page as the cursor would make the next page overlap this one.
        after = listing['data'].get('after')
        if not after:
            # No cursor means there are no further pages.
            break
        params['after'] = after

    return response_list

def write_column_names(fileName):
    """Append the submissions header row to a CSV file.

    The file is opened in append mode, so calling this on a file that
    already has content adds another header line after it.

    Args:
        fileName: Path of the CSV file to write to (created if missing).
    """
    # Create headers for the data you want to save, in this example, we only want save these columns in our dataset
    column_names = ['id', 'kind', 'category', 'created_utc', 'author', 'name', 'subreddit_id',
                    'subreddit_subscriber', 'subreddit', 'title', 'selftext', 'upvote_ratio', 'url', 'num_comments',
                    'ups', 'downs', 'total_awards_received', 'score', 'created', 'num_crossposts']

    # The context manager closes the handle even if the write fails.
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csv.writer(csvFile).writerow(column_names)
    
def append_to_csv(json_res, fileName):
    """Append one CSV row per post of a listing payload.

    Args:
        json_res: Decoded listing shaped as
            ``{'data': {'children': [{'kind': ..., 'data': {...}}, ...]}}``.
        fileName: Path of the CSV file to append to (created if missing).

    Raises:
        KeyError: If a post lacks ``kind``, ``data`` or one of the exported
            fields. Rows written before the failure stay in the file.
    """
    # Fields read from each post's ``data`` dict, in column order. ``kind``
    # is read from the wrapper object and is inserted after ``id``.
    fields_after_kind = (
        'category', 'created_utc', 'author', 'name', 'subreddit_id',
        'subreddit_subscribers', 'subreddit', 'title', 'selftext',
        'upvote_ratio', 'url', 'num_comments', 'ups', 'downs',
        'total_awards_received', 'score', 'created', 'num_crossposts',
    )

    # The context manager closes the handle even when a post is missing a
    # field and the loop aborts with KeyError.
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        print("Appending to csv ... ")

        for post in json_res['data']['children']:
            current_sr_post = post['data']

            # Assemble all data in a list
            result = [current_sr_post['id'], post['kind']]
            result.extend(current_sr_post[field] for field in fields_after_kind)

            # Append the result to the CSV file
            csvWriter.writerow(result)

In [None]:
# Write the header row first, then look up the popular subreddits and
# request 200 controversial submissions for each of them.
write_column_names('submissions.csv')
subreddits = get_popular_subreddits(100)
submissions = [get_controversial_past_year_submissions(sr, 200) for sr in subreddits]

### Comments
Retrieving comments is relatively difficult using the Reddit API, as it limits data requests to a maximum of 10000; using a third-party API such as Pushshift eases comment extraction and produces a larger volume of comments for a particular post/submission. 

In this section, we will retrieve comments for each particular post from each submission. These comments will eventually be stored in the databases along with their essential information as JSON. 

In [15]:
def chunks(lst, n):
    """Lazily yield consecutive slices of ``lst`` holding ``n`` items each.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

def get_all_post_ids(filename):
    """Return the values of the ``id`` column of a CSV file.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        list | None: The ``id`` values in file order, or ``None`` when the
        file cannot be read or parsed, or has no ``id`` column (the reason
        is printed).
    """
    try:
        df = pd.read_csv(filename)
        return df["id"].tolist()
    # OSError: missing/unreadable file. ValueError: pandas parse and
    # empty-data errors. KeyError: no ``id`` column. Anything else
    # (KeyboardInterrupt included) is allowed to propagate.
    except (OSError, ValueError, KeyError) as err:
        print(f"An error occurred while reading {filename}: {err!r}")
        return None
        
def get_all_subreddits(filename):
    """Return the distinct values of the ``subreddit`` column of a CSV file.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        list | None: Unique ``subreddit`` values in order of first
        appearance, or ``None`` when the file cannot be read or parsed, or
        has no ``subreddit`` column (the reason is printed).
    """
    try:
        df = pd.read_csv(filename)
        return df["subreddit"].unique().tolist()
    # OSError: missing/unreadable file. ValueError: pandas parse and
    # empty-data errors. KeyError: no ``subreddit`` column. Anything else
    # (KeyboardInterrupt included) is allowed to propagate.
    except (OSError, ValueError, KeyError) as err:
        print(f"An error occurred while reading {filename}: {err!r}")
        return None
        
def get_comments_with_post_ids(post_ids):
    """Look up the comments belonging to the given posts.

    Works in two passes through the module-level ``api`` client, 100 ids at
    a time: first the comment ids of each batch of posts, then the comments
    for each batch of those ids.

    Args:
        post_ids: Sequence of post ids.

    Returns:
        list: One ``api.search_comments`` result per batch of comment ids.
    """
    CHUNK_SIZE = 100

    # 1.get comment ids based on post ids
    id_batches = []
    for ids_chunk in list(chunks(post_ids, CHUNK_SIZE)):
        id_batches.append(api.search_submission_comment_ids(ids=ids_chunk))
    print("Successful Attempt on generating all comment ids ... ")

    # Flatten the per-batch results into a single list of comment ids.
    comment_ids = []
    for batch in id_batches:
        comment_ids.extend(list(batch))
    print("Successful Attempt on deserialize all comment ids ... ")

    # 2.get comment text based on comment ids
    comment_body_list = []
    for body_chunk in list(chunks(comment_ids, CHUNK_SIZE)):
        comment_body_list.append(api.search_comments(ids=body_chunk))
    print("Successful Attempt on generating all comments ... ")

    return comment_body_list

def write_comment_column_names(fileName):
    """Append the comments header row to a CSV file.

    The header lists the eight values that ``comment_parsing`` writes per
    comment, in the same order, so header and data rows line up.

    The file is opened in append mode, so calling this on a file that
    already has content adds another header line after it.

    Args:
        fileName: Path of the CSV file to write to (created if missing).
    """
    # Create headers for the data you want to save, in this example, we only want save these columns in our dataset
    column_names = ['author', 'body', 'created_utc', 'id', 'parent_id',
                    'subreddit', 'subreddit_id', 'score']

    # The context manager closes the handle even if the write fails.
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csv.writer(csvFile).writerow(column_names)

def comment_parsing(comment_list, fileName):
    """Append one CSV row per comment.

    Each row holds ``author, body, created_utc, id, parent_id, subreddit,
    subreddit_id, score`` in that order.

    Args:
        comment_list: Iterable of dict-like comments exposing those keys.
        fileName: Path of the CSV file to append to (created if missing).

    Raises:
        KeyError: If a comment lacks one of the exported keys. Rows written
            before the failure stay in the file.
    """
    # ``author_fullname`` and ``retrieved_on`` are deliberately left out of
    # the export.
    exported_keys = ('author', 'body', 'created_utc', 'id', 'parent_id',
                     'subreddit', 'subreddit_id', 'score')

    # The context manager closes the handle even when a comment is missing
    # a key and the loop aborts with KeyError.
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        print("Appending to csv ... ")
        for comment in comment_list:
            csvWriter.writerow([comment[key] for key in exported_keys])
    
def write_comments(subreddit, limit=500):
    """Fetch comments for each given subreddit and append them to ``comments.csv``.

    Uses the module-level ``api`` client and ``comment_parsing``.

    Args:
        subreddit: A subreddit name, or an iterable of subreddit names.
        limit: Maximum number of comments requested per subreddit.
    """
    # Iterate over the argument, not a global list that happens to exist.
    # A lone name is wrapped so a string is not iterated letter by letter.
    targets = [subreddit] if isinstance(subreddit, str) else subreddit

    for usubreddit in targets:
        comments = api.search_comments(subreddit=usubreddit, limit=limit)
        comment_parsing(list(comments), 'comments.csv')
        # Pause between subreddits; presumably to go easy on the API — confirm.
        time.sleep(5)
In [16]:
from pmaw import PushshiftAPI
import time

# Wall-clock timestamp taken before the comment download starts.
start = time. time()

# Module-level client; the comment helper functions read `api` as a global.
api = PushshiftAPI()
# write_column_names('comments.csv')
# Reuse the subreddits already present in the submissions CSV.
subreddits = get_all_subreddits('submissions.csv')
write_comment_column_names('comments.csv')
write_comments(subreddits, 500)

# Elapsed seconds for the whole download.
stop = time. time()
print("The time of the run:", stop - start)

INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 83.33% - Requests: 6 - Batches: 1 - Items Remaining: 0
Appending to csv ... 
Parsing complete...
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 5 - Batches: 1 - Items Remaining: 0
Appending to csv ... 
Parsing complete...
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 83.33% - Requests: 6 - Batches: 1 - Items Remaining: 0
Appending to csv ... 
Parsing complete...
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 5 - Batches: 1 - Items Remaining: 0
Appending to csv ... 
Parsing complete...
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 71.43% - Requests: 7 - Batches: 1 - Items Remaining: 0
Appending to csv ... 
Parsing complete...
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 71.43% - Requests: 7 - Batches: 1 - Items Remaining: 0
Appending to csv ... 
Parsing complete...
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 83.33% - Requests: 6 - Batches: 1 - Items Remaining: 0
Appending to csv .

INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 71.43% - Requests: 7 - Batches: 1 - Items Remaining: 0
Appending to csv ... 
Parsing complete...
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 83.33% - Requests: 6 - Batches: 1 - Items Remaining: 0
Appending to csv ... 
Parsing complete...
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 7 - Batches: 1 - Items Remaining: 0
Appending to csv ... 
Parsing complete...
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 62.50% - Requests: 8 - Batches: 1 - Items Remaining: 0
Appending to csv ... 
Parsing complete...
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 5 - Batches: 1 - Items Remaining: 0
Appending to csv ... 
Parsing complete...
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 71.43% - Requests: 7 - Batches: 1 - Items Remaining: 0
Appending to csv ... 
Parsing complete...
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 85.71% - Requests: 7 - Batches: 1 - Items Remaining: 0
Appending to csv .

Appending to csv ... 
Parsing complete...
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 83.33% - Requests: 6 - Batches: 1 - Items Remaining: 0
Appending to csv ... 
Parsing complete...
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 85.71% - Requests: 7 - Batches: 1 - Items Remaining: 0
Appending to csv ... 
Parsing complete...
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 5 - Batches: 1 - Items Remaining: 0
Appending to csv ... 
Parsing complete...
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 62.50% - Requests: 8 - Batches: 1 - Items Remaining: 0
Appending to csv ... 
Parsing complete...
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 83.33% - Requests: 6 - Batches: 1 - Items Remaining: 0
Appending to csv ... 
Parsing complete...
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 5 - Batches: 1 - Items Remaining: 0
Appending to csv ... 
Parsing complete...
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 71.43% - Requests: 7 - Batches:

# Staging
Just a simple pipeline to store data.

In [None]:
# Load both CSV files produced by the collection steps into DataFrames.
submissions = pd.read_csv('submissions.csv')
comments = pd.read_csv('comments.csv')

### Data Cleaning

In [None]:
# Interpret `created_utc` as seconds since the epoch and store it as a datetime column.
submissions['created_datetime'] = pd.to_datetime(submissions['created_utc'],unit='s')

Mapping pandas datatype to SQL datatype.

In [None]:
def sqlcol(dfparam):
    """Map each DataFrame column to a SQLAlchemy type chosen from its dtype name.

    Matching is by substring of ``str(dtype)``:

    * ``int``      -> ``INT``
    * ``float``    -> ``Float(precision=3, asdecimal=True)``
    * ``datetime`` -> ``DateTime``
    * ``object``   -> ``NVARCHAR(length=255)``

    Columns whose dtype name contains none of these are left out.

    Args:
        dfparam: pandas DataFrame to inspect.

    Returns:
        dict: Column name -> SQLAlchemy type instance, usable as the
        ``dtype`` argument of ``DataFrame.to_sql``.
    """
    # The visible notebook imports only `create_engine` from sqlalchemy, so
    # the package name itself is bound here.
    import sqlalchemy

    dtypedict = {}
    for column, dtype in zip(dfparam.columns, dfparam.dtypes):
        dtype_name = str(dtype)
        # Tested in priority order: a name that contains several of the
        # substrings resolves to the first branch listed here.
        if "int" in dtype_name:
            dtypedict[column] = sqlalchemy.types.INT()
        elif "float" in dtype_name:
            dtypedict[column] = sqlalchemy.types.Float(precision=3, asdecimal=True)
        elif "datetime" in dtype_name:
            dtypedict[column] = sqlalchemy.types.DateTime()
        elif "object" in dtype_name:
            dtypedict[column] = sqlalchemy.types.NVARCHAR(length=255)
    return dtypedict

# NOTE(review): `df`, `column_errors` and `push_conn` are not defined anywhere
# in the visible notebook, so this cell looks like an illustration rather than
# runnable code — confirm which DataFrame and connection are intended.
outputdict = sqlcol(df)    
# Append the rows to the `load_errors` table using the explicit column types.
column_errors.to_sql('load_errors', 
                     push_conn, 
                     if_exists = 'append', 
                     index = False, 
                     dtype = outputdict)

### PostgreSQL
No SQL queries are written here, since I run them locally. The following is just an illustration.

In [None]:
%load_ext sql
from sqlalchemy import create_engine

<b>After passing the authentication phase, any SQL query or manipulation can be run by starting the cell with %%sql</b>

In [None]:
### Example
%%sql

-- One row per Reddit submission; `id` is a surrogate key and `post_id`
-- holds Reddit's own identifier.
CREATE TABLE submissions (
    id BIGSERIAL NOT NULL PRIMARY KEY,
    post_id VARCHAR(100) NOT NULL,
    kind VARCHAR(100) NOT NULL,
    category VARCHAR(100),
    -- PostgreSQL has no DATETIME type; TIMESTAMP is its date-and-time type.
    created_utc TIMESTAMP,
    author VARCHAR(100) NOT NULL,
    name VARCHAR(100) NOT NULL,
    subreddit_id VARCHAR(100) NOT NULL,
    subreddit_subscriber INT NOT NULL,
    subreddit VARCHAR(100) NOT NULL,
    title TEXT NOT NULL,
    selftext TEXT,
    upvote_ratio FLOAT,
    -- URLs routinely exceed 100 characters, so no length cap here.
    url TEXT,
    num_comments INT,
    ups INT,
    downs INT,
    score INT
);


-- NOTE(review): these columns mirror the submissions table, not the fields
-- that comment_parsing writes to comments.csv (author, body, created_utc,
-- id, parent_id, subreddit, subreddit_id, score) — confirm the intended schema.
CREATE TABLE comments (
    id BIGSERIAL NOT NULL PRIMARY KEY,
    post_id VARCHAR(100) NOT NULL,
    kind VARCHAR(100) NOT NULL,
    category VARCHAR(100),
    -- PostgreSQL has no DATETIME type; TIMESTAMP is its date-and-time type.
    created_utc TIMESTAMP,
    author VARCHAR(100) NOT NULL,
    name VARCHAR(100) NOT NULL,
    subreddit_id VARCHAR(100) NOT NULL,
    subreddit_subscriber INT NOT NULL,
    subreddit VARCHAR(100) NOT NULL,
    -- PostgreSQL has no NVARCHAR type; TEXT stores Unicode strings of any length.
    title TEXT NOT NULL,
    selftext TEXT,
    upvote_ratio FLOAT,
    url VARCHAR(100),
    num_comments INT,
    ups INT,
    downs INT,
    score INT
);