# Data Acquisition from Reddit

Go to <a href=#bookmark>bookmark</a>

### 2019-06-08 - Goal - Develop End-to-End Data Flow, at least at small scale.
# OR BUST

![](https://images.unsplash.com/photo-1515255384510-23e8b6a6ca3c?ixlib=rb-1.2.1&auto=format&fit=crop&w=1489&q=80)

---

## Libraries

In [1]:
# Install libs on this computer:
# !pip install praw
# !pip install pymongo
# !pip install psycopg2

In [2]:
# Standard library
import json           # digest json
import os             # file system stuff
from time import sleep # To pause between retries
from time import time # To time stuff
from urllib.parse import quote # Percent-encode DB credentials

# Third-party
import numpy as np    # math and arrays
import pandas as pd   # Dataframes
import praw           # reddit API
import pymongo        # MongoDB

#DATA STORAGE
import psycopg2 as psql #PostgreSQL DBs
from sqlalchemy import create_engine # SQL helper

In [3]:
import helper     # Custom helper functions

---

## 1A Load Reddit keys

Step 3: Create your first Authorized Reddit Instance

In [4]:
# Path to the Reddit API credentials file under the current user's home directory.
# os.path.expanduser('~') is used rather than os.environ['HOME'] so the lookup
# does not raise KeyError on systems where HOME is not set (e.g. Windows).
secret_path = os.path.join(os.path.expanduser('~'), '.secret', 'reddit.json')
# Alternate location previously used:
#secret_path = os.path.join(os.path.expanduser('~'), 'mia/.secret', 'reddit_api.json')

# Echo the resolved path so the reader can confirm which file will be read.
secret_path

'/Users/werlindo/.secret/reddit.json'

#### Locate the AWS PostgreSQL credentials file

In [5]:
# Path to the AWS PostgreSQL credentials file under the current user's home directory.
# os.path.expanduser('~') is used rather than os.environ['HOME'] so the lookup
# does not raise KeyError on systems where HOME is not set (e.g. Windows).
secret_path_aws = os.path.join(
    os.path.expanduser('~'), '.secret', 'aws_ps_flatiron.json'
)

# Echo the resolved path so the reader can confirm which file will be read.
secret_path_aws

'/Users/werlindo/.secret/aws_ps_flatiron.json'

## 1B Load AWS-PostgreSQL DB keys

#### Load keys

In [6]:
# Load the AWS credentials mapping via the project helper, then bind the four
# connection fields to short names used by the cells that follow.
aws_keys = helper.get_keys(secret_path_aws)
user, ps, host, db = (
    aws_keys[field] for field in ('user', 'password', 'host', 'db_name')
)

In [7]:
# Connection URL for the PostgreSQL database (despite the name, this is the URL
# string; the engine object itself is created from it in a later cell).
# The user name and password are percent-encoded so that characters such as
# '@', ':' or '/' inside a credential cannot be mistaken for URL delimiters.
# Values without reserved characters pass through unchanged.
aws_ps_engine = (
    'postgresql://'
    + quote(user, safe='') + ':' + quote(ps, safe='')
    + '@' + host + '/' + db
)

### Use SQLAlchemy to create PSQL engine

In [8]:
# SQLAlchemy URL format: dialect+driver://username:password@host:port/database
# Build the engine from the connection URL string held in aws_ps_engine; this
# engine is what the to_sql() calls further down pass as `con`.
sql_alc_engine = create_engine(aws_ps_engine)

## 2 Load keys, Create Reddit Instance

In [9]:
# Load the Reddit credentials from the file at secret_path via the project helper.
# The result is indexed below by 'client_id', 'api_key', 'username' and 'password'.
keys = helper.get_keys(secret_path)

In [10]:
# Authenticated Reddit client built from the loaded credentials.
# Note that the secrets file stores the client secret under the 'api_key' key.
reddit = praw.Reddit(
    client_id=keys['client_id'],
    client_secret=keys['api_key'],
    username=keys['username'],
    password=keys['password'],
    user_agent='reddit_research accessAPI:v0.0.1 (by /u/FlatDubs)',
)

## 3 Obtain a Subreddit Instance(s) from your Reddit Instance

In [11]:
# Subreddit handle for r/politics; the submission search below runs against it.
politics = reddit.subreddit('politics')

#### Instantiate subreddit

In [12]:
#got = reddit.subreddit('gameofthrones') #Let's start with got for now. If can dev flow for one, can just dupe for other

## 4 Get subreddit submissions and comments; save to dataframe

#### Initialize parameters for this submissions pull

https://ballotpedia.org/Presidential_candidates,_2020

Democrats

Michael Bennet (D)
Joe Biden (D)
Bill de Blasio (D)
Cory Booker (D)
Steve Bullock (D)
Pete Buttigieg (D)
Julián Castro (D)
John Delaney (D)
Tulsi Gabbard (D)
Kirsten Gillibrand (D)
Mike Gravel (D)
Kamala Harris (D)
John Hickenlooper (D)
Jay Inslee (D)
Amy Klobuchar (D)
Wayne Messam (D)
Seth Moulton (D)
Beto O'Rourke (D)
Tim Ryan (D)
Bernie Sanders (I)[1]
Eric Swalwell (D)
Elizabeth Warren (D)
Marianne Williamson (D)
Andrew Yang (D)
Republicans

Donald Trump (R)
Bill Weld (R)

In [67]:
##### The 'OR' is CASE-SENSITIVE - ALWAYS CAPS!!!

# Search batches. Exactly one `persons` assignment must be active, otherwise the
# search cell below fails with NameError on a fresh kernel. Earlier batches are
# kept commented out for provenance; swap which one is active to re-pull a batch.

# ROUND 1
# persons = """"
# doran" OR "davos"
# """

# persons = """
#             "bran" OR "brandon stark" OR "jon snow" OR "jon" 
#                         OR "khaleesi" OR "dany" OR "daenerys" OR "danyris"
#          """

# persons = """
#             "kamala" OR "senator harris" OR "K. Harris" OR "biden" OR 
#             "mayor pete" OR "buttigidg" OR "buttigieg" OR "bootijedge"
#         """

# persons = """
#             "gillibrand" OR "hickenlooper" OR "klobuchar" OR "warren" OR
#             "booker" OR "inslee" OR "castro" OR "gabbard" OR "sanders" 
#         """

# persons = """
#             "de blasio" OR "bullock" OR "gravel" OR "messam"  
#         """

# persons = """
#             "o'rourke"  
#         """

# Active batch: the one whose totals match the most recent recorded run output
# in this notebook (97 submissions / 6,907 comments).
persons = """
            "bennet" OR "delaney" OR "moulton" OR "swalwell" OR "williamson"
            OR "yang"
        """

# Maximum number of submissions requested from the search.
# NOTE(review): the recorded runs never returned more than 250 submissions, so
# the API presumably caps results well below this value - confirm.
results_lim = 10000

#### Execute Search

In [68]:
# Wall-clock start marker; the timing cell at the end of the pull subtracts
# this from its own time() reading to report the elapsed minutes.
start_time = time()

In [69]:
# Search r/politics for the candidate terms: most-commented submissions from
# the past month, capped at results_lim.
dems_search = politics.search(
    persons,
    sort='comments',
    time_filter='month',
    limit=results_lim,
)

# Submission attributes to capture, in output-column order. The submission's
# own id is stored under the 'id' column.
# TODO: also capture the submission body once it is clear where it lives.
submission_fields = ('title', 'num_comments', 'upvote_ratio', 'id')
submission_columns = {field: [] for field in submission_fields}

for done, submission in enumerate(dems_search, start=1):
    for field in submission_fields:
        submission_columns[field].append(getattr(submission, field))
    # Progress heartbeat every 100 submissions
    if done % 100 == 0:
        print(f'{done} submissions completed')

# One row per submission returned by the search
subs_df = pd.DataFrame(submission_columns)

#df_got

#### Now loop through each sub and grab its comments

In [71]:
# Collects one comments DataFrame per submission; concatenated in the next cell.
comm_dfs = []

# Upper bound on replace_more() attempts per submission, so that a persistent
# failure surfaces as an error instead of looping forever.
MAX_REPLACE_MORE_ATTEMPTS = 5

for submission_id in subs_df['id']:
    submission = reddit.submission(id=submission_id)

    # Expand the "load more comments" placeholders in the comment tree.
    # The call is retried after a short pause on failure. The original code
    # caught the placeholder name `PossibleExceptions` and called an unimported
    # `sleep`, so any failure here became a NameError.
    # NOTE(review): Exception is caught broadly because the concrete error types
    # are not importable from what this notebook already uses - narrow this to
    # the API/network exceptions actually observed.
    for attempt in range(1, MAX_REPLACE_MORE_ATTEMPTS + 1):
        try:
            submission.comments.replace_more()
            break
        except Exception:
            if attempt == MAX_REPLACE_MORE_ATTEMPTS:
                raise  # give up: let the real error reach the reader
            print('Handling replace_more exception')
            sleep(1)

    # Flatten the comment tree and build this submission's comments frame.
    # Each row carries the parent submission id so comments can be joined back.
    comments = submission.comments.list()
    this_df = pd.DataFrame({
        'comment': [comment.body for comment in comments],
        'comment_id': [comment.id for comment in comments],
        'sub_id': [submission_id] * len(comments),
    })

    # Add this sub's comments df to list of dfs
    comm_dfs.append(this_df)


#### Put all the comments into common df

In [72]:
# Stack the per-submission comment frames into one frame with a fresh 0..n-1 index.
# pd.concat raises ValueError on an empty list, which happens when the search
# returned no submissions; in that case fall back to an empty frame carrying the
# same three columns the per-submission frames use.
if comm_dfs:
    all_comm_df = pd.concat(comm_dfs, axis=0).reset_index(drop=True)
else:
    all_comm_df = pd.DataFrame(columns=['comment', 'comment_id', 'sub_id'])

## 5 Save dataframes' contents to PS DB

#### Use `pandas.to_sql` to write the dataframe to the PostgreSQL database, using the SQLAlchemy engine.
    

In [73]:
# df_got.to_sql('got_subs', con=sql_alc_engine, if_exists='append')

# df_got_comm.to_sql('got_comms', con=sql_alc_engine, if_exists='append')

In [74]:
# Write the submissions frame to the `dems_subs` table through the SQLAlchemy engine.
# NOTE(review): if_exists='append' adds rows to an existing table, so re-running
# this cell for the same batch presumably stores the same submissions again - confirm.
subs_df.to_sql('dems_subs', con=sql_alc_engine, if_exists='append')

In [75]:
# Write the combined comments frame to the `dems_comms` table through the SQLAlchemy engine.
# NOTE(review): if_exists='append' adds rows to an existing table, so re-running
# this cell for the same batch presumably stores the same comments again - confirm.
all_comm_df.to_sql('dems_comms', con=sql_alc_engine, if_exists='append')

In [76]:
# Report how long the pull took and how many rows each frame holds
end_time = time()
elapsed_seconds = end_time - start_time
mins_to_complete = elapsed_seconds / 60

n_submissions = subs_df.shape[0]
n_comments = all_comm_df.shape[0]

print("It took {:.2f} minutes to complete.".format(mins_to_complete))
print("There were {} submissions added.".format(n_submissions))
print("There were {:,} comments added.".format(n_comments))


It took 18.63 minutes to complete.
There were 97 submissions added.
There were 6,907 comments added.


### Results

persons = """
            "kamala" OR "senator harris" OR "K. Harris" OR "biden" OR 
            "mayor pete" OR "buttigidg" OR "buttigieg" OR "bootijedge"
        """
        
It took 80.46 minutes to complete.  
There were 249 submissions added.  
There were 61,298 comments added.

persons = """
            "gillibrand" OR "hickenlooper" OR "klobuchar" OR "warren" OR
            "booker" OR "inslee" OR "castro" OR "gabbard" OR "sanders" 
        """  

It took 122.33 minutes to complete.  
There were 250 submissions added.  
There were 95,034 comments added.  

persons = """
            "de blasio" OR "bullock" OR "gravel" OR "messam"  
        """  
        
It took 7.84 minutes to complete.  
There were 91 submissions added.  
There were 2,486 comments added.

persons = """
            "o'rourke"  
        """
        
It took 17.92 minutes to complete.  
There were 87 submissions added.  
There were 7,150 comments added.

persons = """
            "bennet" OR "delaney" OR "moulton" OR "swalwell" OR "williamson"
            OR "yang"
        """  
        
It took 18.63 minutes to complete.  
There were 97 submissions added.  
There were 6,907 comments added.

### Left off here <a name='bookmark' />

![](https://images.unsplash.com/photo-1534224563519-fea04849cadf?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format&fit=crop&w=1350&q=80)

![](https://images.unsplash.com/photo-1553058296-61093581de13?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format&fit=crop&w=1351&q=80)

### f. Check that the table was created, or can be appended.

In [35]:
# Open a direct psycopg2 connection to the same database, reusing the
# credential fields loaded earlier in the notebook.
conn_params = {
    'database': db,
    'user': user,
    'password': ps,
    'host': host,
    'port': '5432',
}
conn = psql.connect(**conn_params)

In [36]:
# QUERY TO GET LIST OF TABLES
# query = """
#     SELECT * FROM pg_catalog.pg_tables
#     WHERE schemaname = 'public';
# """

In [37]:
# Instantiate a cursor on the open connection; the query cells below execute
# through it and read their results and column metadata from it.
cur = conn.cursor()

In [38]:
# Set up query: preview the first 10 rows of dems_comms.
# NOTE: the next cell reassigns `query` before anything is executed, so on a
# top-to-bottom run this preview query is never sent to the database.
query = """
    SELECT * FROM dems_comms limit 10;
"""

In [39]:
# Set up query: count all rows in dems_comms, with the result column aliased `ct`.
query = """
    SELECT count(*) ct FROM dems_comms;
"""

In [40]:
# Execute whatever statement `query` currently holds on the open cursor
cur.execute(query)

In [41]:
# conn.rollback()

In [42]:
# Check results: load the fetched rows into a DataFrame, naming the columns
# from the cursor's result description.
# The names are passed to the constructor rather than assigned afterwards:
# with zero rows the frame would have no columns, and assigning a non-empty
# name list would raise a length-mismatch ValueError.
df_clone = pd.DataFrame(
    cur.fetchall(),
    columns=[col.name for col in cur.description],
)

In [43]:
# Commit the current transaction on the connection.
# NOTE(review): the visible cells only issue a SELECT, so this presumably just
# ends the open transaction rather than persisting changes - confirm.
conn.commit()

In [44]:
# Display the query result frame
df_clone

Unnamed: 0,ct
0,156332


In [45]:
# Close the psycopg2 connection now that the check is done
conn.close()