# Web Scraping to Collect Data for Subreddit Categorization

To gather data for testing how well natural language processing can predict a post's subreddit from the text of the post, I pull posts from Reddit's API - an interface that lets me access this data programmatically and analyze it.


### Scraping Subreddit Thread Info from Reddit.com
#### subreddits: /r/climbing, /r/yoga

In [4]:
import pandas as pd
import numpy as np
import requests
import json
import time

### Create a global reddit dataframe

In [5]:
# Empty frame that accumulates the posts from every pull in this session;
# the cells below append each page of results to it.
reddit_df = pd.DataFrame()

### Pull /r/climbing json (first pull of session)

In [6]:
# JSON listing for /r/climbing, starting after the post whose ID is given in `after`.
# NOTE(review): the ID is hard-coded, presumably saved from an earlier session — confirm.
url = "http://www.reddit.com/r/climbing.json?after=t3_9byemn"

In [7]:
# Send a custom User-Agent instead of the requests default. The timeout makes a
# stalled connection raise an exception instead of hanging the notebook forever.
res = requests.get(url, headers={"User-Agent": "Roman Browser"}, timeout=30)

In [8]:
# Show the HTTP status code, then parse the JSON response body into a dict.
print(res.status_code)
data = res.json()
# Bare expression: the notebook echoes the parsed payload as the cell output.
data

200


{'kind': 'Listing',
 'data': {'modhash': '',
  'dist': 25,
  'children': [{'kind': 't3',
    'data': {'approved_at_utc': None,
     'subreddit': 'climbing',
     'selftext': "Am super impressed with my improvement in technique over the last few months, with this climb being my high water mark so far.\n\nI did have to rest on a cam after pumping out, so it wasn't clean. Gotta keep working on my endurance for that. In terms of moves though, I made some really good ones to get up some committing sections.\n\nAnyone else made some progress lately?\n\nedit: why the downvotes people? Is discussing improvements frowned upon? I don't get it...",
     'user_reports': [],
     'saved': False,
     'mod_reason_title': None,
     'gilded': 0,
     'clicked': False,
     'title': 'Progress Report: Lead my first 5.9 trad',
     'link_flair_richtext': [],
     'subreddit_name_prefixed': 'r/climbing',
     'hidden': False,
     'pwls': 6,
     'link_flair_css_class': None,
     'downs': 0,
     'thumb

In [7]:
# Show the HTTP status code, then parse the JSON response body into a dict.
print(res.status_code)
data = res.json()

# Convert to lists: pull the three fields of interest out of every post
posts = [child['data'] for child in data['data']['children']]
title_list = [post['title'] for post in posts]
selftext_list = [post['selftext'] for post in posts]
subreddit_list = [post['subreddit'] for post in posts]

# Convert to dataframe (one row per post)
climb_df = pd.DataFrame({
    'title': title_list,
    'selftext': selftext_list,
    'subreddit': subreddit_list,
})

# Append to the global dataframe. pd.concat replaces DataFrame.append, which
# was deprecated in pandas 1.4 and removed in pandas 2.0.
reddit_df = pd.concat([reddit_df, climb_df], ignore_index=True)

200


In [50]:
#Save this for additional pulls
next_pg_c = data['data']['after']
next_pg_c

't3_9798yb'

#### I use `res.json()` to parse the response body into a Python dictionary and assign it to a variable.


In [117]:
data = res.json()

#### Subsequent Pulls from /r/climbing

Because each request returns only 25 posts, I use a `for` loop to pull each subsequent set of 25 posts. Each request passes the `data['data']['after']` token, which identifies the end of the previous pull, as the `after` query parameter.

In [3]:
#Save this for additional pulls (for when I leave the notebook and come back to it)
next_pg_c = data['data']['after']
next_pg_c

In [22]:
# Pull up to five more pages of /r/climbing, each one continuing from the
# `after` token of the previously parsed page, and append them to reddit_df.
for _ in range(5):
    next_pg_c = data['data']['after']
    if next_pg_c is None:
        # Without a token there is nothing to continue from; str(None) would
        # just put the literal text "None" into the URL.
        print("No further page token; stopping early")
        break

    ##Pull from API
    url2 = "http://www.reddit.com/r/climbing.json" + "?after=" + str(next_pg_c)
    res = requests.get(url2, headers={"User-Agent": "Roman Browser"}, timeout=30)
    print(res.status_code)
    # Raise on an error status *before* overwriting `data`, so the last good
    # token is still available when the cell is re-run.
    res.raise_for_status()
    data = res.json()

    #Convert to lists: pull the three fields of interest out of every post
    posts = [child['data'] for child in data['data']['children']]
    title_list = [post['title'] for post in posts]
    selftext_list = [post['selftext'] for post in posts]
    subreddit_list = [post['subreddit'] for post in posts]

    #Convert to dataframe (one row per post)
    climb_df = pd.DataFrame({
        'title': title_list,
        'selftext': selftext_list,
        'subreddit': subreddit_list,
    })

    #Append to the global dataframe. pd.concat replaces DataFrame.append,
    #which was deprecated in pandas 1.4 and removed in pandas 2.0.
    reddit_df = pd.concat([reddit_df, climb_df], ignore_index=True)

    # Pause between consecutive requests
    time.sleep(10)

200
200
200
200
200


In [23]:
reddit_df.shape

(400, 3)

In [24]:
# Remove rows that are identical across all columns (for example, a page pulled twice)
reddit_df = reddit_df.drop_duplicates()

In [25]:
reddit_df.shape

(400, 3)

In [26]:
reddit_df['subreddit'].value_counts()

climbing    400
Name: subreddit, dtype: int64

In [27]:
552-227

325

### Pull /r/yoga json

#### Initial pull 

In [28]:
# JSON listing for /r/yoga, starting after the post whose ID is given in `after`.
# NOTE(review): the ID is hard-coded, presumably saved from an earlier session — confirm.
url = "http://www.reddit.com/r/yoga.json?after=t3_94ks21"

In [29]:
# Send a custom User-Agent instead of the requests default. The timeout makes a
# stalled connection raise an exception instead of hanging the notebook forever.
res = requests.get(url, headers={"User-Agent": "Roman Browser"}, timeout=30)

In [30]:
# Show the HTTP status code, then parse the JSON response body into a dict.
print(res.status_code)
data_y = res.json()

#convert to lists: pull the three fields of interest out of every post
posts = [child['data'] for child in data_y['data']['children']]
title_list = [post['title'] for post in posts]
selftext_list = [post['selftext'] for post in posts]
subreddit_list = [post['subreddit'] for post in posts]

#Convert to DataFrame (one row per post)
yoga_df = pd.DataFrame(
    {
        'title': title_list,
        'selftext': selftext_list,
        'subreddit': subreddit_list,
    },
    columns=['title', 'selftext', 'subreddit'],
)

#Append to global reddit df. pd.concat replaces DataFrame.append, which was
#deprecated in pandas 1.4 and removed in pandas 2.0.
reddit_df = pd.concat([reddit_df, yoga_df], ignore_index=True)


200


#### Subsequent pulls from /r/yoga

In [33]:
len(data_y['data']['children'])

25

In [51]:
# Token marking the end of the last /r/yoga pull; later cells pass it as `?after=` to request the next page
next_pg = data_y['data']['after']
# Bare expression: the notebook echoes the token as the cell output.
next_pg

't3_8t8t0s'

In [43]:
# Pull up to five more pages of /r/yoga, each one continuing from the
# `after` token of the previously parsed page, and append them to reddit_df.
for _ in range(5):
    next_pg = data_y['data']['after']
    if next_pg is None:
        # Without a token there is nothing to continue from; str(None) would
        # just put the literal text "None" into the URL.
        print("No further page token; stopping early")
        break

    #Pull new page
    url2 = "http://www.reddit.com/r/yoga.json" + "?after=" + str(next_pg)
    res = requests.get(url2, headers={"User-Agent": "Roman Browser"}, timeout=30)
    print(res.status_code)
    # Raise on an error status *before* overwriting `data_y`, so the last good
    # token is still available when the cell is re-run.
    res.raise_for_status()
    data_y = res.json()

    #convert to lists: pull the three fields of interest out of every post
    posts = [child['data'] for child in data_y['data']['children']]
    title_list = [post['title'] for post in posts]
    selftext_list = [post['selftext'] for post in posts]
    subreddit_list = [post['subreddit'] for post in posts]

    #Convert to DataFrame (one row per post)
    yoga_df = pd.DataFrame(
        {
            'title': title_list,
            'selftext': selftext_list,
            'subreddit': subreddit_list,
        },
        columns=['title', 'selftext', 'subreddit'],
    )

    #Append to global reddit df. pd.concat replaces DataFrame.append, which
    #was deprecated in pandas 1.4 and removed in pandas 2.0.
    reddit_df = pd.concat([reddit_df, yoga_df], ignore_index=True)

    # Pause between consecutive requests
    time.sleep(10)

200
200
200
200
200


In [44]:
reddit_df.shape

(800, 3)

In [45]:
# Remove rows that are identical across all columns (for example, a page pulled twice)
reddit_df = reddit_df.drop_duplicates()

In [46]:
reddit_df.shape

(800, 3)

#### Write reddit dataframe to a csv

In [47]:
#Write to csv
# With to_csv's default settings the DataFrame index is written as the first, unnamed column.
# NOTE(review): the ./reddit_data/ directory presumably has to exist already — confirm.
reddit_df.to_csv('./reddit_data/reddit_climb_yoga_5.csv')
# Other file names, presumably used for earlier pulls — NOTE(review): confirm
                #('./reddit_data/reddit_climb_yoga_4.csv')
                #('./reddit_data/reddit_climb_yoga_3.csv')
                #('./reddit_data/reddit_climb_yoga_2.csv')
                #('./reddit_data/reddit_climb_yoga.csv')