In [22]:
# CITATION: https://utkuufuk.com/2018/07/29/reddit-scraping/
# !pip3 install beautifulsoup4
# !pip3 install requests
# !pip3 install lxml
from bs4 import BeautifulSoup
import time as datetime
import argparse
import requests
import re
import json
from multiprocessing import Process, Manager
SITE_URL = 'https://old.reddit.com/'
REQUEST_AGENT = 'Mozilla/5.0 Chrome/47.0.2526.106 Safari/537.36'

In [23]:
def keyword_to_url(keyword="pit bull"):
  # SITE_URL = 'https://old.reddit.com/'
  search_url = SITE_URL + 'search?q="' + keyword + '"'
  return search_url

def createSoup(url):
    return BeautifulSoup(requests.get(url, headers={'User-Agent':REQUEST_AGENT}).text, 'lxml')

def getSearchResults(searchUrl):
    posts = []
    while True:
        resultPage = createSoup(searchUrl)
        posts += resultPage.findAll('div', {'class':'search-result-link'})
        footer = resultPage.findAll('a', {'rel':'nofollow next'})
        if footer:
            searchUrl = footer[-1]['href']
        else:
            return posts
            
def parsePost(post, results):
  try:
    time = post.find('time')['datetime']
    date = datetime.strptime(time[:19], '%Y-%m-%dT%H:%M:%S')
    title = post.find('a', {'class':'search-title'}).text
    # print(title)
    score = post.find('span', {'class':'search-score'}).text
    score = int(re.match(r'[+-]?\d+', score).group(0))
    author = post.find('a', {'class':'author'}).text
    subreddit = post.find('a', {'class':'search-subreddit-link'}).text
    commentsTag = post.find('a', {'class':'search-comments'})
    url = commentsTag['href']
    numComments = int(re.match(r'\d+', commentsTag.text).group(0))
    commentTree = {} if numComments == 0 else parseComments(url)
    # results.append({'title':title, 'url':url, 'date':str(date), 'score':score,
    #                 'author':author, 'subreddit':subreddit, 'comments':commentTree})
    results.append(title)
    [results.append(comment['text']) for comment in commentTree.values()]
  except:
    pass

def parseComments(commentsUrl):
    commentTree = {}
    commentsPage = createSoup(commentsUrl)
    commentsDiv = commentsPage.find('div', {'class':'sitetable nestedlisting'})
    comments = commentsDiv.findAll('div', {'data-type':'comment'})
    for comment in comments:
        numReplies = int(comment['data-replies'])
        tagline = comment.find('p', {'class':'tagline'})
        author = tagline.find('a', {'class':'author'})
        author = "[deleted]" if author == None else author.text
        date = tagline.find('time')['datetime']
        date = datetime.strptime(date[:19], '%Y-%m-%dT%H:%M:%S')
        commentId = comment.find('p', {'class':'parent'}).find('a')['name']
        content = comment.find('div', {'class':'md'}).text.replace('\n','')
        # print(content)
        score = comment.find('span', {'class':'score unvoted'})
        score = 0 if score == None else int(re.match(r'[+-]?\d+', score.text).group(0))
        parent = comment.find('a', {'data-event-action':'parent'})
        parentId = parent['href'][1:] if parent != None else ''
        parentId = '' if parentId == commentId else parentId
        commentTree[commentId] = {'author':author, 'reply-to':parentId, 'text':content,
                                  'score':score, 'num-replies':numReplies, 'date':str(date)}
    return commentTree

In [24]:
def create_product_dump_file(keyword = 'pit bull'):
  TARGET_URL = 'drive/MyDrive/DS4A Team 73/DS4A_projectdata/NLP Data/Reddit Posts/'
  try:
    product = json.load(open(TARGET_URL + keyword + '.json'))
  except FileNotFoundError:
    print('WARNING: Database file not found. Creating a new one...')
    product = {}
  search_url = keyword_to_url(keyword)
  posts = getSearchResults(search_url)
  print('Started scraping', len(posts), 'posts.')
  keyword_hyphenated = keyword.replace(' ', '-')
  results = Manager().list()
  jobs = []
  for post in posts:
        job = Process(target=parsePost, args=(post, results))
        jobs.append(job)
        job.start()
  for job in jobs:
    job.join()
  # product[keyword] = list(results)
  # product = parsePosts(posts, product, keyword)
  with open(TARGET_URL + keyword + '.json', 'w', encoding='utf-8') as f:
        json.dump(list(results), f, indent=4, ensure_ascii=False)

In [None]:
# create_product_dump_file('pit bull')

## Automating Dataset Generation

In [1]:
!ls 'drive/MyDrive/DS4A Team 73/DS4A_projectdata/NLP Data'

'Reddit Posts'	 SentimentAnalysis.ipynb   StandardDogBreeds.json


In [2]:
import json
dog_breeds_file = 'drive/MyDrive/DS4A Team 73/DS4A_projectdata/NLP Data/StandardDogBreeds.json'
try:
    dog_breeds_json = json.load(open(dog_breeds_file))
except FileNotFoundError:
    print('WARNING: Database file not found.')

list_of_subbreeds = [(subbreed + ' ' + breed) for breed, list_of_subbreeds in dog_breeds_json.items() for subbreed in list_of_subbreeds ]
list_of_breeds_with_no_subbreed = [breed for breed, list_of_subbreeds in dog_breeds_json.items()]
print(len(list_of_subbreeds), len(list_of_breeds_with_no_subbreed))
full_list_of_dogs = list_of_subbreeds + list_of_breeds_with_no_subbreed
print(len(full_list_of_dogs))

81 92
173


In [None]:
# Download all text for all subbreeds
[create_product_dump_file(subbreed) for subbreed in list_of_subbreeds]

Started scraping 178 posts.
Started scraping 85 posts.
Started scraping 71 posts.
Started scraping 244 posts.
Started scraping 242 posts.
Started scraping 41 posts.
Started scraping 72 posts.
Started scraping 0 posts.
Started scraping 239 posts.
Started scraping 0 posts.
Started scraping 0 posts.
Started scraping 101 posts.
Started scraping 240 posts.
Started scraping 7 posts.
Started scraping 251 posts.
Started scraping 242 posts.
Started scraping 215 posts.
Started scraping 245 posts.
Started scraping 240 posts.
Started scraping 25 posts.
Started scraping 137 posts.
Started scraping 231 posts.
Started scraping 110 posts.
Started scraping 239 posts.
Started scraping 241 posts.
Started scraping 234 posts.
Started scraping 245 posts.
Started scraping 243 posts.
Started scraping 135 posts.
Started scraping 249 posts.
Started scraping 196 posts.
Started scraping 0 posts.
Started scraping 234 posts.
Started scraping 240 posts.
Started scraping 247 posts.
Started scraping 27 posts.
Started 

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [None]:
list_of_breeds_with_no_subbreed = [breed for breed, list_of_subbreeds in dog_breeds_json.items()]
list_of_breeds_with_no_subbreed
# Download all text for all breeds that don't have subbreeds, need to do this because I only downloaded dogs with subbreeds earlier
[create_product_dump_file(breed) for breed in list_of_breeds_with_no_subbreed]

Started scraping 221 posts.
Started scraping 246 posts.
Started scraping 249 posts.
Started scraping 250 posts.
Started scraping 248 posts.
Started scraping 241 posts.
Started scraping 221 posts.
Started scraping 243 posts.
Started scraping 247 posts.
Started scraping 16 posts.
Started scraping 237 posts.
Started scraping 144 posts.
Started scraping 234 posts.
Started scraping 227 posts.
Started scraping 246 posts.
Started scraping 203 posts.
Started scraping 245 posts.
Started scraping 248 posts.
Started scraping 249 posts.
Started scraping 246 posts.
Started scraping 246 posts.
Started scraping 248 posts.
Started scraping 146 posts.
Started scraping 246 posts.
Started scraping 250 posts.
Started scraping 246 posts.
Started scraping 247 posts.
Started scraping 230 posts.
Started scraping 248 posts.
Started scraping 248 posts.
Started scraping 248 posts.
Started scraping 238 posts.
Started scraping 184 posts.
Started scraping 242 posts.


NameError: ignored