# Web scraping the redflagdeals forum

Project summary:

In [561]:
# Packages
import requests # Scraping
from bs4 import BeautifulSoup # HTML parsing
import pandas as pd
import numpy as np
import json
import datetime
import re

## Retrieving data from the "Hot Deals - All Categories" sub-forum

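Page URL format: `<sub-forum url>/<page number>/`, e.g. `https://forums.redflagdeals.com/hot-deals-f9/2/` for the second page.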

In [428]:
# Root of the forum; the sub-URL of an individual post is appended to it.
base_url = "https://forums.redflagdeals.com"

# Listing URL of the "Hot Deals" sub-forum; page URLs are derived from it.
page_url = "https://forums.redflagdeals.com/hot-deals-f9/"

# Pagination state used when iterating over the listing pages.
current_page = ""  # page number inserted into the listing URL
total_pages = 0  # end point of the page iteration

# Empty DataFrame that collects the scraped data.
table = pd.DataFrame(
    columns=[
        'title',
        'source',
        'url',
        'votes',
        'replies',
        'views',
        'creation_date',
        'last_reply',
        'author',
    ]
)

In [435]:
def get_posts(page: str) -> list:
    """Download a forum listing page and return its topic elements.

    As a side effect the module-level ``total_pages`` is updated with the
    page count shown in the pagination control, whose text has the form
    " {current page #} of {total page #} ". If that control is missing or
    its text cannot be parsed, ``total_pages`` is left unchanged.

    Args:
        page: URL of the listing page to download.

    Returns:
        The ``<li class="row topic">`` elements found on the page (empty if
        there are none).

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # Without this declaration the assignment below would only create a
    # local variable and the module-level counter would never change.
    global total_pages

    # Get entire page content; the timeout keeps a stalled connection from
    # blocking forever.
    response = requests.get(page, timeout=30)
    response.raise_for_status()

    # HTML parser
    parser = BeautifulSoup(response.content, 'html.parser')

    # Find total number of pages: take the number following "of" and drop
    # any thousands separators before converting it.
    pagination = parser.select(".pagination_menu_trigger")
    if pagination:
        match = re.search(r"of\s+([\d,]+)", pagination[0].text)
        if match:
            total_pages = int(match.group(1).replace(",", ""))

    # Find and return list of topics
    return parser.find_all("li", class_="row topic")

In [562]:
def additional_info(post: str) -> dict:
    """Extract the offer summary of a single post.

    Downloads the post page and reads its ``.post_offer_fields`` element,
    whose text is a sequence of label lines each followed by a value line
    (format example: "Price:\\n$200\\nSaving:\\n70%").

    Args:
        post: URL of the post page.

    Returns:
        A dictionary with exactly the keys 'Price:', 'Savings:', 'Expiry:'
        and 'Link:'. Any entry that is not present on the page is ``np.nan``.
    """
    # Start from "nothing found" so every key is always present.
    add = {'Price:': np.nan, 'Savings:': np.nan, 'Expiry:': np.nan, 'Link:': np.nan}

    # Get content of post; the timeout keeps a stalled connection from
    # blocking forever.
    response = requests.get(post, timeout=30)
    parser = BeautifulSoup(response.content, 'html.parser')

    # Offer-summary field: may contain deal link, price, saving, and expiry
    summary = parser.select(".post_offer_fields")
    if not summary:
        return add
    offer = summary[0]

    # Labels are recognised by prefix and stored under one canonical key, so
    # both "Saving:" and "Savings:" end up in the 'Savings:' entry.
    label_keys = (("Price", 'Price:'), ("Saving", 'Savings:'), ("Expiry", 'Expiry:'))

    # Ignore blank lines so stray line breaks cannot shift labels and values
    # out of step.
    lines = [line.strip() for line in offer.text.split("\n")]
    remaining = iter(line for line in lines if line)
    for label in remaining:
        for prefix, key in label_keys:
            if label.startswith(prefix):
                # The line after a label holds its value; consuming it here
                # keeps a value from being mistaken for a label.
                add[key] = next(remaining, np.nan)
                break

    # URL of the deal: first element inside the summary carrying an href.
    # Reading the attribute returns the URL as written in the page rather
    # than its HTML-escaped serialisation.
    link = offer.find(href=True)
    if link is not None:
        add['Link:'] = link['href']

    return add

In [571]:
def _field_text(post, selector: str, line: int = 0, default=np.nan):
    """Return line ``line`` of the text of the first ``selector`` match in ``post``, or ``default`` if there is none."""
    try:
        return post.select(selector)[0].text.split("\n")[line]
    except (IndexError, AttributeError):
        return default


def _count_text(post, selector: str):
    """Return the text of a counter element with thousands separators removed (NaN if the element is missing)."""
    raw = _field_text(post, selector)
    return raw.replace(",", "") if isinstance(raw, str) else raw


def _post_href(post):
    """Return the href of the post's title link, or ``None`` if the post has none."""
    for node in post.select(".topic_title_link"):
        href = node.get("href")
        if href:
            return href
        inner = node.find(href=True)
        if inner is not None:
            return inner["href"]
    return None


def _as_int_column(values, fill=None):
    """Convert scraped counter values to a nullable integer column; unparsable entries become ``fill`` or <NA>."""
    column = pd.to_numeric(pd.Series(values, dtype=object), errors="coerce")
    if fill is not None:
        column = column.fillna(fill)
    return column.astype("Int64")


def fill_table(posts: list) -> None:
    """Build a table from the listing elements in ``posts`` and print its head.

    For every post the retailer, votes, title, creation date, last-reply
    date, author, reply count and view count are read from the listing
    element. If the post has a title link, ``additional_info`` is called on
    the post page to add price, saving, expiry date and deal URL; otherwise
    those four fields are NaN.

    The result is only printed (first five rows); the module-level ``table``
    is not modified.

    Args:
        posts: post elements as returned by ``get_posts``.
    """
    columns = ['title', 'votes', 'source', 'creation_date', 'last_reply',
               'author', 'replies', 'views', 'price', 'saving', 'expiry', 'url']

    # One dict per post; the DataFrame is built once at the end instead of
    # growing a Series per column on every iteration.
    rows = []
    for post in posts:
        row = {
            'title': _field_text(post, ".topic_title_link", line=1),
            'votes': _field_text(post, ".post_voting", line=1, default=0),  # no voting element counts as 0 votes
            'source': _field_text(post, ".topictitle_retailer"),
            'creation_date': _field_text(post, ".first-post-time"),
            'last_reply': _field_text(post, ".last-post-time"),
            'author': _field_text(post, ".thread_meta_author"),
            'replies': _count_text(post, ".posts"),
            'views': _count_text(post, ".views"),
        }

        # Additional information is only fetched when the post really has a
        # link; without one there is no page to request.
        href = _post_href(post)
        if href is not None:
            post_url = base_url + href  # merge base- and sub-url into the complete post link
            add_info = additional_info(post_url)
        else:
            add_info = {}

        row['price'] = add_info.get('Price:', np.nan)
        row['saving'] = add_info.get('Savings:', np.nan)
        row['expiry'] = add_info.get('Expiry:', np.nan)
        row['url'] = add_info.get('Link:', np.nan)
        rows.append(row)

    tmp_table = pd.DataFrame(rows, columns=columns)

    # Counters arrive as text. A nullable integer dtype keeps missing
    # replies/views as <NA> instead of failing the conversion.
    tmp_table['votes'] = _as_int_column(tmp_table['votes'], fill=0)
    tmp_table['replies'] = _as_int_column(tmp_table['replies'])
    tmp_table['views'] = _as_int_column(tmp_table['views'])

    # Print result
    print(tmp_table.head())

In [572]:
# List of posts on the first "Hot Deals" listing page (`page_url` is the
# listing URL defined with the other URL settings).
posts = get_posts(page_url)

# Test: build the table for these posts and print its first rows
fill_table(posts)

# Test: extract the offer summary of a single post
# tmp = "https://forums.redflagdeals.com/bed-bath-and-beyond-zojirushi-ns-tsc10-5-5-cup-micom-rice-cooker-145-59-20-off-coupon-ymmv-no-coupon-181-99-2388699/"
# additional_info(tmp)

                                               title  votes  \
0  $499.99 Acer Aspire 3 Ryzen 3 3200U / 8GB RAM ...     20   
1                Ryobi Gasoline Backpack Blower YMMV     -2   
2                 Quickjack 7000slx $200 off - $1300     25   
3  Amex Personal Platinum offer - $250 credits fo...    144   
4  Lenovo Q27h QHD 27” monitor IPS/USB-C/VESA/Spe...     53   

               source           creation_date               last_reply  \
0  Shoppers Drug Mart  Jul 10th, 2020 4:40 pm  Jul 11th, 2020 11:17 pm   
1          Home Depot  Jul 11th, 2020 3:15 pm  Jul 11th, 2020 11:14 pm   
2              Costco   Jul 6th, 2020 9:56 am  Jul 11th, 2020 11:14 pm   
3    American Express   Jun 4th, 2020 3:44 am  Jul 11th, 2020 11:14 pm   
4       Lenovo Canada  Jun 13th, 2020 9:50 pm  Jul 11th, 2020 11:12 pm   

       author  replies  views       price     saving           expiry  \
0    mangoman      157  17998  $499 + 20x        NaN    July 12, 2020   
1   HarelD475       15   2415 