# Web scraping the redflagdeals forum

RedFlagDeals.com is a forum where users can post sales that they have come across. This first part of the project focuses on scraping relevant information from the "All Hot Deals" section, which includes all product categories. In the second and third parts I will clean and visualize the data to display interesting deals.

In [561]:
# Packages
import requests # Scraping
from bs4 import BeautifulSoup # HTML parsing
import pandas as pd
import numpy as np
import json
import datetime
import re

## I. Retrieving data from the "Hot Deals - All Categories" sub-forum

Page format: `url/page#/`

In [617]:
# Landing page of the "Hot Deals" sub-forum; numbered pages hang off this URL.
root_url = "https://forums.redflagdeals.com/hot-deals-f9/"

# Site root, prepended to the relative link of an individual post.
base_url = "https://forums.redflagdeals.com"

# Pagination state used to iterate over the page URLs.
current_page = ""  # page number used to format the root URL
total_pages = 1  # endpoint of the iteration; get_posts() overwrites it

# Dataframe that accumulates the scraped data.
# Functions that rebind it must declare it with the `global` keyword.
table = pd.DataFrame(
    columns=[
        'title', 'votes', 'source',
        'creation_date', 'last_reply', 'author',
        'replies', 'views',
        'price', 'saving', 'expiry',
        'url',
    ]
)

In [614]:
def get_posts(page: str):
    """Return the post elements found on a forum listing page.

    Downloads ``page`` and parses it. As a side effect, the global
    ``total_pages`` is updated from the pagination label, whose text has the
    form " {current page #} of {total page #} ". When the label is missing or
    does not have that form, ``total_pages`` is left unchanged.

    Args:
        page: URL of the listing page to download.

    Returns:
        The ``find_all`` result for ``<li class="row topic">`` elements, one
        entry per post on the page (empty when none are found).
    """
    global total_pages  # rebinding the module-level page counter

    # Download and parse the whole page
    response = requests.get(page)
    parser = BeautifulSoup(response.content, 'html.parser')

    # Extract the total page count from the pagination label, if present
    triggers = parser.select(".pagination_menu_trigger")
    if triggers:
        _, separator, total = triggers[0].text.strip().rpartition("of ")
        if separator and total.strip().isdigit():
            total_pages = int(total)

    # Every post on the listing page is an <li class="row topic">
    return parser.find_all("li", class_="row topic")

In [615]:
def additional_info(post: str) -> dict:
    """Extract the offer details shown on a single post page.

    Downloads ``post`` and reads its ``.post_offer_fields`` summary, whose text
    alternates label and value lines (e.g. "Price:\\n$200\\nSavings:\\n70%").

    Args:
        post: Full URL of the post to download.

    Returns:
        A dict with exactly the keys ``'Price:'``, ``'Savings:'``,
        ``'Expiry:'`` and ``'Link:'``. A value is ``np.nan`` when the
        corresponding field is not present on the page.
    """
    # Start from "nothing found" so every key is always present
    add = {'Price:': np.nan, 'Savings:': np.nan, 'Expiry:': np.nan, 'Link:': np.nan}

    # Label prefixes mapped to the key callers read. "Saving" also matches
    # "Savings", so either spelling ends up under 'Savings:'.
    key_for_prefix = (("Price", 'Price:'), ("Saving", 'Savings:'), ("Expiry", 'Expiry:'))

    # Download and parse the post
    response = requests.get(post)
    parser = BeautifulSoup(response.content, 'html.parser')

    # Offer-summary field: may contain deal link, price, saving, and retailer
    summary = parser.select(".post_offer_fields")
    if not summary:
        return add
    offer = summary[0]

    # Index 0 of the split text is an empty string; after that, labels sit at
    # odd positions and their values directly behind them.
    summary_list = offer.text.split("\n")
    for label, value in zip(summary_list[1::2], summary_list[2::2]):
        label = label.strip()
        for prefix, key in key_for_prefix:
            if label.startswith(prefix):
                add[key] = value
                break

    # Deal link: read the href attribute of the first tag that carries one.
    # Slicing it out of str(tag) would return the rendered HTML form, in which
    # characters such as "&" are entity-escaped.
    link_tag = offer if offer.has_attr("href") else offer.find(href=True)
    if link_tag is not None:
        add['Link:'] = link_tag["href"]

    return add

In [616]:
def _first_text(post, selector, line=0, default=np.nan):
    """Return line ``line`` of the text of the first ``selector`` match, else ``default``."""
    try:
        return post.select(selector)[0].text.split("\n")[line]
    except (IndexError, AttributeError):
        return default


def _count_text(post, selector):
    """Like ``_first_text`` but with thousands separators removed ("1,234" -> "1234")."""
    raw = _first_text(post, selector)
    return raw.replace(",", "") if isinstance(raw, str) else raw


def _to_int_column(values):
    """Convert a column to int; when entries are missing keep it numeric with NaN."""
    numeric = pd.to_numeric(values, errors="coerce")
    return numeric if numeric.isna().any() else numeric.astype(int)


def fill_table(posts: list) -> None:
    """Append one row per post element to the global ``table``.

    For every element in ``posts`` the listing fields (retailer, votes, title,
    dates, author, replies, views) are read from the element itself. If the
    element contains a link to its post, ``additional_info`` is called on that
    post to obtain price, saving, expiry date and deal URL.

    Fields that cannot be found become ``np.nan``, except ``votes`` which
    defaults to 0. ``votes``, ``replies`` and ``views`` are converted to
    integers; a column containing missing values stays numeric with NaN
    instead of raising.

    Args:
        posts: Post elements of one listing page, as returned by ``get_posts``.

    Returns:
        None. The global ``table`` is rebound to the extended DataFrame and
        its new length is printed.
    """
    global table  # the module-level result table is rebound below

    columns = ['title', 'votes', 'source', 'creation_date', 'last_reply',
               'author', 'replies', 'views', 'price', 'saving', 'expiry', 'url']
    rows = []

    for post in posts:
        # Title element: carries the title text and the sub-url of the post
        topic = post.select(".topic_title_link")
        try:
            title = topic[0].text.split('\n')[1]
        except (IndexError, AttributeError):
            title = np.nan

        row = {
            'title': title,
            'votes': _first_text(post, ".post_voting", line=1, default=0),
            'source': _first_text(post, ".topictitle_retailer"),
            'creation_date': _first_text(post, ".first-post-time"),
            'last_reply': _first_text(post, ".last-post-time"),
            'author': _first_text(post, ".thread_meta_author"),
            'replies': _count_text(post, ".posts"),
            'views': _count_text(post, ".views"),
            'price': np.nan,
            'saving': np.nan,
            'expiry': np.nan,
            'url': np.nan,
        }

        # Relative link to the post: the text between href=" and the next quote
        try:
            link = str(topic).split('href="')[1].split('"')[0]
        except IndexError:
            link = None

        # Only visit the post when a link was actually found
        if link is not None:
            add_info = additional_info(base_url + link)
            row['price'] = add_info['Price:']
            row['saving'] = add_info['Savings:']
            row['expiry'] = add_info['Expiry:']
            row['url'] = add_info['Link:']

        rows.append(row)

    # Temporary table for this page, in the column order of the global table
    tmp_table = pd.DataFrame(rows, columns=columns)
    for numeric_column in ('votes', 'replies', 'views'):
        tmp_table[numeric_column] = _to_int_column(tmp_table[numeric_column])

    # Append to the global table and report progress
    table = pd.concat([table, tmp_table])
    print("Current table length: ", table.shape[0])

In [620]:
# # First page information, and set total_pages through get_posts()
# if total_pages == 1:
#      # Generate list of posts on first page
#     posts = get_posts(url)
    
#     # Fill table from information on first page and corresponding posts
#     fill_table(posts)
# else:
#     #Loop through pages and fill table
#     for page in range(2, total_pages):
#         next_url = root_url + str(page) + "/" # URL of next page: base-url + number + "/"
#         print(next_url)
#         # Generate list of posts on current page
#         posts = get_posts(next_url)

#         # Fill table from information on current page and posts
#         fill_table(posts)

# table.head(10)

https://forums.redflagdeals.com/hot-deals-f9/2/
Current table dimensions:  60
https://forums.redflagdeals.com/hot-deals-f9/3/
Current table dimensions:  90
https://forums.redflagdeals.com/hot-deals-f9/4/
Current table dimensions:  120
https://forums.redflagdeals.com/hot-deals-f9/5/
Current table dimensions:  150
https://forums.redflagdeals.com/hot-deals-f9/6/
Current table dimensions:  179
https://forums.redflagdeals.com/hot-deals-f9/7/
Current table dimensions:  209
https://forums.redflagdeals.com/hot-deals-f9/8/
Current table dimensions:  239
https://forums.redflagdeals.com/hot-deals-f9/9/
Current table dimensions:  269
https://forums.redflagdeals.com/hot-deals-f9/10/
Current table dimensions:  299
https://forums.redflagdeals.com/hot-deals-f9/11/
Current table dimensions:  329
https://forums.redflagdeals.com/hot-deals-f9/12/
Current table dimensions:  359
https://forums.redflagdeals.com/hot-deals-f9/13/
Current table dimensions:  389
https://forums.redflagdeals.com/hot-deals-f9/14/
C

Unnamed: 0,title,votes,source,creation_date,last_reply,author,replies,views,price,saving,expiry,url
0,"$65/20GB, with free Samsung A71",4,Freedom Mobile,"Jul 11th, 2020 3:54 pm","Jul 12th, 2020 9:52 am",pjw918,18,3750,$0,$600,,https://www.freedommobile.ca/en-CA
1,[CC] i5-9400 $175 after coupon + FS,7,Canada Computers,"Jul 11th, 2020 12:32 pm","Jul 12th, 2020 9:52 am",Desperadude,12,2391,,,,https://www.canadacomputers.com/product_info.p...
2,Dudios True Wireless Earbuds - $19.99 with Prime,5,Amazon.ca,"Jul 12th, 2020 9:06 am","Jul 12th, 2020 9:52 am",Indubitably,3,657,19.99,$23 off,,http://www.amazon.ca/gp/redirect.html?ie=UTF8&...
3,Watchdogs 2 PC version free on July 12th,100,,"Jul 6th, 2020 12:49 pm","Jul 12th, 2020 9:51 am",Blackdove77,103,23586,,100%,"July 12, 2020",https://news.ubisoft.com/en-us/article/41nS5f7...
4,CIBC Earn $300 and 12 Month Fee Rebate with a ...,57,CIBC,"Feb 29th, 2020 8:24 am","Jul 12th, 2020 9:50 am",pigqq,434,134661,,,"September 30, 2020",https://www.cibc.com/en/special-offers/smart-f...
5,$25 GC + $300 Cash When Opening a Scotibank Ch...,-3,Scotiabank,"Jun 29th, 2020 5:49 pm","Jul 12th, 2020 9:50 am",MikeTO2,17,5413,,,"July 31, 2020",https://www.ratesupermarket.ca/deals
6,Get a $25 Costco.ca voucher when signing up to...,-1,Costco,"Jul 10th, 2020 7:57 am","Jul 12th, 2020 9:50 am",danzerino,20,4212,,,,
7,Honeywell Air Purifier HPA300 $187.49 ON only?,3,Best Buy,"Jul 11th, 2020 10:37 pm","Jul 12th, 2020 9:49 am",manho,18,2552,$187.49,,,https://bestbuyca.o93x.net/c/341376/644465/102...
8,Corsair Semi-Modular ATX CX650M Power Supply C...,-2,Amazon.ca,"Jul 12th, 2020 9:45 am","Jul 12th, 2020 9:48 am",2Riskit,1,93,$ 79.99,,,http://www.amazon.ca/gp/redirect.html?ie=UTF8&...
9,[Koodo] Free TCL 4K Smart TV when you get the ...,5,,"Jul 11th, 2020 8:41 pm","Jul 12th, 2020 9:47 am",thispig,13,4579,,,,https://www.koodomobile.com/phones/tcl-10-pro


In [623]:
# Persist the scraped table as a CSV file (the row index is written as well)
table.to_csv(path_or_buf='C:/Users/User/Documents/GitHub/Data-Science/rfd_scrape.csv')

In [625]:
# Reload the saved scrape and summarise its columns, dtypes and non-null counts
df = pd.read_csv(filepath_or_buffer='rfd_scrape.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1409 entries, 0 to 1408
Data columns (total 13 columns):
Unnamed: 0       1409 non-null int64
title            1409 non-null object
votes            1409 non-null int64
source           1061 non-null object
creation_date    1409 non-null object
last_reply       1409 non-null object
author           1409 non-null object
replies          1409 non-null int64
views            1409 non-null int64
price            962 non-null object
saving           564 non-null object
expiry           421 non-null object
url              1099 non-null object
dtypes: int64(4), object(9)
memory usage: 143.2+ KB


## II. Data wrangling