# Steam Game Web Scraping

A web scraper for Steam review data.
For each review, it collects the following fields:
* User Steam ID
* Profile URL
* Review Content
* Review Length (Approximately)
* Recommend or Not Recommend
* Play Hours
* Date Posted
* Number of Awards the Review Received
* Number of Helpful and Funny Votes the Review Received

To change which data is collected, modify the code inside the `while` loop.

In [None]:
# import selenium
from msedge.selenium_tools import Edge, EdgeOptions
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from time import sleep
from datetime import datetime
import csv

# Before Web Scraping

You will need to install the Selenium package and corresponding Web Driver for your browser.

To get the matching driver, check the version of the browser you plan to use, go to the [Selenium](https://www.selenium.dev/downloads/) downloads page, find your browser under **Browsers**, open its documentation, and download the driver version that corresponds to your browser version.

After downloading, simply move the WebDriver executable into your Jupyter notebook directory. No installation is required!

# Game ID

Each game on Steam has a unique id, and that id is what we use to locate the game's review page.

To build the review URL, open a game's review page, apply the language filter you want, and then replace the id in the resulting URL with the id of the game you want to scrape. For example:

https://steamcommunity.com/app/289070/reviews/?browsefilter=toprated&snr=1_5_100010_

In [None]:
# Steam app id of the game to scrape; it is inserted into the review URL template.
game_id = 289070

In [None]:
# Review-page URL patterns; "{}" is the placeholder for the Steam app id.

# default review URL (no language filter)
template = (
    'https://steamcommunity.com/app/{}/reviews/'
    '?browsefilter=toprated&snr=1_5_100010_'
)

# review URL restricted to English-language reviews
template_with_language = (
    'https://steamcommunity.com/app/{}/reviews/'
    '?browsefilter=toprated&snr=1_5_100010&filterLanguage=english'
)

# fill the app id into the language-filtered pattern
url = template_with_language.format(game_id)

In [None]:
# Configure a Chromium-based Edge session and launch the browser.
options = EdgeOptions()
options.use_chromium = True

# Experimental browser options, applied in the order listed.
# NOTE(review): presumably the first two hide the "controlled by automated
# software" hints and "detach" keeps the window open after the driver
# process ends -- confirm against the Chromium driver documentation.
experimental_options = {
    'useAutomationExtension': False,
    'excludeSwitches': ['enable-automation'],
    'detach': True,
}
for option_name, option_value in experimental_options.items():
    options.add_experimental_option(option_name, option_value)

driver = Edge(options=options)

In [None]:
# Maximize the browser window, then navigate to the review page at `url`.
driver.maximize_window()
driver.get(url)

In [None]:
# Scrape review cards from the scrolling review page.
#
# Each pass of the outer loop reads the cards currently rendered, stores the
# ones whose author has not been seen before, then scrolls to the bottom of
# the page so that more cards can load. Scraping ends once the scroll
# position has failed to change MAX_SCROLL_ATTEMPTS times in a row.
#
# Result: `reviews` holds one tuple per review, in the column order used when
# the data is written to CSV.
MAX_SCROLL_ATTEMPTS = 3

last_position = driver.execute_script("return window.pageYOffset;")

reviews = []
review_ids = set()
running = True

while running:
    # get data on the page
    cards = driver.find_elements_by_class_name('apphub_Card')

    for card in cards[-20:]:  # only the tail end are new data

        # profile url
        profile_url = card.find_element_by_xpath('.//div[@class="apphub_friend_block"]/div/a[2]').get_attribute('href')

        # steam user id: second-to-last '/'-separated piece of the profile URL
        steam_id = profile_url.split('/')[-2]

        # skip cards whose author was already collected
        if steam_id in review_ids:
            continue
        review_ids.add(steam_id)

        # date posted and review text; the card text contains the date line,
        # so it is removed to leave only the review body
        date_posted = card.find_element_by_xpath('.//div[@class="apphub_CardTextContent"]/div').text
        review_content = card.find_element_by_xpath('.//div[@class="apphub_CardTextContent"]').text.replace(date_posted, '').strip()

        # helpful & funny
        helpful = card.find_element_by_xpath('.//div[@class="apphub_UserReviewCardContent"]/div').text

        # award
        award = card.find_element_by_xpath('.//div[@class="found_helpful"]/div').text

        # review length (approximate: characters excluding spaces)
        review_length = len(review_content.replace(' ', ''))

        # recommendation
        thumb_text = card.find_element_by_xpath('.//div[@class="reviewInfo"]/div[2]').text

        # amount of play hours
        play_hours = card.find_element_by_xpath('.//div[@class="reviewInfo"]/div[3]').text

        # save review
        review = (steam_id, profile_url, review_content, thumb_text, review_length, play_hours, date_posted, award, helpful)
        reviews.append(review)

    # attempt to scroll down; stop the whole scrape after
    # MAX_SCROLL_ATTEMPTS consecutive attempts that do not move the page
    scroll_attempt = 0
    while True:

        # click the "get more content" link when the page has one;
        # its absence is expected and deliberately ignored
        try:
            driver.find_element_by_xpath('.//div[@class="apphub_GetMoreContent"]/div/a[1]').click()
        except NoSuchElementException:
            pass

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(1)
        curr_position = driver.execute_script("return window.pageYOffset;")

        if curr_position == last_position:
            scroll_attempt += 1

            # compare the attempt counter (not the pixel offset) with the limit
            if scroll_attempt >= MAX_SCROLL_ATTEMPTS:
                running = False
                break

            # give the page a little longer before retrying
            sleep(1)
        else:
            last_position = curr_position
            break  # continue scraping the results

# shutdown the web driver (close the browser)
driver.close()

In [None]:
# Write the collected reviews to a CSV file named after the game id and
# today's date (YYYYMMDD); the first row is the column header.
today = datetime.today().strftime('%Y%m%d')
output_name = f'Steam_Reviews_{game_id}_{today}.csv'
header = [
    'SteamId',
    'ProfileURL',
    'ReviewText',
    'Review',
    'ReviewLength(Chars)',
    'PlayHours',
    'DatePosted',
    'Award',
    'HelpFunny',
]

# newline='' lets the csv module control line endings itself
with open(output_name, 'w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file).writerows([header, *reviews])