# Steam Game Review Scraper

Scrape game review data from Steam, including the user, profile link, and the content of the review itself

In [None]:

from selenium.webdriver import ChromeOptions, Chrome
from selenium.webdriver.common.keys import Keys
import re
from time import sleep
from datetime import datetime
from openpyxl import Workbook
import csv

## Requirements
You'll need to install the following libraries before beginning this project:
- [Selenium](https://www.selenium.dev/downloads/) : for automating the web browser. The setup can be involved, so see my [short YouTube video](https://youtu.be/9XAH_TvxwLg) for a walkthrough.
- [OpenPyXL](https://openpyxl.readthedocs.io/en/stable/) : for saving the data to an Excel spreadsheet (optional)

## Example
If you want to see an example of the output, you can look at the results of me running the scraper for about 5 minutes on a particular game  
[Click to view Excel file](https://drive.google.com/file/d/1Ld04lwFY7OjIMU2wJxRcgdPvJ0o43BRo/view?usp=sharing)

## Getting started

Look up the game id by searching on Steam, navigating to the game's homepage, and taking the number embedded in the URL just before the game title.

In [14]:
# Download the complete Steam application catalogue (JSON) from
# https://api.steampowered.com/ISteamApps/GetAppList/v2/
# and reduce it to the list of app ids this notebook will scrape.
import json
import urllib.request

listOfAppIds = []
data = None
with urllib.request.urlopen("https://api.steampowered.com/ISteamApps/GetAppList/v2/") as response:
    data = json.load(response)

# An app is kept only when its name does not contain "dlc" (case-insensitive),
# its name is not empty, and its id is a multiple of 10.
# Collecting into a set removes duplicate ids along the way.
unique_app_ids = {
    entry['appid']
    for entry in data['applist']['apps']
    if "dlc" not in entry['name'].lower()
    and entry['name'] != ""
    and entry['appid'] % 10 == 0
}

listOfAppIds = list(unique_app_ids)
print(listOfAppIds)
print(len(listOfAppIds))


[1835010, 524290, 1048580, 2359300, 262150, 1572870, 2097160, 10, 1310730, 1835020, 524300, 2359310, 1572880, 786450, 2097170, 20, 1310740, 1835030, 524310, 2359320, 1572890, 786460, 2097180, 30, 1310750, 1835040, 524320, 1048610, 2359330, 786470, 2097190, 40, 1835050, 524330, 2359340, 262190, 1572910, 2097200, 50, 1310770, 1835060, 1048630, 1572920, 786490, 60, 1310780, 1835070, 524350, 1048640, 262210, 1572930, 70, 1310790, 1835080, 524360, 1572940, 786510, 2097230, 80, 1310800, 1835090, 1048660, 262230, 1572950, 786520, 2097240, 90, 1310810, 1835100, 524380, 1048670, 262240, 1572960, 786530, 100, 1310820, 524390, 1572970, 786540, 2097260, 1835120, 1048690, 262260, 786550, 1310840, 1835130, 524410, 2359420, 1572990, 786560, 2097280, 130, 1310850, 524420, 1048710, 2359430, 262280, 1573000, 786570, 2097290, 1310860, 524430, 1573010, 786580, 2097300, 150, 524440, 262300, 1573020, 786590, 2097310, 1310880, 1835170, 524450, 1048740, 2359460, 1573030, 786600, 2097320, 2359470, 1573040, 209

The url template below can be altered to filter by sentiment, language, and recency.  

Check the [website](https://steamcommunity.com/app/387990/positivereviews/?browsefilter=mostrecent) to see what options are available. For this project, I'm going to focus on **Positive** reviews only and sort by **Most Recent**.

In [None]:
# template = 'https://steamcommunity.com/app/{}/positivereviews/?browsefilter=mostrecent'
# template_with_language = 'https://steamcommunity.com/app/{}/positivereviews/?browsefilter=mostrecent&filterLanguage=english'

# url = template_with_language.format(game_id)

In [None]:
# Set up the Selenium driver: build a default ChromeOptions object and start
# a Chrome session configured with it. `driver` is the browser handle that
# the scraping cell below navigates, scrolls and finally shuts down.
options = ChromeOptions()
# options.use_chromium = True  (left disabled by the author)
driver = Chrome(options=options)

Maximize the window and get the starting url

In [None]:
# game_id = 1222730


## Scrape the data

The page loads reviews through continuous (infinite) scrolling, so you'll need to grab the cards, scroll down to the bottom, and repeat until finished. For this project, we are going to collect the following information:
- Steam ID
- Profile URL
- Review Text
- Review Recommendation
- Review Length (chars)
- Play Hours
- Date Posted

In [None]:

import os

from selenium.webdriver.common.by import By

# Give up on a game after this many consecutive scrolls that do not move the page.
MAX_SCROLL_ATTEMPTS = 3
# After a scroll only the tail end of the card list is new, so only that part is parsed.
CARDS_PER_PASS = 20
CSV_HEADER = ["GameId", 'SteamId', 'ProfileURL', 'ReviewText', 'Review', 'ReviewLength(Chars)', 'PlayHours', 'DatePosted']


def _parse_card(card, game_id, seen_ids):
    """Extract one review row from a review card element.

    Args:
        card: Selenium element for a single ``apphub_Card``.
        game_id: App id of the game being scraped; stored as the first column.
        seen_ids: Set of steam ids already collected for this game. The id of
            this card is added to it before the rest of the card is parsed.

    Returns:
        A tuple ``(game_id, steam_id, profile_url, review_content, thumb_text,
        review_length, play_hours, date_posted)``, or ``None`` when the
        reviewer's id is already in ``seen_ids``.

    Raises:
        Whatever Selenium raises when an expected element is missing; the
        caller decides how to handle a card that cannot be parsed.
    """
    # gamer profile url
    profile_url = card.find_element(By.XPATH, './/div[@class="apphub_friend_block"]/div/a[2]').get_attribute('href')

    # steam id: last path segment of the profile url, with or without a trailing slash
    steam_id = profile_url.rstrip('/').split('/')[-1]

    # skip reviewers that were already collected for this game
    if steam_id in seen_ids:
        return None
    seen_ids.add(steam_id)

    # The posting date is the first child <div> of the text block; strip it
    # from the block's text so only the review itself remains.
    date_posted = card.find_element(By.XPATH, './/div[@class="apphub_CardTextContent"]/div').text
    review_content = card.find_element(By.XPATH, './/div[@class="apphub_CardTextContent"]').text.replace(date_posted, '').strip()

    # review length, not counting spaces
    review_length = len(review_content.replace(' ', ''))

    # recommendation
    thumb_text = card.find_element(By.XPATH, './/div[@class="reviewInfo"]/div[2]').text

    # amount of play hours
    play_hours = card.find_element(By.XPATH, './/div[@class="reviewInfo"]/div[3]').text

    return (game_id, steam_id, profile_url, review_content, thumb_text, review_length, play_hours, date_posted)


def _scroll_for_more(driver, last_position):
    """Scroll to the bottom of the page until the scroll offset changes.

    Tries up to ``MAX_SCROLL_ATTEMPTS`` times, pausing between attempts.

    Returns:
        The new ``window.pageYOffset`` once the page has moved, or ``None``
        when the offset stayed equal to ``last_position`` on every attempt.
    """
    for _ in range(MAX_SCROLL_ATTEMPTS):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(0.5)
        curr_position = driver.execute_script("return window.pageYOffset;")
        if curr_position != last_position:
            return curr_position
        sleep(0.5)
    return None


def _append_reviews(rows):
    """Append review rows to today's CSV file, writing the header only once."""
    today = datetime.today().strftime('%Y%m%d')
    path = f'Steam_Reviews__{today}.csv'
    # The file is shared by every game of the run, so the header is written
    # only when the file does not exist yet or is still empty.
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(CSV_HEADER)
        writer.writerows(rows)


# Scrape the most recent English reviews of every game in listOfAppIds and
# append them, one game at a time, to today's CSV file.
driver.maximize_window()

try:
    for gameId in listOfAppIds:

        template_with_language = 'https://steamcommunity.com/app/{}/reviews/?browsefilter=mostrecent&filterLanguage=english'
        url = template_with_language.format(gameId)
        reviews_prefix = "https://steamcommunity.com/app/" + str(gameId) + "/reviews"

        try:
            driver.get(url)
            # landing on the store front page means this app has no review page to scrape
            if driver.current_url == "https://store.steampowered.com/":
                continue
            # get current position of y scrollbar
            last_position = driver.execute_script("return window.pageYOffset;")
        except Exception as e:
            print(e)
            continue

        reviews = []
        review_ids = set()

        while True:
            try:
                # stop as soon as the browser is no longer on this game's review page
                current_url = driver.current_url
                on_reviews_page = current_url.startswith(reviews_prefix)
                print(current_url, on_reviews_page)
                if not on_reviews_page:
                    break

                # get cards on the page, stop if there are none
                cards = driver.find_elements(By.CLASS_NAME, 'apphub_Card')
                if not cards:
                    break

                for card in cards[-CARDS_PER_PASS:]:
                    try:
                        review = _parse_card(card, gameId, review_ids)
                    except Exception as e:
                        # one unparsable card must not stall the whole game
                        print(e)
                        continue
                    if review is not None:
                        reviews.append(review)

                last_position = _scroll_for_more(driver, last_position)
            except Exception as e:
                # a page-level failure ends this game instead of retrying forever
                print(e)
                break

            if last_position is None:
                break  # the page stopped growing: every review has been loaded

        # write once per game, after scraping, so rows are not duplicated on each pass
        if reviews:
            _append_reviews(reviews)
finally:
    # shutdown the web driver (ends the session, not just the current window)
    driver.quit()

## Save the results

You can push the data wherever you want. However, for this project, I'm going to save the data to an Excel spreadsheet using the [OpenPyXL](https://openpyxl.readthedocs.io/en/stable/) library

In [None]:
# # save the file to Excel Worksheet
# wb = Workbook()
# ws = wb.worksheets[0]
# ws.append(["GameId",'SteamId', 'ProfileURL', 'ReviewText', 'Review', 'ReviewLength(Chars)', 'PlayHours', 'DatePosted'])
# for row in reviews:
#     ws.append(row)
    
# today = datetime.today().strftime('%Y%m%d')    
# wb.save(f'Steam_Reviews_{game_id}_{today}.xlsx')    
# wb.close()

In [None]:
# save the file to a CSV file
# today = datetime.today().strftime('%Y%m%d')   
# with open(f'Steam_Reviews_{game_id}_{today}.csv', 'w', newline='', encoding='utf-8') as f:
#     writer = csv.writer(f)
#     writer.writerow(["GameId",'SteamId', 'ProfileURL', 'ReviewText', 'Review', 'ReviewLength(Chars)', 'PlayHours', 'DatePosted'])
#     writer.writerows(reviews)