# Steam Game Review Scraper

Scrape game review data from Steam, including the user, profile link, and the content of the review itself

In [37]:

from selenium.webdriver import ChromeOptions, Chrome
from selenium.webdriver.common.keys import Keys
import re
from time import sleep
from datetime import datetime
from openpyxl import Workbook
import csv

## Requirements
You'll need to install the following libraries before beginning this project:
- [Selenium](https://www.selenium.dev/downloads/) : for automating the web browser; this can be involved... so check my [short YouTube video](https://youtu.be/9XAH_TvxwLg) for a walkthrough.
- [OpenPyXL](https://openpyxl.readthedocs.io/en/stable/) : for saving the data to an Excel spreadsheet (optional)

## Example
If you want to see an example of the output... you can see the results of me running the scaper for about 5 minutes on a particular game  
[Click to view Excel file](https://drive.google.com/file/d/1Ld04lwFY7OjIMU2wJxRcgdPvJ0o43BRo/view?usp=sharing)

## Getting started

Lookup the game id by doing a search on steam, navigate to the game homepage, and then get the number embedded in the URL before the game title.

In [38]:
#get the list of all steam game ids
# https://api.steampowered.com/ISteamApps/GetAppList/v2/
# get the json from this link
listOfAppIds = []
data = None
import urllib.request, json 
with urllib.request.urlopen("https://api.steampowered.com/ISteamApps/GetAppList/v2/") as url:
    data = json.load(url)

for app in data['applist']['apps']:
    listOfAppIds.append(app['appid'])
    
print(listOfAppIds)


[1383152, 1897482, 2112761, 1829051, 1983382, 216938, 660010, 660130, 1118314, 1275822, 1343832, 1828741, 662172, 1360782, 1820332, 1927051, 1496152, 1808781, 1977312, 1700632, 1567401, 2016512, 2092072, 2119422, 596501, 2156011, 2177061, 1825161, 2170321, 1941401, 1496243, 2121741, 1375021, 2292161, 1312081, 1401340, 1401360, 1401370, 1401390, 1401400, 1401461, 1401462, 1401470, 1401480, 1401490, 1401500, 1401520, 1401530, 1401540, 1401550, 1401560, 1401570, 1401600, 1401610, 1401620, 1401630, 1401640, 1401660, 1401670, 1401680, 1401750, 1400970, 1400980, 1401020, 1401070, 1401110, 1401130, 1401140, 1401150, 1401170, 1401180, 1401200, 1401220, 1401260, 1401310, 1401320, 1400470, 1400480, 1400490, 1400500, 1400520, 1400550, 1400551, 1400560, 1400570, 1400590, 1400600, 1400610, 1400620, 1400630, 1400650, 1400680, 1400700, 1400710, 1400720, 1400740, 1400760, 1400770, 1400780, 1400790, 1400800, 1400810, 1400830, 1400850, 1400860, 1400870, 1400910, 1400920, 1400930, 1400940, 1400950, 14000

The url template below can be altered to filter by sentiment, language, and recency.  

Check the [website](https://steamcommunity.com/app/387990/positivereviews/?browsefilter=mostrecent) to see what options are available. For this project, I'm going to focus on **Positive** reviews only and sort by **Most Recent**.

In [39]:
# template = 'https://steamcommunity.com/app/{}/positivereviews/?browsefilter=mostrecent'
# template_with_language = 'https://steamcommunity.com/app/{}/positivereviews/?browsefilter=mostrecent&filterLanguage=english'

# url = template_with_language.format(game_id)

In [40]:
# setup driver
options = ChromeOptions()
# options.use_chromium = True
driver = Chrome(options=options)

Maximize the window and get the starting url

In [41]:
# game_id = 1222730


## Scrape the data

The page is continously scrolling, so you'll need to grab the cards, then scroll down to the bottom and repeat until finished. For this project, we are going to collect the following information:
- Steam ID
- Profile URL
- Review Text
- Review Recommendation
- Review Length (chars)
- Play Hours
- Date Posted

In [42]:

for gameId in listOfAppIds:
    print(gameId)
    template_with_language = 'https://steamcommunity.com/app/{}/reviews/?browsefilter=mostrecent&filterLanguage=english'

    url = template_with_language.format(gameId)

    # driver.maximize_window()
    try:
        driver.get(url)
        if driver.current_url == "https://store.steampowered.com/":
            reviews = []
            review_ids = set()
            continue
    except Exception as e:
        reviews = []
        review_ids = set()
        continue

    # get current position of y scrollbar
    last_position = driver.execute_script("return window.pageYOffset;")
    from selenium.webdriver.common.by import By

    reviews = []
    review_ids = set()
    running = True

    while running:
        # get cards on the page, skip if no cards
        try:
            cards = driver.find_elements(By.CLASS_NAME,'apphub_Card')
            # if driver.current_url == "https://store.steampowered.com/app/" + str(gameId):
            # if url does not begin with https://store.steampowered.com/app/str(gameId)/reviews
            print(driver.current_url,driver.current_url.startswith("https://steamcommunity.com/app/" + str(gameId) + "/reviews"))
            if driver.current_url.startswith("https://steamcommunity.com/app/" + str(gameId) + "/reviews") == False:
                break
                continue
            if len(cards) == 0:
                break
                continue
    
            for card in cards[-20:]:  # only the tail end are new cards

                # gamer profile url
                profile_url = card.find_element(By.XPATH,'.//div[@class="apphub_friend_block"]/div/a[2]').get_attribute('href')

                # steam id
                steam_id = profile_url.split('/')[-2]
                
                # check to see if I've already collected this review
                if steam_id in review_ids:
                    continue
                else:
                    review_ids.add(steam_id)

                # username
                user_name = card.find_element(By.XPATH,'.//div[@class="apphub_friend_block"]/div/a[2]').text

                # language of the review
                date_posted = card.find_element(By.XPATH,'.//div[@class="apphub_CardTextContent"]/div').text
                review_content = card.find_element(By.XPATH,'.//div[@class="apphub_CardTextContent"]').text.replace(date_posted,'').strip()    

                # review length
                review_length = len(review_content.replace(' ', ''))    

                # recommendation
                thumb_text = card.find_element(By.XPATH,'.//div[@class="reviewInfo"]/div[2]').text
                thumb_text    

                # amount of play hours
                play_hours = card.find_element(By.XPATH,'.//div[@class="reviewInfo"]/div[3]').text
                play_hours    

                # save review
                review = (gameId ,steam_id, profile_url, review_content, thumb_text, review_length, play_hours, date_posted)
                print(review)
                reviews.append(review)    
                
            # attempt to scroll down thrice.. then break
            scroll_attempt = 0
            while True:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")    
                sleep(0.5)
                curr_position = driver.execute_script("return window.pageYOffset;")
                
                if curr_position == last_position:
                    scroll_attempt += 1
                    sleep(0.5)
                    
                    if scroll_attempt >= 3:
                        running = False
                        break
                else:
                    last_position = curr_position
                    break  # continue scraping the results
        except Exception as e:
            print(e)
            continue
        
        today = datetime.today().strftime('%Y%m%d')   
        with open(f'Steam_Reviews__{today}.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            if f.tell() == 0:
                writer.writerow(["GameId",'SteamId', 'ProfileURL', 'ReviewText', 'Review', 'ReviewLength(Chars)', 'PlayHours', 'DatePosted'])
            writer.writerows(reviews)


# shutdown the web driver
driver.close()

1383152
1897482
2112761
1829051
1983382
216938
660010
660130
1118314
1275822
1343832
1828741
662172
1360782
1820332
1927051
1496152
1808781
1977312
1700632
1567401
2016512
2092072
2119422
596501
2156011
2177061
1825161
2170321
1941401
1496243
2121741
1375021
2292161
1312081
1401340
https://steamcommunity.com/app/1401340/reviews/?browsefilter=mostrecent&filterLanguage=english True
(1401340, '76561199515776818', 'https://steamcommunity.com/profiles/76561199515776818/', 'Tea Garden Simulator has been an unexpectedly delightful experience for me. Despite the mixed reviews, I decided to take the plunge based on my personal interest in tea culture and gardening, and boy, am I glad I did. This game has taught me a valuable lesson about not letting negative reviews dictate my choices, and I\'ve been rewarded with a unique and enjoyable virtual tea garden journey.\n\nSure, the game may have its flaws and imperfections, as some reviews pointed out and there are a few bugs here and there. But her

## Save the results

You can push the data wherever you want. However, for this project, I'm going to save the data to an Excel spreadsheet using the [OpenPyXL](https://openpyxl.readthedocs.io/en/stable/) library

In [None]:
# # save the file to Excel Worksheet
# wb = Workbook()
# ws = wb.worksheets[0]
# ws.append(["GameId",'SteamId', 'ProfileURL', 'ReviewText', 'Review', 'ReviewLength(Chars)', 'PlayHours', 'DatePosted'])
# for row in reviews:
#     ws.append(row)
    
# today = datetime.today().strftime('%Y%m%d')    
# wb.save(f'Steam_Reviews_{game_id}_{today}.xlsx')    
# wb.close()

In [None]:
# save the file to a CSV file
# today = datetime.today().strftime('%Y%m%d')   
# with open(f'Steam_Reviews_{game_id}_{today}.csv', 'w', newline='', encoding='utf-8') as f:
#     writer = csv.writer(f)
#     writer.writerow(["GameId",'SteamId', 'ProfileURL', 'ReviewText', 'Review', 'ReviewLength(Chars)', 'PlayHours', 'DatePosted'])
#     writer.writerows(reviews)