# Steam game review analysis

## Imports

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
import json
from tqdm import tqdm
import csv

# When True, the scraping code below handles a single results page and a single game, printing its raw markup
debug = False
# Search page URL passed to scrape_num_games to read the total number of results
url = 'https://store.steampowered.com/search/?category1=998&supportedlang=english'

# URL contains # which the scraping loop will replace with the start value in each iteration
url_raw = 'https://store.steampowered.com/search/results/?query&start=#&count=50&dynamic_data=&sort_by=_ASC&category1=998&supportedlang=english&snr=1_7_7_230_7&infinite=1'
# Alternative query using the top-sellers filter (disabled):
# url_raw = 'https://store.steampowered.com/search/results/?query&start=#&count=50&dynamic_data=&force_infinite=1&category1=998&supportedlang=english&filter=topsellers&ndl=1&snr=1_7_7_7000_7&infinite=1'

## Get the total number of games matching the search

In [2]:
def scrape_num_games(url, timeout=30):
    """Return the number of games a Steam search page reports as shown.

    The page's ``search_results_filtered_warning`` element holds a sentence
    with two numbers: how many results matched the query and how many titles
    were excluded by the account's search preferences. The value returned is
    the first minus the second.

    Args:
        url: Address of the Steam search page to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        int: Matched games minus excluded games.

    Raises:
        requests.RequestException: The request failed, timed out, or
            returned an HTTP error status.
        ValueError: The expected element or sentence is missing from the page.
    """
    page = requests.get(url, timeout=timeout)  # Get game homepage
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")
    div = soup.find("div", {"id": "search_results_filtered_warning"})
    if div is None:
        raise ValueError("'search_results_filtered_warning' element not found on the search page")

    # First child div contains game count
    result = next((child for child in map(str, div.children) if '<div>' in child), None)
    if result is None:
        raise ValueError("no child <div> with the result count found on the search page")

    # Get number of games returned by query
    game_count = result[result.index(">")+1:result.index(" ")].replace(",", "")
    # Get number of games not shown due to search preferences
    excluded_game_count = result[result.index(". ")+2:result.index(" titles")].replace(",", "")

    # Calculate total number of games returned and shown
    return int(game_count) - int(excluded_game_count)

# Total number of games reported by the search page; the scraping loop derives its page count from this
game_count = scrape_num_games(url)
print(f'[INFO] {game_count} games found') 

[INFO] 79162 games found


## Scrape game information

In [None]:
# Patterns are matched against the raw (still JSON-escaped) response text, which
# is why quotes, slashes and control characters appear with literal backslashes.
_GAME_ENTRY_RE = re.compile(r'<a href=\\"https:\\\/\\\/store\.steampowered\.com\\\/app.{1,2750}\\r\\n\\t\\t\\t\\t')
_TITLE_RE = re.compile(r'class=\\"title\\">.{1,300}<\\\/span')
_PRICE_RE = re.compile(r'class=\\"discount_prices\\">.{1,300}<\\\/div')
_REVIEW_RE = re.compile(r'class=\\"search_review_summary positive\\".{1,300}\\">')
_DATE_RE = re.compile(r'class=\\"col search_released responsive_secondrow\\">.{1,200}<\\\/div>')
_GAME_URL_RE = re.compile(r'<a href=\\"https:\\\/\\\/store\.steampowered\.com\\\/app.{1,100}\\r\\n\\t\\t\\t')


def scrape_game_info(records, genres, url, timeout=30):
    """Scrape one page of Steam search results plus each listed game's genres.

    Every game entry found in the response at ``url`` is parsed for its
    title, prices, review summary and release date. The game's own store
    page is then fetched to collect its genre links. A game whose entry or
    store page cannot be parsed is reported on stdout and skipped, so
    ``records`` and ``genres`` always grow together.

    Only review summaries whose markup carries the ``positive`` class are
    recognised; any other game gets ``None`` for all review fields.

    When the module-level ``debug`` flag is true, only the first entry of the
    page is processed and its raw markup is printed.

    Args:
        records: List to extend in place with one
            ``[title, pre-discount price, final price, review percent,
            review count, review class, release date]`` row per game.
        genres: List to extend in place with one list of genre names per game.
        url: Address of the results page to fetch.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        tuple: The same ``records`` and ``genres`` lists that were passed in.

    Raises:
        requests.RequestException: The results page itself could not be
            fetched. Failures on individual game pages are caught and the
            game is skipped.
    """
    page = requests.get(url, timeout=timeout)
    result = page.content.decode()

    results = _GAME_ENTRY_RE.findall(result)

    # Debug mode looks at the first entry only; slicing keeps an empty page from raising
    entries = results[:1] if debug else results

    for current in entries:
        # DEBUGGING - Print the info about the current game to see the request content structure
        if debug:
            print(current)

        try:
            # Get 'title'
            title = _TITLE_RE.findall(current)[0]
            title = title[title.index(">")+1:title.index(r"<\/span>")]

            # Get 'prices'
            price_match = _PRICE_RE.search(current)
            if price_match is None:  # No price information given
                discount_price = None
                final_price = None
            else:
                price = price_match.group(0)  # All price information
                if "discount_original_price" in price:  # The price of the game before discount
                    discount_price = price[price.index("original_price")+17:price.index(r"<\/div>")]
                    price = price[price.index(r"<\/div>")+7:]
                else:  # There is no discount
                    discount_price = None
                if 'free' in price:
                    final_price = 'Free'
                else:
                    # The current price of the game, either regular or after discount
                    final_price = price[price.index('final_price\\')+14:price.index(r"<\/div>")]
                if discount_price is None:  # If no discount, set pre-discount price to same as current (final) price
                    discount_price = final_price

            # Get 'reviews'
            review_match = _REVIEW_RE.search(current)
            if review_match is None:
                review_class = None  # Overwhelmingly positive, positive, mixed etc.
                review_percent = None  # Percent of reviews that are positive
                review_absolute = None  # Number of reviews
            else:
                review = review_match.group(0)
                review_class = review[review.index('html=\\"')+7:review.index("&lt")]
                reviews = review[review.index('&gt;')+4:review.index('.')]
                review_percent = reviews[:reviews.index('%')]
                review_absolute = reviews[reviews.index('the ')+4:reviews.index(' user')].replace(",", "")

            # Get 'date'
            date_match = _DATE_RE.search(current)
            if date_match is None:
                date = None
            else:
                date = date_match.group(0)
                date = date[date.index(r'\r\n')+4:date.index(r'<\/div>')].strip()
                if date == "":  # If no date, will match to whitespace so will be stripped to empty string
                    date = None

            # Address of the game's own store page, with the JSON escaping removed
            game_page_url = _GAME_URL_RE.findall(current)[0]
            game_page_url = game_page_url[game_page_url.index("https"):game_page_url.index('\\"\\r')].replace("\\", "")

            # Genres are only listed on the game's own page
            game_page = requests.get(game_page_url, timeout=timeout)
            soup = BeautifulSoup(game_page.content, "html.parser")
            genre_info = soup.find("div", {"id": "genresAndManufacturer"})
            game_genres = [
                child.text
                for child in genre_info.findChildren("a", recursive=True)
                if "genre" in str(child)
            ]

            # Append both at the very end so the two lists stay aligned row-for-row
            genres.append(game_genres)
            records.append([title, discount_price, final_price, review_percent, review_absolute, review_class, date])

        except Exception:
            # Best effort: report the entry and carry on. Catching Exception rather than
            # everything keeps Ctrl-C / kernel interrupt working during a long scrape.
            print('[INFO] Cannot parse following info.')
            print(current)
            print('=====')

    return records, genres

# Scrapes game info in batches of 50 (this must match count=50 in url_raw)
page_size = 50
if debug:
    lim = 1
else:
    # Round up so that the final, partially filled page is requested as well
    lim = -(-game_count // page_size)
records = []
genres = []
print('[INFO] Scraping... ')
for i in tqdm(range(lim)):
    # The '#' placeholder in url_raw becomes the start offset of this page
    records, genres = scrape_game_info(records, genres, url_raw.replace('#', str(i * page_size)))
    # Optional periodic backup of the partial results (disabled):
    # if i % 5 == 0:
    #     with open('data_records_backup.csv', 'w', newline='') as f:
    #         writer = csv.writer(f)
    #         writer.writerows(records)
    #         f.close()
    #     with open('data_genres_backup.csv', 'w', newline='') as f:
    #         writer = csv.writer(f)
    #         writer.writerows(genres)
    #         f.close()

[INFO] Scraping... 


  1%|█                                                                            | 21/1583 [22:43<16:07:49, 37.18s/it]

## One-hot encode genres and convert data to a pandas DataFrame

In [41]:
def one_hot_encode_genres(X):
    """Build 0/1 indicator columns for every genre that occurs in ``X``.

    Args:
        X: Iterable of per-game genre collections (e.g. a list of lists of
            genre names). Genre names must be hashable.

    Returns:
        dict: Maps each genre, in order of first appearance, to a list with
        exactly one entry per game: 1 if the game has that genre, else 0.
        A genre repeated within a single game still contributes a single 1,
        so all lists have the same length.
    """
    # dict.fromkeys de-duplicates while keeping first-appearance order
    genres = list(dict.fromkeys(genre for game in X for genre in game))

    genre_dict = {genre: [] for genre in genres}

    # Add exactly one flag per genre per game so every column stays aligned
    for game in X:
        present = set(game)
        for genre in genres:
            genre_dict[genre].append(1 if genre in present else 0)

    return genre_dict

# Scraped per-game fields, one row per game
df = pd.DataFrame(
    records,
    columns=[
        'title',
        'pre-discount-price',
        'sale-price',
        'positive-review-percent',
        'num-reviews',
        'review-class',
        'date-released',
    ],
)

# One 0/1 indicator column per genre, in the same row order as the records
one_hot_genres = one_hot_encode_genres(genres)
genre_df = pd.DataFrame.from_dict(one_hot_genres)

# Place the genre indicator columns to the right of the scraped fields
df = pd.concat([df, genre_df], axis=1)
df

Unnamed: 0,title,pre-discount-price,sale-price,positive-review-percent,num-reviews,review-class,date-released,Adventure,RPG,Strategy
0,Baldur's Gate 3,£49.99,£44.99,96,454201,Overwhelmingly Positive,"3 Aug, 2023",1,1,1


## Change date format

In [5]:
from sklearn.preprocessing import LabelEncoder

# Change month into number format
def switch(date):
    """Translate a three-letter English month abbreviation to a two-digit number.

    Args:
        date: Month abbreviation such as ``'Jan'`` or ``'Sep'`` (case-sensitive).

    Returns:
        The zero-padded month number as a string (``'01'`` to ``'12'``), or
        ``None`` when ``date`` is not one of the twelve known abbreviations.
    """
    month_names = (
        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
    )
    for number, name in enumerate(month_names, start=1):
        if date == name:
            return f'{number:02d}'
    return None

def _to_iso_date(raw):
    """Convert a 'D Mon, YYYY' value to 'YYYY-MM-DD'.

    Values whose string form is 'None' or already contains '-' are returned
    unchanged. Anything else that cannot be parsed becomes the string 'None'.
    """
    text = str(raw)
    if text == 'None' or '-' in text:
        return raw
    try:
        year = text[text.index(", ")+2:]
        month = switch(text[text.index(" ")+1:text.index(",")])  # Turn month into number format
        day = text[:text.index(" ")]
    except ValueError:
        # The expected separators are missing
        return 'None'
    if month is None:
        # Month abbreviation not recognised
        return 'None'
    return year + '-' + month + '-' + day.zfill(2)


# Change date format to YYYY-MM-DD
print('[INFO] Changing date format...')
# Replace the whole column in one assignment: writing through df[col].iloc[i] is
# chained assignment, which is slow and does not update df under pandas Copy-on-Write.
df['date-released'] = [_to_iso_date(value) for value in tqdm(df['date-released'])]

df_raw = df

[INFO] Changing date format...


100%|███████████████████████████████████████████████████████████████████████████| 77851/77851 [03:02<00:00, 427.31it/s]


## Clean up data - remove missing values and change formatting

In [6]:
# Remove rows where review-class, sale-price or pre-discount-price is missing.
# The explicit copy makes the result an independent frame, so the writes below
# land in df itself rather than going through chained assignment on a slice.
df = df.dropna(subset=['review-class', 'sale-price', 'pre-discount-price']).copy()

price_columns = ['pre-discount-price', 'sale-price']

# Free games get a numeric-looking price in both columns
df.loc[df['sale-price'] == 'Free', price_columns] = '0.00'

# Strip the currency symbol, leaving just the amount
for column in price_columns:
    df[column] = df[column].str.replace("£", "", regex=False)

df

100%|███████████████████████████████████████████████████████████████████████████| 29209/29209 [00:54<00:00, 535.92it/s]


Unnamed: 0,title,pre-discount-price,sale-price,positive-review-percent,num-reviews,review-class,date-released
0,Lethal Company,8.50,8.50,98,148630,Overwhelmingly Positive,2023-10-23
1,Baldur's Gate 3,49.99,49.99,96,447993,Overwhelmingly Positive,2023-08-03
2,Counter-Strike 2,0.00,0.00,87,7794022,Very Positive,2012-08-21
4,House Flipper 2,32.49,29.24,83,1792,Very Positive,2023-12-14
5,Rust,34.99,17.49,87,820392,Very Positive,2018-02-08
...,...,...,...,...,...,...,...
34806,Shell's Kitchen,0.00,0.00,70,20,Mostly Positive,2023-11-24
34807,Retro Combat,0.00,0.00,71,38,Mostly Positive,2023-10-28
34808,Touhou Meijinka ~ Song of Divine Tempest,0.00,0.00,75,12,Mostly Positive,2023-12-07
34809,DOMINO The Little One,0.00,0.00,79,54,Mostly Positive,2023-11-23


## Save DataFrame to CSV

In [7]:
def save_df(df, path='data.csv'):
    """Write ``df`` to a CSV file without a header row or index column.

    Args:
        df: DataFrame to write.
        path: Destination file; defaults to ``'data.csv'`` in the current
            working directory.
    """
    df.to_csv(path, header=False, index=False)

# Write the current DataFrame to disk
save_df(df)