# Steam game review analysis

## Imports

In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
import json

# When True, the scraping loop further down fetches a single results page instead of all of them.
debug = False
# Human-facing Steam search page for English-language top sellers.
url = 'https://store.steampowered.com/search/?supportedlang=english&filter=topsellers&ndl=1'
# Paged results endpoint used for scraping. The '#' after 'start=' is a placeholder that the
# scraping loop replaces with the result offset (a multiple of the count=50 page size).
url_raw = 'https://store.steampowered.com/search/results/?query&start=#&count=50&dynamic_data=&sort_by=_ASC&supportedlang=english&snr=1_7_7_7000_7&filter=topsellers&infinite=1'

## Scrape game information

In [130]:
def scrape_game_info(records, url, timeout=30):
    """Fetch one page of Steam search results and append one row per game.

    Each appended row is a list in the order
    [title, pre-discount price, final price, positive-review percent,
    number of reviews, review class, release date]. Every field is a string
    taken from the page, or None when the corresponding markup is not found.

    Args:
        records: List that the parsed rows are appended to (mutated in place).
        url: Address of the search-results page to download.
        timeout: Seconds to wait for the HTTP response before requests raises.

    Returns:
        The same ``records`` list, for convenient reassignment by the caller.

    Raises:
        IndexError: If a matched entry contains no title markup.
        ValueError: If an entry's markup lacks a delimiter the slicing expects.
    """
    page = requests.get(url, timeout=timeout)
    result = page.content.decode()

    # The patterns match literal backslash sequences (\" \/ \r\n \t), i.e. the
    # body is parsed while still escaped (presumably JSON-wrapped HTML - confirm).
    entry_re = re.compile(r'<a href=\\"https:\\\/\\\/store\.steampowered\.com\\\/app.{1,2750}\\r\\n\\t\\t\\t\\t')
    title_re = re.compile(r'class=\\"title\\">.{1,300}<\\\/span')
    price_re = re.compile(r'class=\\"discount_prices\\">.{1,300}<\\\/div')
    review_re = re.compile(r'class=\\"search_review_summary positive\\".{1,300}\\">')
    date_re = re.compile(r'class=\\"col search_released responsive_secondrow\\">.{1,200}<\\\/div>')

    for current in entry_re.findall(result):
        # Get 'title'
        title = title_re.findall(current)[0]
        # The pattern ends at '<\/span' (no '>'), so slice up to that same
        # marker; searching for '<\/span>' only worked when the greedy match
        # happened to run on past a later closing tag.
        title = title[title.index(">") + 1:title.index(r"<\/span")]

        # Get 'prices'
        price_match = price_re.search(current)
        if price_match is None:
            discount_price = None
            final_price = None
        else:
            price = price_match.group(0)
            if "discount_original_price" in price:
                closing = price.index(r"<\/div")
                # +17 skips 'original_price\">' to reach the price text.
                discount_price = price[price.index("original_price") + 17:closing]
                # +7 skips the closing '<\/div>' so only the final-price markup remains.
                price = price[closing + 7:]
            else:
                discount_price = None
            if 'free' in price:
                final_price = 'Free'
            else:
                # +14 skips 'final_price\">' to reach the price text.
                final_price = price[price.index('final_price\\') + 14:price.index(r"<\/div")]

        # Get 'reviews' (only summaries carrying the 'positive' class are matched)
        review_match = review_re.search(current)
        if review_match is None:
            review_class = None  # Text label shown before the '<br>' in the tooltip
            reviews = None
            review_percent = None  # Percent of reviews that are positive
            review_absolute = None  # Number of reviews
        else:
            review = review_match.group(0)
            # +7 skips 'html=\"'; the label runs up to the HTML-escaped '<br>'.
            review_class = review[review.index('html=\\"') + 7:review.index("&lt")]
            # Sentence between the escaped '<br>' and the first full stop.
            reviews = review[review.index('&gt;') + 4:review.index('.')]
            review_percent = reviews[:reviews.index('%')]
            review_absolute = reviews[reviews.index('the ') + 4:reviews.index(' user')].replace(",", "")

        # Get 'date'
        date_match = date_re.search(current)
        if date_match is None:
            date = None
        else:
            date = date_match.group(0)
            # +4 skips the literal four-character '\r\n' that precedes the date text.
            date = date[date.index(r'\r\n') + 4:date.index(r'<\/div>')].strip()
            if date == "":  # If no date, will match to whitespace so will be stripped to empty string
                date = None

        records.append([title, discount_price, final_price, review_percent, review_absolute, review_class, date])

    return records


# Debug runs fetch a single results page; normal runs fetch 150 pages.
lim = 1 if debug else 150

records = []
for i in range(lim):
    # Fill the '#' placeholder in the URL template with this page's start offset.
    page_url = url_raw.replace('#', str(i*50))
    records = scrape_game_info(records, page_url)

# Column order mirrors the order of the fields in each scraped row.
column_names = [
    'title',
    'pre-discount-price',
    'sale-price',
    'positive-review-percent',
    'num-reviews',
    'review-class',
    'date-released',
]
df = pd.DataFrame(records, columns=column_names)
df

                                             title pre-discount-price  \
0                                       Steam Deck               None   
1                                  House Flipper 2             £32.49   
2                                  Baldur's Gate 3               None   
3                                   Lethal Company               None   
4           Five Nights at Freddy's: Help Wanted 2               None   
...                                            ...                ...   
7378           Mystery Solitaire. Cthulhu Mythos 2              £4.29   
7379  Worms Ultimate Mayhem - Multiplayer Pack DLC              £1.99   
7380                                   Apocalipsis              £5.89   
7381                                   Pretty Neko              £0.89   
7382                                        Reflex               None   

     sale-price positive-review-percent num-reviews             review-class  \
0       £309.00                    None    

## Change date format

In [148]:
from sklearn.preprocessing import LabelEncoder

# Change month into number format
def switch(date):
    """Translate a three-letter English month abbreviation into its two-digit number.

    'Jan' maps to '01' through 'Dec' to '12'. The comparison is exact
    (case-sensitive); any other value yields None.
    """
    month_names = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    # A month's position in the calendar-ordered tuple is its number.
    for number, name in enumerate(month_names, start=1):
        if date == name:
            return str(number).zfill(2)
    return None

# Change date format to YYYY-MM-DD
# Values that are missing (stringify to 'None') or already contain '-' are left
# untouched. Anything else is expected to look like 'D Mon, YYYY'; values that
# do not fit that shape are replaced by the string 'None'.
date_col = df.columns.get_loc('date-released')
for i, raw_date in enumerate(df['date-released'].tolist()):
    date = str(raw_date)
    if date == 'None' or '-' in date:
        continue
    try:
        year = date[date.index(", ")+2:]
        month = switch(date[date.index(" ")+1:date.index(",")])  # Turn month into number format
        day = date[:date.index(" ")]
        date = year + '-' + month + '-' + day.zfill(2)
    except (ValueError, TypeError):
        # ValueError: an expected separator (' ' or ', ') is missing.
        # TypeError: switch() returned None for an unrecognised month name.
        date = 'None'
    # Write through the DataFrame itself: the chained form
    # df['date-released'].iloc[i] = ... can end up assigning to a temporary copy.
    df.iat[i, date_col] = date

# NOTE(review): this binds df_raw to the same DataFrame object as df (no copy is
# made), so later in-place edits to df are visible through df_raw - confirm intended.
df_raw = df

## Clean up missing values