In [2]:
# SI 618 Final Project

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import json
import requests
import time

## Steam Store Reviews Data

Review data for each game in the Steam store can be scraped from the endpoint `https://store.steampowered.com/appreviews/<appid>?json=1`, where `<appid>` is the game's Steam app id.

The endpoint returns this data as JSON.

In [4]:
def get_steam_reviews(appid, params=None):
    """
    Get Steam reviews for a specific game/app by scraping the steamstore website endpoint

    Parameters
    ----------
    appid : int
            Steam app id
    params : dict, optional
            Query parameters to pass to the API call. When omitted, the
            request asks for JSON output, the "all" filter and English
            reviews.

    Returns
    -------
    json
            Steam reviews as decoded json

    Raises
    ------
    Any exception raised by ``requests.get`` (including a timeout) or by
    ``response.json()`` is propagated to the caller unchanged.
    """
    # Build the default query per call rather than sharing one dict object
    # between every call through a mutable default argument.
    if params is None:
        params = {
            "json": 1,
            "filter": "all",
            "language": "english",
        }

    base_url = "https://store.steampowered.com/appreviews/"

    # Uses a header to avoid being timed out by steam
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
    }

    # Call API. Without a timeout a stalled connection would block forever.
    response = requests.get(
        base_url + str(appid), headers=headers, params=params, timeout=30
    )

    # Return as json
    return response.json()

In [5]:
def get_n_reviews(appid, n=100):
    """
    Collect up to ``n`` reviews for one app by paging through the reviews endpoint.

    Function code originally written by Andrew Muller at https://andrew-muller.medium.com/scraping-steam-user-reviews-9a43f9e38c92

    At most 100 reviews are requested per call. The cursor returned with each
    page is sent with the next request, and paging stops once ``n`` reviews
    have been requested or a page comes back with fewer than 100 reviews.

    Parameters
    ----------
    appid : int
        The appid of the game to get reviews for
    n : int
        The number of reviews to return

    Returns
    -------
    reviews : list
        A list of dictionaries containing the reviews

    """
    page_size = 100
    query = dict(
        json=1,
        filter="all",
        language="english",
        day_range=9223372036854775807,
        review_type="all",
        purchase_type="all",
    )

    collected = []
    next_cursor = "*"
    remaining = n

    while remaining > 0:
        # The same query dict is reused; only cursor and page size change.
        query["cursor"] = next_cursor.encode()
        query["num_per_page"] = min(page_size, remaining)
        remaining -= page_size

        payload = get_steam_reviews(appid, query)
        next_cursor = payload["cursor"]
        batch = payload["reviews"]
        collected.extend(batch)

        # A short page ends the paging early.
        if len(batch) < page_size:
            break

    return collected

## Steam Store Game Data



In [6]:
def get_steam_game_data(params=None):
    """
    Uses the steamspy API to return data for games on the steam marketplace
    default params are for the top 100 games by playtime forever, all games can be retrieved by changing the request param to all

    Parameters
    ----------
    params : dict, optional
        Query parameters to pass to the API call. When omitted,
        ``{"request": "top100forever", "format": "json"}`` is used.

    Returns
    -------
    json
        Steam game data as decoded json

    Raises
    ------
    Any exception raised by ``requests.get`` (including a timeout) or by
    ``response.json()`` is propagated to the caller unchanged.
    """
    # Build the default query per call rather than sharing one dict object
    # between every call through a mutable default argument.
    if params is None:
        params = {"request": "top100forever", "format": "json"}

    base_url = "https://steamspy.com/api.php?"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
    }

    # Without a timeout a stalled connection would block forever.
    response = requests.get(
        base_url, params=params, headers=headers, timeout=30
    )
    return response.json()

In [7]:
def get_all_steam_data(n):
    """
    when passing 'all' as the request param, the steamspy API returns data for 1000 games per page. This function requests successive pages, using the page param, until n games have been requested.
    After each page has been returned, the data collected so far is written to "steam_data.csv" before the next page is retrieved. This is done to avoid losing all data if a later API call fails.

    The function returns the in-memory dataframe holding every page that was fetched.

    Parameters
    ----------
    n : int
        The number of games to return

    Returns
    -------
    df : dataframe
        A dataframe containing the data for the fetched games, indexed by
        the keys of the API response. Empty when ``n`` is not positive.
    """
    params = {"request": "all", "format": "json"}
    frames = []
    df = pd.DataFrame()
    # NOTE(review): paging starts at 1 as in the original code; confirm
    # against the SteamSpy API docs whether the first page is actually 0.
    page = 1
    while n > 0:
        params["page"] = page
        response = get_steam_game_data(params)
        if not response:
            # An empty page carries no games; presumably the listing is
            # exhausted, so stop requesting further pages.
            break
        frames.append(pd.DataFrame.from_dict(response, orient="index"))
        # DataFrame.append no longer exists in pandas 2.x; concat the pages.
        df = pd.concat(frames)
        n -= 1000
        # Advance to the next page (the original never moved past page 1).
        page += 1
        # Checkpoint everything fetched so far.
        df.to_csv("steam_data.csv")

    return df

## Steam Store Listing Data

Documentation for the Storefront API was found at https://wiki.teamfortress.com/wiki/User:RJackson/StorefrontAPI

In [8]:
def get_steam_store_data(appid):
    """
    Get the store listing data for a single app from the Steam store
    ``appdetails`` endpoint.

    Parameters
    ----------
    appid : int
        Steam app id, sent as the ``appids`` query parameter

    Returns
    -------
    json
        The decoded json response. ``create_store_df`` reads the listing
        from ``response[str(appid)]["data"]``.

    Raises
    ------
    Any exception raised by ``requests.get`` (including a timeout) or by
    ``response.json()`` is propagated to the caller unchanged.
    """
    base_url = "https://store.steampowered.com/api/appdetails/"

    params = {"appids": appid}

    # Without a timeout a stalled connection would block forever.
    response = requests.get(base_url, params=params, timeout=30)
    return response.json()

## Create Dataframes

In [9]:
def _review_to_row(appid, review):
    """Flatten one review dictionary into a flat record tagged with ``appid``."""
    author = review["author"]
    return {
        "appid": appid,
        "recommendationid": review["recommendationid"],
        "author": author["steamid"],
        "author_playtime_forever": author["playtime_forever"],
        "author_playtime_last_two_weeks": author["playtime_last_two_weeks"],
        "author_last_played": author["last_played"],
        "author_num_reviews": author["num_reviews"],
        "votes_up": review["votes_up"],
        "votes_funny": review["votes_funny"],
        "weighted_vote_score": review["weighted_vote_score"],
        "comment_count": review["comment_count"],
        "timestamp_created": review["timestamp_created"],
        "timestamp_updated": review["timestamp_updated"],
        "steam_purchase": review["steam_purchase"],
        "received_for_free": review["received_for_free"],
        "written_during_early_access": review["written_during_early_access"],
        "review": review["review"],
    }


def create_review_df(game_df, n_reviews=100):
    """
    Build a dataframe of reviews for every app listed in ``game_df``.

    Parameters
    ----------
    game_df : dataframe
        Dataframe with an ``appid`` column listing the apps to fetch
    n_reviews : int
        Number of reviews requested per app (passed to ``get_n_reviews``)

    Returns
    -------
    dataframe
        One row per review, with the app id, the author fields and the
        review fields flattened into columns.

    Notes
    -----
    If fetching or flattening fails for an app, its appid is printed and
    processing continues with the next app. Rows appended for that app
    before the failure are kept.
    """
    review_list = []
    for appid in game_df.appid:
        try:
            # Get Review Data for each appid in the dataframe
            for review in get_n_reviews(appid, n_reviews):
                review_list.append(_review_to_row(appid, review))
        except Exception:
            # Catch Exception rather than everything, so KeyboardInterrupt
            # and SystemExit can still stop a long-running scrape.
            print(appid)

    return pd.DataFrame(review_list)

In [10]:
def create_store_df(game_df):
    """
    Build a dataframe of store listing data for every app listed in ``game_df``.

    Parameters
    ----------
    game_df : dataframe
        Dataframe with an ``appid`` column listing the apps to fetch

    Returns
    -------
    dataframe
        One row per app whose response contained a ``"data"`` entry under
        ``str(appid)``.

    Notes
    -----
    If the request fails or the expected keys are missing for an app, its
    appid is printed and the app is skipped.
    """
    store_data_list = []
    for appid in game_df.appid:
        try:
            store_data_list.append(
                get_steam_store_data(appid)[str(appid)]["data"]
            )
        except Exception:
            # Catch Exception rather than everything, so KeyboardInterrupt
            # and SystemExit can still stop a long-running scrape.
            print(appid)

    return pd.DataFrame(store_data_list)

# Data to dataframes

In [11]:
# Load the game data from a local CSV file. The commented-out line below
# would instead build game_df from a live get_steam_game_data() call.
# game_df = pd.DataFrame().from_dict(get_steam_game_data(), orient="index")
game_df = pd.read_csv("data/steamspy_data.csv")

In [12]:
# Add the total review count (positive + negative) per game, then keep
# only the games with more than 100 reviews.
game_df = game_df.assign(
    n_reviews=lambda frame: frame["positive"] + frame["negative"]
).loc[lambda frame: frame["n_reviews"] > 100]

In [13]:
# Load the review data from a local CSV file
review_df = pd.read_csv("data/steam_review_data.csv")

In [24]:
# Keep only the reviews whose appid also appears in game_df
review_df = review_df.loc[review_df["appid"].isin(game_df["appid"])]

14642

In [27]:
# Save the review data as a gzip-compressed CSV, without the index column
review_df.to_csv(
    "data/steam_review_data1.csv.gz", compression="gzip", index=False
)

In [16]:
# Preview the first rows of the review data
review_df.head()

Unnamed: 0.1,Unnamed: 0,appid,recommendationid,author,author_playtime_forever,author_playtime_last_two_weeks,author_last_played,author_num_reviews,votes_up,votes_funny,weighted_vote_score,comment_count,timestamp_created,timestamp_updated,steam_purchase,received_for_free,written_during_early_access,review
0,0,10,22451084,76561198097242611,216835,243,1697252981,11,405,0,0.976047,1,1460808481,1460808481,False,False,False,Best FPS game i ever played
1,1,10,32025207,76561198149765695,822,0,1688071376,4,428,284,0.953599,20,1495993232,1499801573,True,False,False,How to correctly play this game:\n-Noisiest fu...
2,2,10,28057463,76561197962340525,409384,0,1662243139,1,248,9,0.95077,220,1480429968,1653066625,True,False,False,[h1]⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ⠀⠀ ♡[/h1]
3,3,10,15153766,76561198026396278,512754,919,1697938165,7,2487,4130,0.947676,120,1427890049,1480048737,False,False,False,Ruined my life.
4,4,10,6941190,76561198080043584,776344,0,1696323991,29,195,0,0.938016,9,1357415282,1547414438,False,False,False,Actually the best game in this world. It still...


In [17]:
# Number of rows and columns in the review data
review_df.shape

(2117312, 18)

In [18]:
# Preview the first rows of the game data
game_df.head()

Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,...,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags,n_reviews
0,10,Counter-Strike,Valve,Valve,,193046,4940,0,"10,000,000 .. 20,000,000",8486,...,196,128,999.0,999.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,16360,"{'Action': 5379, 'FPS': 4801, 'Multiplayer': 3...",197986
1,20,Team Fortress Classic,Valve,Valve,,5412,895,0,"5,000,000 .. 10,000,000",551,...,14,58,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,87,"{'Action': 745, 'FPS': 306, 'Multiplayer': 257...",6307
2,30,Day of Defeat,Valve,Valve,,5003,556,0,"5,000,000 .. 10,000,000",421,...,23,73,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Spain",Action,122,"{'FPS': 787, 'World War II': 248, 'Multiplayer...",5559
3,40,Deathmatch Classic,Valve,Valve,,1852,412,0,"5,000,000 .. 10,000,000",318,...,7,0,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,7,"{'Action': 629, 'FPS': 139, 'Classic': 107, 'M...",2264
4,50,Half-Life: Opposing Force,Gearbox Software,Valve,,13262,662,0,"5,000,000 .. 10,000,000",676,...,150,0,499.0,499.0,0.0,"English, French, German, Korean",Action,122,"{'FPS': 881, 'Action': 322, 'Classic': 251, 'S...",13924
