In [34]:
import os
import pandas as pd
import numpy as np
import seaborn as sns

In [35]:
# Paths for each csv file
base_path = os.path.join(os.getcwd(), '..', 'steam-insights-main')

games_df_path = os.path.join(base_path, 'games.csv')
descriptions_path = os.path.join(base_path, 'descriptions.csv')
categories_path = os.path.join(base_path, 'categories.csv')
genres_path = os.path.join(base_path, 'genres.csv')
promotional_path = os.path.join(base_path, 'promotional.csv')
reviews_path = os.path.join(base_path, 'reviews.csv')
steamspy_insights_path = os.path.join(base_path, 'steamspy_insights.csv') 
tags_path = os.path.join(base_path, 'tags.csv')

In [36]:
# Read the CSV files 
games = pd.read_csv(
    games_df_path,
    usecols=["app_id", "name"],
    quotechar='"',        # Specify the quote character
    doublequote=True,     # Handle double quotes
    escapechar='\\',      # Specify the escape character
    delimiter=',',        # Specify the delimiter
    engine='python',      # Use the Python parsing engine for better handling of complex cases
    on_bad_lines='skip'   # Skip bad lines
)

descriptions = pd.read_csv(
    descriptions_path,
    quotechar='"',        # Specify the quote character
    doublequote=True,     # Handle double quotes
    escapechar='\\',      # Specify the escape character
    delimiter=',',        # Specify the delimiter
    engine='python',      # Use the Python parsing engine for better handling of complex cases
    on_bad_lines='skip'   # Skip bad lines
)

steamspy = pd.read_csv(
    steamspy_insights_path,
    usecols=["app_id", "developer", "owners_range", "price"],
    quotechar='"',
    doublequote=True,
    escapechar='\\',
    delimiter=',',
    engine='python',
    on_bad_lines='skip'
)

tags_df = pd.read_csv(
    tags_path,
    quotechar='"',
    doublequote=True,
    escapechar='\\',
    delimiter=',',
    engine='python',
    on_bad_lines='skip'
)

tags_grouped = tags_df.groupby("app_id")["tag"].apply(lambda x: ", ".join(x.astype(str))).reset_index()

genres_df = pd.read_csv(
    genres_path,
    usecols=["app_id", "genre"],
    quotechar='"',
    doublequote=True,
    escapechar='\\',
    delimiter=',',
    engine='python',
    on_bad_lines='skip'
)

reviews = pd.read_csv(
    reviews_path,
    usecols=["app_id", "review_score", "review_score_description", "positive", "negative", "total", "metacritic_score", "recommendations"],
    quotechar='"',
    doublequote=True,
    escapechar='\\',
    delimiter=',',
    engine='python',
    on_bad_lines='skip'
)


In [37]:
# Merge data 
df_merged = pd.merge(games, steamspy, on="app_id", how="left")
df_merged = pd.merge(df_merged, tags_grouped, on="app_id", how="left")
df_merged = pd.merge(df_merged, genres_df, on="app_id", how="left")
df_merged = pd.merge(df_merged, reviews, on="app_id", how="left")


df_merged['price'] = pd.to_numeric(df_merged['price'], errors='coerce')
df_merged['price'] = df_merged['price'] / 100

df_merged.head()


Unnamed: 0,app_id,name,developer,owners_range,price,tag,genre,review_score,review_score_description,positive,negative,total,metacritic_score,recommendations
0,10,Counter-Strike,Valve,"10,000,000 .. 20,000,000",9.99,"1980s, 1990's, Action, Assassin, Classic, Comp...",Action,9,Overwhelmingly Positive,235403,6207,241610,88,153259
1,20,Team Fortress Classic,Valve,"5,000,000 .. 10,000,000",4.99,"1990's, Action, Class-Based, Classic, Co-op, C...",Action,8,Very Positive,7315,1094,8409,N,6268
2,30,Day of Defeat,Valve,"5,000,000 .. 10,000,000",4.99,"Action, Class-Based, Classic, Co-op, Difficult...",Action,8,Very Positive,6249,672,6921,79,4146
3,40,Deathmatch Classic,Valve,"5,000,000 .. 10,000,000",4.99,"1990's, Action, Arena Shooter, Classic, Co-op,...",Action,8,Very Positive,2542,524,3066,N,2218
4,50,Half-Life: Opposing Force,Gearbox Software,"2,000,000 .. 5,000,000",4.99,"1990's, Action, Adventure, Aliens, Atmospheric...",Action,9,Overwhelmingly Positive,22263,1111,23374,N,20144


In [None]:
# IDEA
# use the owners range (from steamspy_insight.csv) and analyse how it relates to the game’s price.
# take tags/genres into consideration
# maybe then make a model that guesses how many players there will be based on the price and tags/genres



