In [18]:
#Steam Game Similarity Project
#COP3530

#This file takes a csv file containing the raw information of games on steam, processes it, and outputs two files.

#The first output is a Numpy file containing a 2D Numpy Array of the processed game data
#The second output is a JSON file containing metadata in the following format:
    #A single dictionary with one entry per game (intended to be keyed by the unique AppID), each entry holding that game's metadata
    #(writing a first line with the column names of output 1 is currently disabled in the export cell)

In [None]:
import pandas as pd
import numpy as np
import os

#Location of the raw dataset: resources/raw_data/games.csv
dir, subfolder = "resources", "raw_data"
sub_path = os.path.join(dir, subfolder)
raw_data_path = os.path.join(dir, subfolder, "games.csv")

#Loads the raw .csv file into a pandas DataFrame
df = pd.read_csv(raw_data_path)

df

In [None]:
#Text columns whose missing (NaN) entries are replaced with empty strings
for text_column in ("Categories", "Genres", "Tags", "Name"):
    df[text_column] = df[text_column].fillna('')

#Per-game metadata is copied out before AppID becomes the index,
#so df_metadata keeps AppID as a regular column
metadata_columns = [
    "AppID", "Name", "Release date", "About the game",
    "Developers", "Categories", "Genres", "Tags",
]
df_metadata = df[metadata_columns]

#Columns that are not used for data processing
unused_columns = [
    "Name", "Release date", "Estimated owners", "Peak CCU", "Required age",
    "Price", "Discount", "DLC count", "About the game", "Supported languages",
    "Full audio languages", "Reviews", "Header image", "Website", "Support url",
    "Support email", "Windows", "Mac", "Linux", "Metacritic url", "User score",
    "Score rank", "Achievements", "Recommendations", "Notes",
    "Median playtime forever", "Median playtime two weeks", "Developers",
    "Publishers", "Screenshots", "Movies",
]

#Indexes the main dataframe by AppID, then removes the unused columns
df = df.set_index("AppID")
df = df.drop(columns=unused_columns)

df

In [None]:
    #Separates data within columns for categories/genres/tags to be processed


#Lower-case names of the categories/genres/tags that are kept as features
categories_keep = [
    "co-op", "cross-platform multiplayer", "full controller support",
    "mmo", "mods", "multi-player", "pvp", "single-player", "vr only",
    "vr support", "vr supported"
]

genres_keep = [
    "action", "adventure", "casual", "early access", "education",
    "episodic", "free to play", "gore", "indie", "massively multiplayer",
    "movie", "nudity", "rpg", "racing", "sexual content", "short",
    "simulation", "sports", "strategy", "violent"
]

tags_keep = [
    "arcade", "shooter", "action-adventure", "platformer", "horror",
    "visual novel", "2d platformer", "sexual content", "point & click",
    "fps", "rogue-like", "rogue-lite", "hidden object", "3d platformer",
    "sandbox", "action roguelike", "survival", "side scroller",
    "action rpg", "open world", "bullet hell", "interactive fiction",
    "turn-based strategy", "shoot 'em up", "choose your own adventure",
    "dating sim", "immersive sim", "walking simulator", "clicker",
    "management", "turn-based tactics", "jrpg", "card game", "building",
    "hack and slash", "top-down shooter", "dungeon crawler",
    "survival horror", "precision platformer", "education",
    "tower defense", "life sim", "board game", "idler",
    "third-person shooter", "rts", "time management", "collectathon",
    "arena shooter", "runner", "base building", "strategy rpg",
    "real time tactics", "city builder", "stealth", "beat 'em up",
    "wargame", "flight", "card battler", "2d fighter", "metroidvania",
    "investigation", "party-based rpg", "rhythm", "tactical rpg",
    "match 3", "souls-like", "twin stick shooter", "3d fighter",
    "automobile sim", "word game"
]


def _one_hot_kept(labels, keep):
    """Return 0/1 indicator columns for the comma-separated labels worth keeping.

    labels: a string Series in which each entry is a comma-separated list of labels.
    keep:   lower-case label names to retain; the match ignores the label's case.

    Every distinct label becomes a column (1 if present for that row, 0 if not);
    columns whose lower-cased name is not in `keep` are left out. The surviving
    columns keep their original spelling and order.
    """
    indicators = labels.str.get_dummies(sep=',')
    wanted = set(keep)
    kept_names = [name for name in indicators.columns if name.lower() in wanted]
    return indicators[kept_names]


#Encodes categories, then genres, then tags, appending each group of indicator
#columns to the right of the main dataframe in that order.
#Note: "education" and "sexual content" appear in both genres_keep and tags_keep,
#so the resulting dataframe can contain the same column label more than once.
label_groups = (
    ("Categories", categories_keep),
    ("Genres", genres_keep),
    ("Tags", tags_keep),
)
for source_column, keep_list in label_groups:
    col_data = _one_hot_kept(df[source_column], keep_list)
    df = pd.concat([df, col_data], axis=1)

df


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Attempt to use TF-IDF to more methodically determine weights to be used in Cosine Similarity
# Currently not implemented/finished
# NOTE: everything below is wrapped in a triple-quoted string literal, so none of it is executed.
# The disabled code would build one TfidfVectorizer per text column (Categories/Genres/Tags),
# restricted to the *_keep vocabularies, and concatenate the three resulting frames.

'''#Custom tokenizer used for TF-IDF analysis
def data_delimiter(raw_data):
    #If data is a string, returns an array of individual tokens in string
    if isinstance(raw_data, str):
        return [token.strip() for token in raw_data.split(',')]
    #Returns empty array if data is not a string
    return []

#Creates a vectorizer for the category column that tokenizes the data (also filters out undesired categories)
category_vectorizer = TfidfVectorizer(tokenizer=data_delimiter, vocabulary=categories_keep, lowercase=True)
#Applies a fit and transform to the vectorizer object and creates a matrix containing the TF-IDF values
category_matrix = category_vectorizer.fit_transform(df['Categories'])
#Converts the matrix to a pandas dataframe
tfidf_category = pd.DataFrame(category_matrix.toarray(), columns=category_vectorizer.get_feature_names_out())

#Repeat for genres/tags

genre_vectorizer = TfidfVectorizer(tokenizer=data_delimiter, vocabulary=genres_keep, lowercase=True)
genre_matrix = genre_vectorizer.fit_transform(df['Genres'])
tfidf_genre = pd.DataFrame(genre_matrix.toarray(), columns=genre_vectorizer.get_feature_names_out())

tag_vectorizer = TfidfVectorizer(tokenizer=data_delimiter, vocabulary=tags_keep, lowercase=True)
tag_matrix = tag_vectorizer.fit_transform(df['Tags'])
tfidf_tag = pd.DataFrame(tag_matrix.toarray(), columns=tag_vectorizer.get_feature_names_out())

#Concatonates results for all three dataframes
tfidf_df = pd.concat([tfidf_category, tfidf_genre, tfidf_tag], axis=1)

tfidf_df'''

In [None]:
from sklearn.preprocessing import StandardScaler

#Builds the cleaned feature dataframe without the raw text columns.
#drop() returns a new DataFrame, so df_clean is a real copy (not an alias of df)
#and df keeps its text columns, which lets this cell be re-run without a KeyError.
df_clean = df.drop(columns=["Categories", "Genres", "Tags"])

#Numeric columns that are standardized because their ranges vary widely
numeric_columns = [
    "Metacritic score", "Positive", "Negative",
    "Average playtime forever", "Average playtime two weeks",
]

#Uses a standard scaler object to standardize the highly variable numeric columns
SS_obj = StandardScaler()
input_data = df_clean[numeric_columns]
scaled_data = SS_obj.fit_transform(input_data)

#Writes the scaled values back over the original numeric columns
df_clean[numeric_columns] = scaled_data
df_clean


In [None]:
#Creates an array to store weights for each attribute in cleaned dataframe.
#Weights are positional: the first 5 cover the standardized numeric columns, the
#remaining groups cover the indicator columns appended afterwards.
#NOTE(review): genres_keep lists 20 names and tags_keep lists 71, while the groups
#below are sized 21 and 70 - confirm the split matches the indicator columns that
#the data actually produces.
weights = [0.7] * 5
weights.extend([1.5] * 11)
weights.extend([2] * 21)
weights.extend([2.5] * 70)

col_count = df_clean.shape[1]
if col_count > len(weights):
    raise ValueError(
        f"df_clean has {col_count} columns but only {len(weights)} weights are defined"
    )

#Applies the weights by column POSITION on the underlying array.
#Selecting by label (df_clean[col]) returns every column that shares the label, so a
#label present more than once (e.g. "education" / "sexual content" are in both the
#genre and tag keep-lists) would be multiplied once per occurrence.
#df_clean itself is not modified, so re-running this cell does not re-apply the weights.
#As before, only the first col_count weights are used when there are fewer columns.
weight_vector = np.asarray(weights[:col_count], dtype=float)
weighted_values = df_clean.to_numpy(dtype=float) * weight_vector

#Calculates the euclidean (L2) norm of each row and divides the data in each row by the euclidean norm
#Saves time when performing Cosine Similarity in main algorithm implementations
eucl_norm = np.linalg.norm(weighted_values, axis=1)
#An all-zero row has norm 0; dividing it by 1 keeps it at zero instead of producing NaN
safe_norm = np.where(eucl_norm == 0, 1.0, eucl_norm)
normalized_df = pd.DataFrame(
    weighted_values / safe_norm[:, np.newaxis],
    index=df_clean.index,
    columns=df_clean.columns,
)

normalized_df

In [None]:
import json

#Makes the AppID a main column again and converts the dataframe to 2D numpy array.
#The guard keeps a re-run of this cell from adding a second index column.
if "AppID" not in normalized_df.columns:
    normalized_df = normalized_df.reset_index()
numpy_matrix = normalized_df.to_numpy()

#Creates output paths for 2d numpy array and metadata files
dir = "resources"
subfolder = "preprocessed_data"
sub_path = os.path.join(dir, subfolder)
#The output folder may not exist yet; np.save/open do not create missing directories
os.makedirs(sub_path, exist_ok=True)
numpy_path = os.path.join(sub_path, "preprocessed_data_matrix.npy")
metadata_path = os.path.join(sub_path, "game_metadata.json")

#Writes the 2d numpy array to a npy file
np.save(numpy_path, numpy_matrix)

#Converts dataframe containing metadata to a dictionary, with AppID as the key.
#df_metadata still carries its default row-number index, so the index is set to AppID
#here; drop=False keeps "AppID" inside each entry as well, as it was before.
#to_dict('index') requires the keys (AppIDs) to be unique.
game_metadata_dict = df_metadata.set_index("AppID", drop=False).to_dict('index')

#Writes the metadata dictionary to a JSON file
with open(metadata_path, 'w') as file:
    #Disabled: writing the numpy matrix column headers as a first line
    #json.dump(normalized_df.columns.to_list(), file)
    #file.write('\n')
    json.dump(game_metadata_dict, file, indent=5)

In [None]:
#For testing/easy visualization of processed data
#Both frames use the default row-number index here, so the names line up row by row.
#A separate name is used so re-running this cell does not keep adding Name columns
#to normalized_df.
preview_df = pd.concat([df_metadata[["Name"]], normalized_df], axis=1)
preview_df.to_csv("preprocessed_data.csv", index=False)

#Reads the matrix back from the path it was saved to (numpy_path, set in the export cell),
#not from the current working directory
numpy_matrix_test = np.load(numpy_path)
print(numpy_matrix_test)