In [31]:
#Steam Game Similarity Project
#COP3530

#This file takes a csv file containing the raw information of games on steam, processes it, and outputs two files.

#The first output is a Numpy file containing a 2D Numpy Array of the processed game data
#The second output is a JSON file containing metadata in the following format:
    #First line contains a list of column names for all the columns in output 1
    #Second line onwards contains a dictionary for every game, mapping the unique AppID to various game metadata

In [32]:
import pandas as pd
import numpy as np

#Reads raw data file (.csv) and creates a Panda dataframe from it, setting the index to the AppID
df = pd.read_csv(r"data\games.csv")
df.set_index("AppID", inplace=True)

df

Unnamed: 0_level_0,Name,Release date,Estimated owners,Peak CCU,Required age,Price,Discount,DLC count,About the game,Supported languages,...,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags,Screenshots,Movies
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20200,Galactic Bowling,"Oct 21, 2008",0 - 20000,0,0,19.99,0,0,Galactic Bowling is an exaggerated and stylize...,['English'],...,0,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
655370,Train Bandit,"Oct 12, 2017",0 - 20000,0,0,0.99,0,0,THE LAW!! Looks to be a showdown atop a train....,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
1732930,Jolt Project,"Nov 17, 2021",0 - 20000,0,0,4.99,0,0,Jolt Project: The army now has a new robotics ...,"['English', 'Portuguese - Brazil']",...,0,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
1355720,Henosis™,"Jul 23, 2020",0 - 20000,0,0,5.99,0,0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
1139950,Two Weeks in Painland,"Feb 3, 2020",0 - 20000,0,0,0.00,0,0,ABOUT THE GAME Play as a hacker who has arrang...,"['English', 'Spanish - Spain']",...,0,0,0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3600970,Paragon Of Time,"Apr 10, 2025",0 - 20000,0,0,2.99,0,0,"You stand at the edge of time, trying to save ...",['English'],...,0,0,0,Webcess,Webcess,"Single-player,Full controller support,Steam Cl...","Action,Casual,Indie","Action Roguelike,Bullet Hell,Hack and Slash,Ro...",https://shared.akamai.steamstatic.com/store_it...,http://video.akamai.steamstatic.com/store_trai...
3543710,A Few Days With : Hazel,"Apr 11, 2025",0 - 20000,0,0,2.69,10,0,"Join Hazel, an attractive young lady, and enjo...","['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,0,Hentai Panda,Hentai Panda,"Single-player,Steam Achievements,Steam Cloud,F...","Casual,Indie",,https://shared.akamai.steamstatic.com/store_it...,http://video.akamai.steamstatic.com/store_trai...
3265370,MosGhost,"Apr 1, 2025",0 - 20000,0,0,7.99,0,0,Story : Andrei moved to Moscow for work and re...,"['English', 'Russian', 'French', 'Italian', 'G...",...,0,0,0,Sinka Games,"Sinka Games,Arkuda Inc.","Single-player,Family Sharing",Simulation,"Simulation,Walking Simulator,Idler,First-Perso...",https://shared.akamai.steamstatic.com/store_it...,http://video.akamai.steamstatic.com/store_trai...
3423620,AccuBow VR,"Mar 11, 2025",0 - 0,0,0,0.00,0,0,AccuBow VR: Master Realistic Archery in Immers...,['English'],...,0,0,0,AccuBow LLC,AccuBow LLC,"Single-player,Tracked Controller Support,VR On...","Action,Adventure,Free To Play",,https://shared.akamai.steamstatic.com/store_it...,http://video.akamai.steamstatic.com/store_trai...


In [33]:
#Replaces invalid NaN fields with blank strings
df['Categories'] = df['Categories'].fillna('')
df['Genres'] = df['Genres'].fillna('')
df['Tags'] = df['Tags'].fillna('')
df['Name'] = df['Name'].fillna('')

#Creates a new dataframe for the metadata of each game
df_metadata = df[["Name", "Release date", "About the game", "Developers", "Categories", "Genres", "Tags"]]

#Drops columns that aren't needed for data processing
df.drop(columns = ["Name", "Release date", "Estimated owners", "Peak CCU", "Required age", "Price", "Discount", "DLC count", "About the game", "Supported languages", "Full audio languages", "Reviews", "Header image", "Website", "Support url", "Support email", "Windows", "Mac", "Linux", "Metacritic url", "User score", "Score rank", "Achievements", "Recommendations", "Notes", "Median playtime forever", "Median playtime two weeks", "Developers", "Publishers", "Screenshots", "Movies"], inplace = True)

df

Unnamed: 0_level_0,Metacritic score,Positive,Negative,Average playtime forever,Average playtime two weeks,Categories,Genres,Tags
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
20200,0,6,11,0,0,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling"
655370,0,53,5,0,0,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc..."
1732930,0,0,0,0,0,Single-player,"Action,Adventure,Indie,Strategy",
1355720,0,3,0,0,0,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz..."
1139950,0,50,8,0,0,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,..."
...,...,...,...,...,...,...,...,...
3600970,0,5,0,0,0,"Single-player,Full controller support,Steam Cl...","Action,Casual,Indie","Action Roguelike,Bullet Hell,Hack and Slash,Ro..."
3543710,0,0,0,0,0,"Single-player,Steam Achievements,Steam Cloud,F...","Casual,Indie",
3265370,0,24,12,0,0,"Single-player,Family Sharing",Simulation,"Simulation,Walking Simulator,Idler,First-Perso..."
3423620,0,0,0,0,0,"Single-player,Tracked Controller Support,VR On...","Action,Adventure,Free To Play",


In [34]:
    #Separates data within columns for categories/genres/tags to be processed


#Lists containing categories/genres/tags to be used
categories_keep = [
    "co-op", "cross-platform multiplayer", "full controller support",
    "mmo", "mods", "multi-player", "pvp", "single-player", "vr only",
    "vr support", "vr supported"
]

genres_keep = [
    "action", "adventure", "casual", "early access", "education",
    "episodic", "free to play", "gore", "indie", "massively multiplayer",
    "movie", "nudity", "rpg", "racing", "sexual content", "short",
    "simulation", "sports", "strategy", "violent"
]

tags_keep = [
    "arcade", "shooter", "action-adventure", "platformer", "horror",
    "visual novel", "2d platformer", "sexual content", "point & click",
    "fps", "rogue-like", "rogue-lite", "hidden object", "3d platformer",
    "sandbox", "action roguelike", "survival", "side scroller",
    "action rpg", "open world", "bullet hell", "interactive fiction",
    "turn-based strategy", "shoot 'em up", "choose your own adventure",
    "dating sim", "immersive sim", "walking simulator", "clicker",
    "management", "turn-based tactics", "jrpg", "card game", "building",
    "hack and slash", "top-down shooter", "dungeon crawler",
    "survival horror", "precision platformer", "education",
    "tower defense", "life sim", "board game", "idler",
    "third-person shooter", "rts", "time management", "collectathon",
    "arena shooter", "runner", "base building", "strategy rpg",
    "real time tactics", "city builder", "stealth", "beat 'em up",
    "wargame", "flight", "card battler", "2d fighter", "metroidvania",
    "investigation", "party-based rpg", "rhythm", "tactical rpg",
    "match 3", "souls-like", "twin stick shooter", "3d fighter",
    "automobile sim", "word game"
]

#Converts the lists of categories for all the games to column headers with binary values (1 if present for a game, 0 if not present) and stores it in a new dataframe
col_data = df["Categories"].str.get_dummies(sep=',')
#Iterates through each column in new category dataframe and removes any columns that aren't named above
for col in col_data.columns:
    if col.lower() not in categories_keep:
        col_data.drop(columns = col, inplace = True)
#Concatonates the main dataframe with new, processed categories dataframe
df = pd.concat([df, col_data], axis=1)

#Repeat for genres/tags columns

col_data = df["Genres"].str.get_dummies(sep=',')
for col in col_data.columns:
    if col.lower() not in genres_keep:
        col_data.drop(columns = col, inplace = True)
df = pd.concat([df, col_data], axis=1)

col_data = df["Tags"].str.get_dummies(sep=',')
for col in col_data.columns:
    if col.lower() not in tags_keep:
        col_data.drop(columns = col, inplace = True)
df = pd.concat([df, col_data], axis=1)

df


Unnamed: 0_level_0,Metacritic score,Positive,Negative,Average playtime forever,Average playtime two weeks,Categories,Genres,Tags,Co-op,Cross-Platform Multiplayer,...,Time Management,Top-Down Shooter,Tower Defense,Turn-Based Strategy,Turn-Based Tactics,Twin Stick Shooter,Visual Novel,Walking Simulator,Wargame,Word Game
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20200,0,6,11,0,0,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",0,0,...,0,0,0,0,0,0,0,0,0,0
655370,0,53,5,0,0,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",0,0,...,0,0,0,0,0,0,0,0,0,0
1732930,0,0,0,0,0,Single-player,"Action,Adventure,Indie,Strategy",,0,0,...,0,0,0,0,0,0,0,0,0,0
1355720,0,3,0,0,0,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",0,0,...,0,0,0,0,0,0,0,0,0,0
1139950,0,50,8,0,0,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3600970,0,5,0,0,0,"Single-player,Full controller support,Steam Cl...","Action,Casual,Indie","Action Roguelike,Bullet Hell,Hack and Slash,Ro...",0,0,...,0,0,0,0,0,0,0,0,0,0
3543710,0,0,0,0,0,"Single-player,Steam Achievements,Steam Cloud,F...","Casual,Indie",,0,0,...,0,0,0,0,0,0,0,0,0,0
3265370,0,24,12,0,0,"Single-player,Family Sharing",Simulation,"Simulation,Walking Simulator,Idler,First-Perso...",0,0,...,0,0,0,0,0,0,0,1,0,0
3423620,0,0,0,0,0,"Single-player,Tracked Controller Support,VR On...","Action,Adventure,Free To Play",,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Attempt to use TF-IDF to more methodically determine weights to be used in Cosine Similarity - currently not implemented/finished


#Custom tokenizer used for TF-IDF analysis
def data_delimiter(raw_data):
    #If data is a string, returns an array of individual tokens in string
    if isinstance(raw_data, str):
        return [token.strip() for token in raw_data.split(',')]
    #Returns empty array if data is not a string
    return []

#Creates a vectorizer for the category column that tokenizes the data (also filters out undesired categories)
category_vectorizer = TfidfVectorizer(tokenizer=data_delimiter, vocabulary=categories_keep, lowercase=True)
#Applies a fit and transform to the vectorizer object and creates a matrix containing the TF-IDF values
category_matrix = category_vectorizer.fit_transform(df['Categories'])
#Converts the matrix to a pandas dataframe
tfidf_category = pd.DataFrame(category_matrix.toarray(), columns=category_vectorizer.get_feature_names_out())

#Repeat for genres/tags

genre_vectorizer = TfidfVectorizer(tokenizer=data_delimiter, vocabulary=genres_keep, lowercase=True)
genre_matrix = genre_vectorizer.fit_transform(df['Genres'])
tfidf_genre = pd.DataFrame(genre_matrix.toarray(), columns=genre_vectorizer.get_feature_names_out())

tag_vectorizer = TfidfVectorizer(tokenizer=data_delimiter, vocabulary=tags_keep, lowercase=True)
tag_matrix = tag_vectorizer.fit_transform(df['Tags'])
tfidf_tag = pd.DataFrame(tag_matrix.toarray(), columns=tag_vectorizer.get_feature_names_out())

#Concatonates results for all three dataframes
tfidf_df = pd.concat([tfidf_category, tfidf_genre, tfidf_tag], axis=1)

tfidf_df



Unnamed: 0,co-op,cross-platform multiplayer,full controller support,mmo,mods,multi-player,pvp,single-player,vr only,vr support,...,investigation,party-based rpg,rhythm,tactical rpg,match 3,souls-like,twin stick shooter,3d fighter,automobile sim,word game
0,0.0,0.0,0.000000,0.0,0.0,0.926523,0.0,0.376239,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.921856,0.0,0.0,0.000000,0.0,0.387532,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,1.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.921856,0.0,0.0,0.000000,0.0,0.387532,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,1.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111447,0.0,0.0,0.921856,0.0,0.0,0.000000,0.0,0.387532,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
111448,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,1.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
111449,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,1.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
111450,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.199348,0.979929,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
from sklearn.preprocessing import StandardScaler

#Drops additional unecessary columns and creates new copy of dataframe
df.drop(columns=["Categories", "Genres", "Tags"], inplace=True)
df_clean = df

#Uses a standard scaler object to standardize the highly variable numeric columns
SS_obj = StandardScaler()
input_data = df_clean[["Metacritic score", "Positive", "Negative", "Average playtime forever", "Average playtime two weeks"]]
scaled_data = SS_obj.fit_transform(input_data)

#Applies the scaled data to the cleaned dataframe
df_clean[["Metacritic score", "Positive", "Negative", "Average playtime forever", "Average playtime two weeks"]] = scaled_data
df_clean


Unnamed: 0_level_0,Metacritic score,Positive,Negative,Average playtime forever,Average playtime two weeks,Co-op,Cross-Platform Multiplayer,Full controller support,MMO,Mods,...,Time Management,Top-Down Shooter,Tower Defense,Turn-Based Strategy,Turn-Based Tactics,Twin Stick Shooter,Visual Novel,Walking Simulator,Wargame,Word Game
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20200,-0.190981,-0.034980,-0.028695,-0.081253,-0.054548,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
655370,-0.190981,-0.032783,-0.030193,-0.081253,-0.054548,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1732930,-0.190981,-0.035260,-0.031443,-0.081253,-0.054548,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1355720,-0.190981,-0.035120,-0.031443,-0.081253,-0.054548,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1139950,-0.190981,-0.032923,-0.029444,-0.081253,-0.054548,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3600970,-0.190981,-0.035026,-0.031443,-0.081253,-0.054548,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3543710,-0.190981,-0.035260,-0.031443,-0.081253,-0.054548,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3265370,-0.190981,-0.034138,-0.028445,-0.081253,-0.054548,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3423620,-0.190981,-0.035260,-0.031443,-0.081253,-0.054548,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
#Creates an array to store weights for each attribute in cleaned dataframe
weights = [0.7] * 5
weights.extend([1.5] * 11)
weights.extend([2] * 21)
weights.extend([2.5] * 70)

col_count = 0

#Applies weight to each column
for col in df_clean.columns:
    df_clean[col] = df_clean[col].mul(weights[col_count])
    col_count += 1


#Calculates the euclidean (L2) norm of each row and divides the data in each row by the euclidean norm
#Saves time when performing Cosine Similarity in main algorithm implementations
eucl_norm = np.linalg.norm(df_clean, axis=1)
normalized_df = df_clean.div(eucl_norm, axis=0)

normalized_df

Unnamed: 0_level_0,Metacritic score,Positive,Negative,Average playtime forever,Average playtime two weeks,Co-op,Cross-Platform Multiplayer,Full controller support,MMO,Mods,...,Time Management,Top-Down Shooter,Tower Defense,Turn-Based Strategy,Turn-Based Tactics,Twin Stick Shooter,Visual Novel,Walking Simulator,Wargame,Word Game
AppID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20200,-0.032888,-0.006024,-0.004941,-0.013992,-0.009393,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
655370,-0.030854,-0.005296,-0.004878,-0.013127,-0.008813,0.0,0.0,0.346193,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1732930,-0.031273,-0.005774,-0.005149,-0.013305,-0.008932,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1355720,-0.020746,-0.003815,-0.003416,-0.008827,-0.005926,0.0,0.0,0.232779,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1139950,-0.022509,-0.003880,-0.003470,-0.009577,-0.006429,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3600970,-0.019342,-0.003547,-0.003184,-0.008229,-0.005524,0.0,0.0,0.217019,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3543710,-0.041709,-0.007700,-0.006867,-0.017745,-0.011913,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3265370,-0.026725,-0.004777,-0.003980,-0.011370,-0.007633,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.499765,0.0,0.0
3423620,-0.032888,-0.006072,-0.005415,-0.013992,-0.009393,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [38]:
import json

#Makes the AppID a main column again and converts the dataframe to 2D numpy array
normalized_df = normalized_df.reset_index()
numpy_matrix = normalized_df.to_numpy()
#Writes the 2d numpy array to a npy file
np.save('preprocessed_data_matrix.npy', numpy_matrix)

#Converts dataframe containing metadata to a dictionary, with AppID as the key
game_metadata_dict = df_metadata.to_dict('index')

#Writes a list of column headers for the numpy matrix and metadata dictionary to a JSON file
with open('game_metadata.json', 'w') as file:
    json.dump(normalized_df.columns.to_list(), file)
    file.write('\n')
    json.dump(game_metadata_dict, file, indent=5)

In [40]:
#For testing/easy visualization of processed data
normalized_df = pd.concat([df_metadata[["Name"]], normalized_df], axis=1)
normalized_df.to_csv("preprocessed_data.csv", index=False)

numpy_matrix_test = np.load('preprocessed_data_matrix.npy')
print(numpy_matrix_test)

[[ 2.02000000e+04 -3.28879539e-02 -6.02365815e-03 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 6.55370000e+05 -3.08543005e-02 -5.29626008e-03 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 1.73293000e+06 -3.12734661e-02 -5.77387795e-03 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 ...
 [ 3.26537000e+06 -2.67248000e-02 -4.77709889e-03 ...  4.99764839e-01
   0.00000000e+00  0.00000000e+00]
 [ 3.42362000e+06 -3.28878637e-02 -6.07193685e-03 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 3.18379000e+06 -3.12734661e-02 -5.77387795e-03 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]]
