In [None]:
#standard imports
import ast
import json

import numpy as np
import pandas as pd

# customisations - let wide tables display up to 100 columns instead of being truncated
pd.set_option("display.max_columns", 100)

#imports for graphing
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the Steam app data CSV from the download folder
steam_app_data = pd.read_csv('../../data/download/steam_app_data.csv')

In [None]:
# Summarise the columns, dtypes and non-null counts of the freshly loaded data
steam_app_data.info()

In [None]:
# Preview the first five rows
steam_app_data.head(5)

In [None]:
# Remove rows that are exact duplicates of an earlier row
steam_app_data = steam_app_data.drop_duplicates()

Dropping columns not needed for analysis

In [None]:
# Columns that play no part in the analysis
columns_not_needed = [
    'controller_support',
    'dlc',
    'short_description',
    'fullgame',
    'supported_languages',
    'website',
    'pc_requirements',
    'mac_requirements',
    'linux_requirements',
    'legal_notice',
    'drm_notice',
    'ext_user_account_notice',
    'demos',
    'package_groups',
    'platforms',
    'metacritic',
    'reviews',
    'screenshots',
    'movies',
    'achievements',
    'support_info',
    'background',
    'content_descriptors',
    'price_overview',
    'packages',
    'categories',
    'recommendations',
]
steam_app_data = steam_app_data.drop(columns=columns_not_needed)

In [None]:
# Keep only rows that have genre information.
# The index is rebuilt as 0..n-1 because later cells create separate frames
# with pd.json_normalize and combine them with this one. Gaps left by the
# removed rows would misalign any label-based lookup or assignment.
# reset_index also returns a fresh frame, which avoids SettingWithCopyWarning
# on the column assignments that follow.
has_genres = steam_app_data['genres'].notna()
steam_app_data = steam_app_data[has_genres].reset_index(drop=True)

In [None]:
# Re-check columns and non-null counts after dropping columns and genre-less rows
steam_app_data.info()

In [None]:
# Preview the last three rows
steam_app_data.tail(3)

Trying to fix JSON columns

In [None]:
# The genres column appears to hold Python-literal text (single-quoted
# strings) rather than valid JSON. Parse each value as a Python literal and
# re-serialise it as real JSON, so the json.loads step that follows works.
# A blanket quote replacement would instead corrupt any value that itself
# contains an apostrophe.
steam_app_data['genres'] = (
    steam_app_data['genres']
    .apply(ast.literal_eval)
    .apply(json.dumps)
)

In [None]:
# Decode every JSON string in the genres column into Python objects
x = steam_app_data['genres'].map(json.loads)

In [None]:
# Expand the decoded genres into a table. The cells below read it as one
# column per genre position (0, 1, 2, ...), each cell holding a genre dict
# or a missing value when the game has fewer genres.
y = pd.json_normalize(x)

In [None]:
# Spot check: description of the second genre for the row labelled 98041
# NOTE(review): 98041 is a hard-coded row label and raises KeyError if absent
y[1][98041]['description']

In [None]:
# Show how many genre-position columns exist and how many games fill each one
y.info()

Taking the first three genres for each game and putting them back into the original df

In [None]:
# First genre of every game.
# Iterate over the column's values directly rather than looking rows up with
# a 0..n-1 counter: label lookup like y[0][i] only works while y carries a
# default RangeIndex, whereas iterating values is always positional.
genre_1 = [genre['description'] for genre in y[0]]

# A plain list is assigned by position, so row order is what matters here
steam_app_data['genre_1'] = genre_1

In [None]:
# Second genre of every game, with a placeholder when there is none.
# Iterate over the column's values directly rather than looking rows up with
# a 0..n-1 counter: label lookup like y[1][i] only works while y carries a
# default RangeIndex, whereas iterating values is always positional.
genre_2 = [
    'No Second Genre' if pd.isna(genre) else genre['description']
    for genre in y[1]
]

# A plain list is assigned by position, so row order is what matters here
steam_app_data['genre_2'] = genre_2

In [None]:
# Third genre of every game, with a placeholder when there is none.
# Iterate over the column's values directly rather than looking rows up with
# a 0..n-1 counter: label lookup like y[2][i] only works while y carries a
# default RangeIndex, whereas iterating values is always positional.
genre_3 = [
    'No Third Genre' if pd.isna(genre) else genre['description']
    for genre in y[2]
]

# A plain list is assigned by position, so row order is what matters here
steam_app_data['genre_3'] = genre_3

In [None]:
# The raw genres text is no longer needed once genre_1..genre_3 exist
steam_app_data = steam_app_data.drop(columns=['genres'])

Now to pull out the release date

In [None]:
# release_date appears to be stored as Python-literal text (single quotes,
# True/False) rather than valid JSON. Parse each value as a Python literal
# and re-serialise it as real JSON, so the json.loads step that follows works.
# Textual replacement of quotes and of the words True/False would corrupt
# any date text that contains an apostrophe or those substrings.
# Booleans are now emitted as JSON true/false instead of the strings
# "True"/"False"; only the 'date' field is used further down.
steam_app_data['release_date'] = (
    steam_app_data['release_date']
    .apply(ast.literal_eval)
    .apply(json.dumps)
)


In [None]:
# Inspect the release_date strings after the format fix
steam_app_data['release_date']

In [None]:
# Decode every JSON string in the release_date column into Python objects
date_jasons = steam_app_data['release_date'].map(json.loads)

In [None]:
# Expand the decoded release-date objects into a table; the cells below use
# its 'date' column
date_norm = pd.json_normalize(date_jasons)

In [None]:
# date_norm carries its own index, and assigning a Series aligns on index
# labels. Because rows were filtered out of steam_app_data earlier, the labels
# need not match, which would leave NaN or the wrong date on some rows.
# Assign the underlying values by position instead.
steam_app_data['date_released'] = date_norm['date'].to_numpy()

In [None]:
# Replace the raw release_date text with the parsed date string.
# 'Coming Soon' entries are kept exactly as they are, so no per-row branching
# is needed. tolist() copies the values in row order without relying on the
# index labels of date_norm.
dates = date_norm['date'].tolist()

# A plain list is assigned by position
steam_app_data['release_date'] = dates

In [None]:
# Discard the helper column and the fields not carried forward
steam_app_data = steam_app_data.drop(
    columns=['required_age', 'date_released', 'about_the_game', 'is_free']
)

In [None]:
# Keep only entries whose type is 'game'.
# Bracket access is used because attribute access (`.type`) silently breaks
# if a column name collides with a DataFrame attribute.
# The explicit copy makes the result an independent frame: a later cell drops
# columns in place, and doing that on a filtered slice can raise
# SettingWithCopyWarning.
is_game = steam_app_data['type'] == 'game'
steam_app_data = steam_app_data[is_game].copy()

In [None]:
# Check the remaining columns and row count after keeping only games
steam_app_data.info()

In [None]:
# developers and publishers live in another table, so they are removed here
steam_app_data = steam_app_data.drop(columns=['developers', 'publishers'])

In [None]:
# Write the cleaned table without the index column.
# NOTE(review): this is the same path the notebook reads its input from, so
# the original download is overwritten. A second run from the top will then
# fail, because the columns dropped above no longer exist in the file.
# Consider writing to a separate output path.
steam_app_data.to_csv("../../data/download/steam_app_data.csv", index=False)