In [1]:
# Run this cell to ensure that altair plots show up without having
# the notebook be really large.
# We will talk more about what these lines do later in the course

import os
import altair as alt
import pandas as pd
from toolz.curried import pipe
import ast
import numpy as np

# Create a new data transformer that stores the files in a directory
def json_dir(data, data_dir='altairdata'):
    """Altair data transformer that writes chart data to JSON files in ``data_dir``.

    The directory is created on first use. The output filename is built from
    the ``{prefix}-{hash}.{extension}`` template handed to ``alt.to_json``.
    """
    os.makedirs(data_dir, exist_ok=True)
    filename_template = data_dir + '/{prefix}-{hash}.{extension}'
    write_json = alt.to_json(filename=filename_template)
    return pipe(data, write_json)

# Register the directory-backed transformer under the name 'json_dir' and
# make it the active data transformer for every chart built below.
alt.data_transformers.register('json_dir', json_dir)
alt.data_transformers.enable('json_dir')

# Lift Altair's row limit so large data sets can be charted
# (by default only 5000 rows are allowed).
# See here: https://altair-viz.github.io/user_guide/data_transformers.html
alt.data_transformers.disable_max_rows()

# Select the 'jupyterlab' renderer for displaying charts.
alt.renderers.enable('jupyterlab')

RendererRegistry.enable('jupyterlab')

In [None]:
# Load the cleaned games table; the relative path is resolved against the
# notebook's working directory.
path = '../../data/processed/cleaned_games.csv'
data = pd.read_csv(path)
# Print (rows, columns), then show the first rows as the cell's output.
print(data.shape)
data.head()

(58041, 21)


Unnamed: 0.1,Unnamed: 0,steam_appid,name,developers,publishers,categories,genres,required_age,n_achievements,platforms,...,additional_content,total_reviews,total_positive,total_negative,review_score,review_score_desc,positive_percentual,metacritic,is_free,price_initial (USD)
0,0,2719580,勇者の伝説の勇者,['ぽけそう'],['ぽけそう'],"['Single-player', 'Family Sharing']","['Casual', 'Indie']",0,0,['windows'],...,[],0,0,0,0.0,No user reviews,0.0,0,False,0.99
1,2,2719600,Lorhaven: Cursed War,['GoldenGod Games'],['GoldenGod Games'],"['Single-player', 'Multi-player', 'PvP', 'Shar...","['RPG', 'Strategy']",0,32,"['windows', 'mac']",...,[],9,8,1,0.0,9 user reviews,88.9,0,False,9.99
2,3,2719610,PUIQ: Demons,['Giammnn'],['Giammnn'],"['Single-player', 'Steam Achievements', 'Famil...","['Action', 'Casual', 'Indie', 'RPG']",0,28,['windows'],...,[],0,0,0,0.0,No user reviews,0.0,0,False,2.99
3,4,2719650,Project XSTING,['Saucy Melon'],['Saucy Melon'],"['Single-player', 'Steam Achievements', 'Steam...","['Action', 'Casual', 'Indie', 'Early Access']",0,42,['windows'],...,[],9,9,0,0.0,9 user reviews,100.0,0,False,7.99
4,7,2719710,Manor Madness,['Apericot Studio'],['Apericot Studio'],"['Single-player', 'Steam Achievements', 'HDR a...","['Action', 'Adventure', 'Indie', 'RPG', 'Simul...",0,5,"['windows', 'mac', 'linux']",...,[],0,0,0,0.0,No user reviews,0.0,0,True,0.0


In [3]:
# Mean review score per developer. The column keeps its original name 'Count'
# even though it holds a mean.
countByDeveloper = data.groupby('developers')['review_score'].mean().reset_index(name='Count')

# NOTE(review): despite the name, no .head(10) is applied here, so every
# developer is kept and the isin() filter below should only drop rows whose
# developer is missing. Confirm whether a real top-10 cut was intended.
top10Dev = countByDeveloper.sort_values('Count', ascending=False)

# Build a new frame with assign() instead of writing a column into a filtered
# slice of `data` (which raises SettingWithCopyWarning). The stringified
# category lists are parsed into real lists, then exploded to one row per
# (game, category) pair.
dataWithTop10Dev = (
    data[data['developers'].isin(top10Dev['developers'])]
    .assign(categories=lambda frame: frame['categories'].apply(ast.literal_eval))
    .explode('categories')
)

# Category labels excluded from the charts (the variable name says "genres",
# but these are matched against the 'categories' column).
removeGenres = ['Captions available','In-App Purchases','Remote Play Together' ,
                'Partial Controller Support','Cross-Platform Multiplayer',
                'Online PvP','Full controller support', 'HDR avaliable',
                'Includes level editor','HDR available', 'Remote Play on Tablet',
                'Shared/Split Screen Co-op', 'Shared/Split Screen','Shared/Split Screen PvP',
                'Stats','Steam Trading Cards','Steam Workshop','Family Sharing','VR Only', 'VR Support',
                'VR Supported', 'Tracked Controller Support','Steam Leaderboards', 'Steam Cloud', 'Mods', 'Steam Achievements',
                'Commentry Avaliable','Shared/Split Screen PvP','Steam Turn Notifications', 'Commentary available', 'Steam Timeline',
                'Includes Source SDK', 'SteamVR Collectibles','Valve Anti-Cheat enabled', 'Mods (require HL2)',
                'LAN Co-op', 'LAN PvP']

# Drop the excluded categories, then drop any row that has a missing value in
# any column.
filteredData = dataWithTop10Dev[~dataWithTop10Dev['categories'].isin(removeGenres)].dropna()

# Click selection shared by the charts: highlights one category.
selector = alt.selection_point(fields=['categories'])

# Dropdown selection: restricts the line chart to a single category.
dropdown = alt.selection_point(
    name='categories',
    fields=['categories'],
    bind=alt.binding_select(options=filteredData['categories'].unique().tolist(), name='Select categories')
)

# Line chart: number of rows at each review score, one line per category.
# NOTE(review): count(...) tallies rows rather than summing total_reviews;
# confirm that matches the 'Total votes' axis title.
plotOfReviews = alt.Chart(filteredData).mark_line().encode(
    x=alt.X('review_score:Q',title='Review Scores'),
    y=alt.Y('count(total_reviews)',title='Total votes'),
    color=alt.condition(selector, 'categories:N', alt.value('lightgray')),
    tooltip='mean(review_score):Q'
).transform_filter(
    dropdown
).add_params(
    selector,
    dropdown
).properties(
    width=700,
    height=500,
    title='Total amount of scores for each category'
)

# Bar chart: rows per category. The count is a number, so it is encoded as
# quantitative (:Q); typed as nominal, the counts would be treated as discrete
# labels instead of bar lengths.
categoryDistrabution = alt.Chart(filteredData).mark_bar().encode(
    x='count(categories):Q',
    y='categories:N',
    color='categories:N',
    tooltip='count(categories):Q'
).add_params(
    selector
).properties(
    title='Total count of each category'
)

# Pie chart: share of rows per category. theta is quantitative so the slice
# angles are proportional to the counts.
pieCount = alt.Chart(filteredData).mark_arc().encode(
    theta='count(categories):Q',
    color=alt.condition(selector, 'categories:N', alt.value('lightgray')),
    tooltip='categories:N'
).add_params(
    selector
).properties(
    title='Relative Size comparisons of Categories'
)

# Data-less chart that only carries the dropdown parameter.
dropChart = alt.Chart().mark_text().encode().add_params(dropdown)

# Layout: dropdown, bar and pie stacked on the left; line chart on the right.
view =  (dropChart & categoryDistrabution & pieCount) | (plotOfReviews)

view


<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


The research goal of this visualization is to answer: does a game score higher when a certain category is present? This visualization shows that there does seem to be a difference in score when certain categories are present. Notice that the areas under the Single-player, Multi-player, and Co-op lines seem to be larger at the right-hand end of the graph, where the higher-rated games are, than in the center of the graph, where the mediocre games are.

I have chosen a line graph to show the trend of the review scores as we progress from lower-end games to higher-end ones. This is plotted against the total count of reviews that exist for each score point. The color channel encodes the category. The bar chart and pie chart supplement this view by showing the total count of each category, since there could be an over-representation bias. There do seem to be far more single-player games than any other category; however, the rest of the categories share roughly equal space. There are two interactions here: first, we can click to select any category so that only that category is shown in color. Second, the dropdown gives us a blown-up view of a particular category on the line graph.

This view could be made better by having a reset button to return a chosen dropdown state to the original view. The line graph also has a large, exaggerated spike at score = 0 due to the sheer number of games that have been rated at this score. One solution would be to ignore this point, since the difference between a score-1 game and a score-0 game is negligible, and perhaps to add a separate graph comparing that point for each category, which would clean up the main view of the line chart.

In [4]:
def _with_category_count(frame):
    """Return a copy of ``frame`` with parsed categories and a ``catSize`` column.

    ``categories`` is stored as the string form of a Python list, so it is
    parsed with ``ast.literal_eval`` (as the other cells in this notebook do)
    rather than split on commas: splitting turns the empty list ``"[]"`` into
    one element and leaves bracket/quote fragments in the values.
    ``assign`` builds a new frame, so the shared ``data`` frame is never
    modified and the cell can be re-run safely.
    """
    parsed = frame['categories'].apply(ast.literal_eval)
    return frame.assign(categories=parsed, catSize=parsed.apply(len))


# The 20000 lowest- and highest-scored rows, plus the full table, each with
# the number of categories per game in `catSize`.
bottomRated = _with_category_count(
    data.sort_values(by='review_score', ascending=True).head(20000)
)
topRated = _with_category_count(
    data.sort_values(by='review_score', ascending=False).head(20000)
)
total = _with_category_count(data)

# The y axis sums the `n_achievements` column so that it shows what its title
# says. The previous `count(n_achievement)` named a column that is not in the
# table, and a count aggregate tallies rows regardless of the field given, so
# it was plotting the number of games instead.
bottomChart = alt.Chart(bottomRated).mark_area().encode(
    x=alt.X('catSize:Q',title='Number of Categories'),
    y=alt.Y('sum(n_achievements):Q',title='Total number of Achievements'),
    color='review_score'
).properties(
    width=300,
    height=300,
    title='Amount of Categories in Lowest 20000 Rated Games'
)

topChart = alt.Chart(topRated).mark_area().encode(
    x=alt.X('catSize:Q',title='Number of Categories'),
    y=alt.Y('sum(n_achievements):Q',title='Total number of Achievements'),
    color='review_score'
).properties(
    width=300,
    height=300,
    title='Amount of Categories in Highest 20000 Rated Games'
)

# Slider bound to review_score (0-9) that filters the bottom chart to a
# single score.
slider = alt.selection_point(
    name='score_slider',
    fields=['review_score'],
    bind=alt.binding_range(min=0, max=9, step=1, name="Select Score"),
    value=5
)

categoriesForIndividualScore = alt.Chart(total).mark_bar().encode(
    x=alt.X('catSize:Q',title='Number of Categories'),
    y=alt.Y('sum(n_achievements):Q',title='Total number of Achievements'),
    color='review_score'
).add_params(
    slider
).transform_filter(
    slider
).properties(
    width=400,
    height=300,
    title='Distrabution of Category amount per Rating Score'
)

# Layout: the two area charts side by side, the slider-driven bar chart below.
view = (bottomChart | topChart) & (categoriesForIndividualScore)
view


<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


The research goal of this visualization is to answer: in general, do games with more achievements have more categories associated with them? And are games with more achievements good games? This visualization shows that there does seem to be a difference: higher-rated games tend to have more categories than lower-rated games. This can be seen in how the area for the top-rated games is shifted noticeably towards the higher end of the x axis compared to the chart for the lower-rated games. It is also shown by the bottom graph, where one can see the data move from one side to the other as we go from lower-rated games to higher-rated ones.

I have chosen area graphs to represent the number of categories on the x axis versus the total number of achievements on the y axis. I have done this because there is so much data that other marks could become too cluttered and confusing. The color channel is then used to show how much of each area belongs to a given rating. This allows us to see how highly rated games tend to behave with respect to these two variables. To take a closer look at each individual score, another chart was added that shows the same data for one score at a time, controlled by a slider. Essentially, the top graphs give us a holistic view, while the bottom one gives a more precise look to help us answer the question.

This view could be made better by perhaps adding lines that show where the Metacritic score falls in this data, to see whether any comparisons can be made from that. Also, if the charts were faceted instead of being stacked area charts, the visualization might look a bit cleaner.

In [5]:
# One row per (game, category) pair. assign() builds a new frame, so the
# shared `data` frame is never written to; wrapping it in pd.DataFrame(...)
# and then assigning a column can, depending on the pandas version, modify
# `data` as well.
catCountData = data.assign(
    categories=data['categories'].apply(ast.literal_eval)
).explode('categories')

# Per-developer summary: number of distinct categories and mean review score.
# NOTE(review): the grouping key is `developers`, while the accompanying text
# talks about categories per game; confirm which unit is intended.
catCountSummary = catCountData.groupby('developers').agg(
    count=('categories', 'nunique'),
    score=('review_score', 'mean')
).reset_index()

# Label the 20000 lowest- and highest-scoring summary rows. assign() returns
# new frames, avoiding SettingWithCopyWarning from writing into head() slices.
bottomRated = (
    catCountSummary.sort_values(by='score', ascending=True)
    .head(20000)
    .assign(Group='Bottom Rated')
)
topRated = (
    catCountSummary.sort_values(by='score', ascending=False)
    .head(20000)
    .assign(Group='Top Rated')
)
combined = pd.concat([topRated, bottomRated])

# Density of the category count, estimated separately per group and shown in
# one facet column per group.
density = alt.Chart(combined).transform_density(
    'count',
    as_=['count', 'density'],
    groupby=['Group']
).mark_area(
    opacity=0.5
).encode(
    x='count:Q',
    y='density:Q',
    color='Group:N',
    tooltip=['Group:N', 'count:Q', 'density:Q']
).facet(
    column = alt.Column("Group:N", title=None)
).properties(
    title='Density of Categories in Lowest and Highest preforming games'
)

# Pie chart over the summary rows: slice angle from `score`, colour from the
# category count.
pie = alt.Chart(catCountSummary).mark_arc().encode(
    theta='score',
    color=alt.Color('count:Q', scale=alt.Scale(scheme='viridis')),
    tooltip='count'
).properties(
    title='Difference in amount of Categories for each score'
)

# Radio buttons selecting a score value; bars matching the selection keep
# their score colour and the rest turn light gray.
scores = [0,1,2,3,4,5,6,7,8,9]
scoreRadio = alt.binding_radio(options=scores, name="Scores")
selectScore = alt.selection_point(fields=['score'], bind=scoreRadio)

scoreColor = (
    alt.when(selectScore)
    .then(alt.Color("score:Q"))
    .otherwise(alt.value("lightgray"))
)

points = alt.Chart(catCountSummary).mark_bar(filled=True).encode(
    y='score',
    x='count',
    color=scoreColor
).add_params(
    selectScore
).properties(title="Scores vs number of categories")

# Layout: faceted density on top, bar chart and pie chart side by side below.
density & (points | pie)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


The research goal of this visualization is to answer: does a game that has more categories associated with it do better than one that has few categories? This visualization shows that there does seem to be a difference, as the higher-rated games tend to have more categories than the lower-rated ones. This can be seen in how the density for the top-rated games has a much longer tail towards the higher category counts than the density for the lower-rated ones.

I have chosen density graphs because I am using most of the large dataset, and this is an excellent way to showcase large data in an interpretable way. The density is computed from the count of categories that each game has, with that count on the x axis. Then, to show how the scores compare to the counts, a bar chart with radio buttons shows the trend of the scores in the data. This graph shows that good games tend to have a large range in how many categories they have. The pie chart shows how many games have a particular number of categories, which allows us to see whether there is anything significant in the data to take into consideration. The bar chart and the pie chart use this color theme because it makes it easier to distinguish between sections while also showing ordinal progression.

This view could be made better by using a different view in place of the pie chart, since some of the slices are very thin and not very visible. There could also be more interactivity, letting us control the size of the data used for the density to see how sample size affects this finding.