In [1]:
import pandas as pd
import plotly.express as px

In [2]:
# Read the VGChartz 2024 CSV export into the raw frame.
raw_csv_path = "Data/vgchartz-2024.csv"
data = pd.read_csv(raw_csv_path)

In [3]:
# Lift pandas' column cap so wide frames render every column.
pd.options.display.max_columns = None

In [4]:
# (rows, columns) of the raw frame as loaded
data.shape

(64016, 14)

In [5]:
# Peek at the first five raw rows.
data.head(n=5)

Unnamed: 0,img,title,console,genre,publisher,developer,critic_score,total_sales,na_sales,jp_sales,pal_sales,other_sales,release_date,last_update
0,/games/boxart/full_6510540AmericaFrontccc.jpg,Grand Theft Auto V,PS3,Action,Rockstar Games,Rockstar North,9.4,20.32,6.37,0.99,9.85,3.12,2013-09-17,
1,/games/boxart/full_5563178AmericaFrontccc.jpg,Grand Theft Auto V,PS4,Action,Rockstar Games,Rockstar North,9.7,19.39,6.06,0.6,9.71,3.02,2014-11-18,2018-01-03
2,/games/boxart/827563ccc.jpg,Grand Theft Auto: Vice City,PS2,Action,Rockstar Games,Rockstar North,9.6,16.15,8.41,0.47,5.49,1.78,2002-10-28,
3,/games/boxart/full_9218923AmericaFrontccc.jpg,Grand Theft Auto V,X360,Action,Rockstar Games,Rockstar North,,15.86,9.06,0.06,5.33,1.42,2013-09-17,
4,/games/boxart/full_4990510AmericaFrontccc.jpg,Call of Duty: Black Ops 3,PS4,Shooter,Activision,Treyarch,8.1,15.09,6.18,0.41,6.05,2.44,2015-11-06,2018-01-14


## General Cleaning

Each question needs its own cleaning, so we start with general cleaning steps shared by all questions and then clean further for each question as needed.

In [6]:
# Start the shared cleaned frame from the raw data minus `last_update`.
# `drop` returns a new frame, so `data` itself is left untouched.
data_general_clean = data.drop(columns=["last_update"])

In [7]:
# Missing values per column before any rows are dropped
data_general_clean.isna().sum()

img                 0
title               0
console             0
genre               0
publisher           0
developer          17
critic_score    57338
total_sales     45094
na_sales        51379
jp_sales        57290
pal_sales       51192
other_sales     48888
release_date     7051
dtype: int64

In [8]:
# Count fully duplicated rows; the first occurrence of each row is not counted.
data_general_clean.duplicated(keep="first").sum()

np.int64(0)

In [None]:
# Keep only the games that have a total sales figure.
data_general_clean = data_general_clean.loc[
    data_general_clean["total_sales"].notna()
]

In [10]:
# (rows, columns) left after dropping rows without total_sales
data_general_clean.shape

(18922, 13)

In [11]:
# Random five-row preview; the fixed seed makes the displayed rows identical
# on every Restart & Run All instead of changing with each execution.
data_general_clean.sample(5, random_state=0)


Unnamed: 0,img,title,console,genre,publisher,developer,critic_score,total_sales,na_sales,jp_sales,pal_sales,other_sales,release_date
8260,/games/boxart/full_9374370AmericaFrontccc.jpg,Steins;Gate 0,PSV,Adventure,PQube,5pb. Inc.,,0.15,,0.07,0.07,0.02,2016-11-29
16643,/games/boxart/full_5613576JapanFrontccc.jpg,La Corda d'Oro 2ff,PSV,Misc,Koei Tecmo,Koei Tecmo Games,,0.01,,0.01,,,2017-12-21
12196,/games/boxart/full_9711951AmericaFrontccc.jpg,Mary Skelter: Nightmares,PSV,Role-Playing,Idea Factory,Compile Heart,,0.06,0.01,0.04,0.0,0.01,2017-09-19
347,/games/boxart/full_7567790AmericaFrontccc.jpeg,NBA 2K14,X360,Sports,2K Sports,Visual Concepts,,2.57,2.11,0.0,0.19,0.27,2013-10-11
8920,/games/boxart/full_1623039JapanFrontccc.jpg,Persona Dancing: Endless Night Collection,PS4,Music,Atlus,Atlus,,0.13,0.07,0.04,,0.02,2018-12-04


In [12]:
# Missing values per column after dropping rows without total_sales
data_general_clean.isna().sum()

img                 0
title               0
console             0
genre               0
publisher           0
developer           4
critic_score    14796
total_sales         0
na_sales         6285
jp_sales        12196
pal_sales        6098
other_sales      3794
release_date       90
dtype: int64

In [13]:
# Treat a missing regional figure as zero sales in that region.
data_general_clean = data_general_clean.fillna(
    value=dict.fromkeys(["na_sales", "jp_sales", "pal_sales", "other_sales"], 0)
)

In [14]:
# Missing values per column after zero-filling the regional sales columns
data_general_clean.isna().sum()

img                 0
title               0
console             0
genre               0
publisher           0
developer           4
critic_score    14796
total_sales         0
na_sales            0
jp_sales            0
pal_sales           0
other_sales         0
release_date       90
dtype: int64

In [15]:
# Parse release dates and derive calendar month/year on a new frame
# (`assign` returns a copy, so `data_general_clean` is not modified).
# The nullable Int64 dtype keeps rows without a release date as <NA>
# instead of forcing the month/year columns to float.
data_general_clean_date = data_general_clean.assign(
    release_date=lambda df: pd.to_datetime(df["release_date"]),
    month=lambda df: df["release_date"].dt.month.astype("Int64"),
    year=lambda df: df["release_date"].dt.year.astype("Int64"),
)

# Fixed seed so the preview shows the same rows on every run.
data_general_clean_date.sample(5, random_state=0)

Unnamed: 0,img,title,console,genre,publisher,developer,critic_score,total_sales,na_sales,jp_sales,pal_sales,other_sales,release_date,month,year
2184,/games/boxart/full_9072140AmericaFrontccc.jpg,Need for Speed: Payback,XOne,Racing,Electronic Arts,Ghost Games,5.9,0.73,0.42,0.0,0.24,0.07,2017-11-10,11,2017
13266,/games/boxart/full_6748169AmericaFrontccc.jpg,Silhouette Mirage,PS,Platform,Working Designs,"Treasure Co., Ltd.",,0.05,0.03,0.0,0.02,0.0,1999-12-31,12,1999
16410,/games/boxart/full_3849742AmericaFrontccc.jpg,Animal Snap: Rescue Them 2 By 2,GBA,Puzzle,Ignition Entertainment,"Awesome Developments, Ltd.",,0.01,0.0,0.0,0.0,0.0,2002-11-01,11,2002
1606,/games/boxart/full_5307689AmericaFrontccc.jpg,NCAA Football 10,X360,Sports,EA Sports,EA Tiburon,8.1,0.94,0.87,0.0,0.0,0.07,2009-07-14,7,2009
11381,/games/boxart/full_9288721AmericaFrontccc.jpg,Godai: Elemental Force,PS2,Action,3DO,3DO,,0.07,0.04,0.0,0.03,0.01,2002-01-21,1,2002


In [19]:
# Question 1 is about release timing, so keep only rows with a known release date.
data_general_clean_date_q1 = data_general_clean_date.loc[
    data_general_clean_date["release_date"].notna()
]
data_general_clean_date_q1.isna().sum()

img                 0
title               0
console             0
genre               0
publisher           0
developer           3
critic_score    14709
total_sales         0
na_sales            0
jp_sales            0
pal_sales           0
other_sales         0
release_date        0
month               0
year                0
dtype: int64

## When should a company release a game to maximize sales at launch?

In [24]:
# Animated histogram: total sales summed per release month, one frame per release year.
MONTH_ABBR = [
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]

# Plotting copy of the Q1 frame.
# * `month` holds the integers 1-12, so ordering the axis by month *names* only
#   works once the values themselves are names; convert them here.
# * Plotly Express builds animation frames in order of first appearance in the
#   data, so sort by year up front. Re-sorting `fig.frames` afterwards leaves the
#   initially drawn trace on a different year than the slider's first step.
monthly_sales_plot_data = (
    data_general_clean_date_q1
    .assign(month=lambda df: df["month"].astype("int64").map(lambda m: MONTH_ABBR[m - 1]))
    .sort_values("year", kind="stable")
)

# Tallest bar over all frames, used to pin the y axis so years stay comparable.
peak_monthly_sales = (
    monthly_sales_plot_data.groupby(["year", "month"])["total_sales"].sum().max()
)

# Base arguments for histogram
kwargs = {
    "x": "month",
    "y": "total_sales",
    "animation_frame": "year",
    "histfunc": "sum",  # aggregate sales if multiple entries per month
    # Calendar order for the categorical month axis
    "category_orders": {"month": MONTH_ABBR},
    # Category axes are ranged by position, so this keeps all 12 months visible
    # even in years where some months have no releases.
    "range_x": [-0.5, len(MONTH_ABBR) - 0.5],
    "labels": {
        "total_sales": "Sales",
        "month": "Release Month",
        "year": "Release Year"
    },
    "title": "Monthly Release Histogram Animated by Year and Total Accumulated Sales"
}

# Axis ranges are not reliably recomputed between animation frames, so fix the
# y range to the overall peak (with a little headroom) when there is data.
if pd.notna(peak_monthly_sales):
    kwargs["range_y"] = [0, peak_monthly_sales * 1.05]

# Add color only if category exists
if "category" in monthly_sales_plot_data.columns:
    kwargs["color"] = "category"

fig = px.histogram(monthly_sales_plot_data, **kwargs)

# Slow the play button (first button of the first updatemenu) to 1000 ms per frame.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 1000

fig.show()


The months and years show when each game was released, not when its sales took place. Sales are measured cumulatively, but we assume that sales are highest around the release date.
In later years we can see a peak around October and November (months 10 and 11), which we speculate reflects sales leading up to Christmas (the holidays). We can also see a smaller peak around March (month 3). We speculate that a lot of our data comes from games sold in the US, so this could possibly be explained by US holidays, for example spring break (TODO: research this!). (TODO: research whether there is a summer trend.)

In [None]:
# NOTE(review): this cell repeats the animated histogram from the preceding code
# cell and has no execution count. Delete it, or turn it into a distinct view.
# Animated histogram: total sales summed per release month, one frame per release year.
MONTH_ABBR = [
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]

# Plotting copy of the Q1 frame.
# * `month` holds the integers 1-12, so ordering the axis by month *names* only
#   works once the values themselves are names; convert them here.
# * Plotly Express builds animation frames in order of first appearance in the
#   data, so sort by year up front. Re-sorting `fig.frames` afterwards leaves the
#   initially drawn trace on a different year than the slider's first step.
monthly_sales_plot_data = (
    data_general_clean_date_q1
    .assign(month=lambda df: df["month"].astype("int64").map(lambda m: MONTH_ABBR[m - 1]))
    .sort_values("year", kind="stable")
)

# Tallest bar over all frames, used to pin the y axis so years stay comparable.
peak_monthly_sales = (
    monthly_sales_plot_data.groupby(["year", "month"])["total_sales"].sum().max()
)

# Base arguments for histogram
kwargs = {
    "x": "month",
    "y": "total_sales",
    "animation_frame": "year",
    "histfunc": "sum",  # aggregate sales if multiple entries per month
    # Calendar order for the categorical month axis
    "category_orders": {"month": MONTH_ABBR},
    # Category axes are ranged by position, so this keeps all 12 months visible
    # even in years where some months have no releases.
    "range_x": [-0.5, len(MONTH_ABBR) - 0.5],
    "labels": {
        "total_sales": "Sales",
        "month": "Release Month",
        "year": "Release Year"
    },
    "title": "Monthly Release Histogram Animated by Year and Total Accumulated Sales"
}

# Axis ranges are not reliably recomputed between animation frames, so fix the
# y range to the overall peak (with a little headroom) when there is data.
if pd.notna(peak_monthly_sales):
    kwargs["range_y"] = [0, peak_monthly_sales * 1.05]

# Add color only if category exists
if "category" in monthly_sales_plot_data.columns:
    kwargs["color"] = "category"

fig = px.histogram(monthly_sales_plot_data, **kwargs)

# Slow the play button (first button of the first updatemenu) to 1000 ms per frame.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 1000

fig.show()
