In [45]:
import pandas as pd
import plotly.express as px

In [26]:
data = pd.read_csv("Data/vgchartz-2024.csv")

In [27]:
# show all columns
pd.set_option("display.max_columns", None)

In [28]:
data.shape

(64016, 14)

In [29]:
data.head()

Unnamed: 0,img,title,console,genre,publisher,developer,critic_score,total_sales,na_sales,jp_sales,pal_sales,other_sales,release_date,last_update
0,/games/boxart/full_6510540AmericaFrontccc.jpg,Grand Theft Auto V,PS3,Action,Rockstar Games,Rockstar North,9.4,20.32,6.37,0.99,9.85,3.12,2013-09-17,
1,/games/boxart/full_5563178AmericaFrontccc.jpg,Grand Theft Auto V,PS4,Action,Rockstar Games,Rockstar North,9.7,19.39,6.06,0.6,9.71,3.02,2014-11-18,2018-01-03
2,/games/boxart/827563ccc.jpg,Grand Theft Auto: Vice City,PS2,Action,Rockstar Games,Rockstar North,9.6,16.15,8.41,0.47,5.49,1.78,2002-10-28,
3,/games/boxart/full_9218923AmericaFrontccc.jpg,Grand Theft Auto V,X360,Action,Rockstar Games,Rockstar North,,15.86,9.06,0.06,5.33,1.42,2013-09-17,
4,/games/boxart/full_4990510AmericaFrontccc.jpg,Call of Duty: Black Ops 3,PS4,Shooter,Activision,Treyarch,8.1,15.09,6.18,0.41,6.05,2.44,2015-11-06,2018-01-14


## General Cleaning

The different questions require different cleaning, so we start with a general cleaning, and clean for each question as needed

In [30]:
data_general_clean = data.copy()


data_general_clean = data_general_clean.drop(["last_update"], axis=1)

In [31]:
data_general_clean.isnull().sum()

img                 0
title               0
console             0
genre               0
publisher           0
developer          17
critic_score    57338
total_sales     45094
na_sales        51379
jp_sales        57290
pal_sales       51192
other_sales     48888
release_date     7051
dtype: int64

In [32]:
data_general_clean.duplicated().sum()

np.int64(0)

In [33]:
data_general_clean = data_general_clean.dropna(subset=["total_sales"])

In [34]:
data_general_clean.shape

(18922, 13)

In [39]:
data_general_clean.sample(5)


Unnamed: 0,img,title,console,genre,publisher,developer,critic_score,total_sales,na_sales,jp_sales,pal_sales,other_sales,release_date
7357,/games/boxart/full_cabelas-survival-shadows-of...,Cabela's Survival: Shadows of Katmai,PS3,Sports,Activision,Activision,,0.19,0.11,0.0,0.05,0.03,2011-11-01
203,/games/boxart/full_3513275AmericaFrontccc.jpg,Mario & Sonic at the Olympic Winter Games,DS,Sports,Sega,Sega,7.6,3.47,1.21,0.27,1.63,0.37,2009-10-13
9171,/games/boxart/full_8348409JapanFrontccc.jpg,Soccer Tsuku DS: World Challenge 2010,DS,Sports,Sega,Sega,,0.13,0.0,0.13,0.0,0.0,2010-05-27
559,/games/boxart/full_4909422AmericaFrontccc.jpg,Star Wars Battlefront II (2017),XOne,Shooter,Electronic Arts,EA DICE,,1.92,1.15,0.0,0.59,0.17,2017-11-17
1824,/games/boxart/full_8042351AmericaFrontccc.png,BioShock: The Collection,PS4,Shooter,2K Games,"Blind Squirrel Digital, Inc",,0.86,0.41,0.03,0.28,0.14,2016-09-13


In [36]:
data_general_clean.isnull().sum()

img                 0
title               0
console             0
genre               0
publisher           0
developer           4
critic_score    14796
total_sales         0
na_sales         6285
jp_sales        12196
pal_sales        6098
other_sales      3794
release_date       90
dtype: int64

In [37]:
data_general_clean[["na_sales", "jp_sales", "pal_sales", "other_sales"]] = data_general_clean[["na_sales", "jp_sales", "pal_sales", "other_sales"]].fillna(0)

In [38]:
data_general_clean.isnull().sum()

img                 0
title               0
console             0
genre               0
publisher           0
developer           4
critic_score    14796
total_sales         0
na_sales            0
jp_sales            0
pal_sales           0
other_sales         0
release_date       90
dtype: int64

In [44]:
data_general_clean_date = data_general_clean.copy()

data_general_clean_date["release_date"] = pd.to_datetime(data_general_clean_date["release_date"])

data_general_clean_date["month"] = data_general_clean_date["release_date"].dt.month.astype("Int64")
data_general_clean_date["year"] = data_general_clean_date["release_date"].dt.year.astype("Int64")

data_general_clean_date.sample(5)

Unnamed: 0,img,title,console,genre,publisher,developer,critic_score,total_sales,na_sales,jp_sales,pal_sales,other_sales,release_date,month,year
8307,/games/boxart/full_victorious-hollywood-arts-d...,Victorious: Hollywood Arts Debut,DS,Misc,D3 Publisher,Namco Bandai Games,,0.15,0.13,0.0,0.01,0.01,2011-11-15,11,2011
9066,/games/boxart/9590207ccc.jpg,Assault: Retribution,PS,Action,Midway Games,Candle Light Studios,,0.13,0.07,0.0,0.05,0.01,1998-10-31,10,1998
14308,/games/boxart/full_marble-madness-klax_4Americ...,Marble Madness / Klax,GBA,Puzzle,DSI Games,Frame Studios Interactive,,0.03,0.02,0.0,0.01,0.0,2005-08-16,8,2005
11824,/games/boxart/full_3418167JapanFrontccc.jpg,Kenyuu Densetsu Yaiba,SNES,Role-Playing,Banpresto,Atelier Double,,0.07,0.0,0.07,0.0,0.0,1994-03-25,3,1994
746,/games/boxart/full_lego-harry-potter-years-1-4...,LEGO Harry Potter: Years 1-4,X360,Adventure,Warner Bros. Interactive,Traveller's Tales,7.9,1.59,0.95,0.0,0.5,0.14,2010-06-29,6,2010


## When should a company release a game to maximize sales at launch?

In [54]:
fig = px.line(
    data_general_clean_date,
    x="month",
    y="total_sales",
    color="category" if "category" in data_general_clean_date.columns else None,  # optional grouping
    animation_frame="year",   # Each frame = different year
    markers=True,
    labels={
        "total_sales": "Sales",
        "month": "Month",
        "year": "Year"
    },
    title="Monthly Sales Animated by Year"
)

# Make sure months appear in the right order (not alphabetical)
fig.update_xaxes(categoryorder="array", categoryarray=[
    "Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"
])

fig.show()

TypeError: boolean value of NA is ambiguous