In [2]:
import numpy as np
import pandas as pd

<b>Read data from csv</b>

In [57]:
# Load the sales table from 'vgsales.csv'; the relative path is resolved
# against the notebook's current working directory.
df = pd.read_csv('vgsales.csv')
# Display the first rows as a quick visual check of what was loaded.
df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


<b>Replace NaNs in Year column</b>

In [58]:
# Year column before cleaning: table size, missing values and central tendency.
print(' '.join([
    f'Number of rows:\t {len(df.index)}',
    f'\nNaNs count:\t {df["Year"].isna().sum()}',
    f'\nYear mean:\t {df["Year"].mean()}',
    f'\nYear median:\t {df["Year"].median()}',
    f'\nYear mode:\t {df["Year"].mode()[0]}',
]))

# Fill every missing year with the most frequent year (the mode).
year_fill_value = df['Year'].mode()[0]
df['Year'] = df['Year'].fillna(year_fill_value)

# Same statistics again, so the effect of the fill is visible side by side.
print(' '.join([
    '\nStats after NaNs replacement in Year column',
    f'\nNaNs count:\t {df["Year"].isna().sum()}',
    f'\nYear mean:\t {df["Year"].mean()}',
    f'\nYear median:\t {df["Year"].median()}',
    f'\nYear mode:\t {df["Year"].mode()[0]}',
]))

# With no NaNs left, the float years can be stored as 32-bit integers.
df['Year'] = df['Year'].astype('int32')

print('\nDataframe after casting Year column to int32')

df.head()

Number of rows:	 16598 
NaNs count:	 271 
Year mean:	 2006.4064433147546 
Year median:	 2007.0 
Year mode:	 2009.0

Stats after NaNs replacement in Year column 
NaNs count:	 0 
Year mean:	 2006.448789010724 
Year median:	 2007.0 
Year mode:	 2009.0

Dataframe after casting Year column to int32


Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


<b>Describe each column in the dataset</b>

In [70]:
# TODO: add a describe function for the Year column (min, max and other stats, as for a qualitative column) and find out why the Rank max (16600) is greater than the row count (16598)
def describe_rank_column(column, description):
    """Print a short summary of a ranking column.

    The report contains a header with the column name, the supplied
    description, the number of missing values, the number of non-missing
    values, and the smallest and largest values.

    Args:
        column: pandas Series to summarise.
        description: Text printed directly under the header.
    """
    missing = column.isna().sum()
    present = column.count()
    lowest = column.min()
    highest = column.max()
    report = [
        f'=== {column.name} column description ===',
        f'\n\n{description}',
        f'\n\nNA values: \t{missing}',
        f'\nValues count: \t{present}',
        f'\nMin: \t\t{lowest}',
        f'\nMax: \t\t{highest}\n',
    ]
    # A single-space join reproduces print()'s default separator between arguments.
    print(' '.join(report))

def describe_name_column(column, description):
    """Print a summary of a free-text name column.

    The report contains a header with the column name, the supplied
    description, the number of missing values, the number of non-missing
    values, and the longest and shortest names with their character counts.
    The two "name game" URLs are fixed strings printed verbatim; they are
    not derived from ``column``.

    Missing values are left out of the longest/shortest search. If the
    column has no non-missing values, 'n/a' is printed for both.

    Args:
        column: pandas Series holding the names.
        description: Text printed directly under the header.
    """
    # Measure real names only: converting the raw column with astype(str)
    # would turn a missing value into text (e.g. 'nan') that could then be
    # reported as the shortest "name".
    names = column.dropna().astype(str)
    field_length = names.map(len)

    if field_length.empty:
        longest = shortest = 'n/a'
    else:
        # Look names up by position so that repeated index labels still
        # yield exactly one name.
        by_position = field_length.reset_index(drop=True)
        longest_at = by_position.idxmax()
        shortest_at = by_position.idxmin()
        longest = f'{names.iloc[longest_at]} ({by_position[longest_at]})'
        shortest = f'{names.iloc[shortest_at]} ({by_position[shortest_at]})'

    print(f'=== {column.name} column description ===',
          f'\n\n{description}',
          f'\n\nNA values: \t{column.isna().sum()}',
          f'\nValues count: \t{column.count()}',
          f'\nLongest name: \t{longest}',
          '\nLongest name game: https://www.play-asia.com/ds-yamamura-misa-suspense-maiko-kogiku-kisha-katherine-sougi/13/702on0',
          f'\nShortest name: \t{shortest}',
          '\nShortest name game: https://en.wikipedia.org/wiki/D_(video_game)\n')
    
def describe_quantitative_column(column, description):
    """Print a summary of a numeric column.

    The report contains a header with the column name, the supplied
    description, the number of missing values, the variance, and the output
    of ``column.describe()``.

    Args:
        column: pandas Series of numeric values.
        description: Text printed directly under the header.
    """
    # Series.var() skips NaN and uses ddof=1, the same convention as the
    # ``std`` row printed by describe() just below, so Var equals std ** 2.
    # (np.nanvar defaults to ddof=0 and would disagree with that row.)
    variance = column.var()
    print(f'=== {column.name} column description ===',
          f'\n\n{description}',
          f'\n\nNA values:   {column.isna().sum()}',
          f'\nVar: \t     {variance}',
          f'\n{column.describe()}\n')

def describe_qualitative_column(column, description):
    """Print a summary of a categorical column.

    The report contains a header with the column name, the supplied
    description, the number of missing values, and the frequency table
    produced by ``column.value_counts()``.

    Args:
        column: pandas Series of category-like values.
        description: Text printed directly under the header.
    """
    header = f'=== {column.name} column description ==='
    na_total = column.isna().sum()
    frequencies = column.value_counts()
    # A single-space join reproduces print()'s default separator between arguments.
    print(' '.join((
        header,
        f'\n\n{description}',
        f'\n\nNA values:  {na_total}',
        f'\nValue  count\n{frequencies}\n',
    )))
    
# Print one report per column, in reading order: rank, name, the
# qualitative columns, then the sales figures.
describe_rank_column(df['Rank'], 'Game position in ranking based on global sales')

describe_name_column(df['Name'], 'The name of the game')

platform_description = (
    'Platform of the games release (Sega Saturn(SAT), Atari2600(2600),'
    ' Dreamcast(DC), Sega Genesis(GEN), Neo Geo(NG), \nSupplemental Computing Device(SCD),'
    ' WonderSwan(WS), Panasonic 3DO(3DO), TurboGrafx-16(TG16), NEC PC-FX(PCFX),'
    ' \nSega Game Gear(GG) etc.)'
)

# Categorical columns share one report format.
for column_name, column_description in (
    ('Platform', platform_description),
    ('Genre', 'Genre of the game'),
    ('Publisher', 'Publisher of the game'),
):
    describe_qualitative_column(df[column_name], column_description)

# Sales columns share the numeric report format.
for column_name, column_description in (
    ('NA_Sales', 'Sales in North America (in millions)'),
    ('EU_Sales', 'Sales in Europe (in millions)'),
    ('JP_Sales', 'Sales in Japan (in millions)'),
    ('Other_Sales', 'Sales in the rest of the world (in millions)'),
    ('Global_Sales', 'Total worldwide sales (in millions)'),
):
    describe_quantitative_column(df[column_name], column_description)

=== Rank column description === 

Game position in ranking based on global sales 

NA values: 	0 
Values count: 	16598 
Min: 		1 
Max: 		16600

=== Name column description === 

The name of the game 

NA values: 	0 
Values count: 	16598 
Longest name: 	DS Yamamura Misa Suspense: Maiko Kogiku - Kisha Katherine - Sougiya Isa Akashi - Koto ni Maru Hana Sanrin: Kyoto Satujin Jinken File (132) 
Longest name game: https://www.play-asia.com/ds-yamamura-misa-suspense-maiko-kogiku-kisha-katherine-sougi/13/702on0 
Shortest name: 	D (1) 
Shortest name game: https://en.wikipedia.org/wiki/D_(video_game)

=== Platform column description === 

Platform of the games release (Sega Saturn(SAT), Atari2600(2600), Dreamcast(DC), Sega Genesis(GEN), Neo Geo(NG), 
Supplemental Computing Device(SCD), WonderSwan(WS), Panasonic 3DO(3DO), TurboGrafx-16(TG16), NEC PC-FX(PCFX), 
Sega Game Gear(GG) etc.) 

NA values:  0 
Value  count
DS      2163
PS2     2161
PS3     1329
Wii     1325
X360    1265
PSP     1213
PS   