In [5]:
import pandas as pd
import numpy as np

# Read the sales CSV into a DataFrame; the relative path is resolved against
# the notebook's working directory.
csv_path = 'Video_Games_Sales_as_at_22_Dec_2016.csv'
data = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows. Leaving the frame as the cell's last expression
# uses the notebook's rich table display instead of the wrapped plain-text
# chunks that print() produces for a wide frame.
data.head()

                       Name Platform  Year_of_Release         Genre Publisher  \
0                Wii Sports      Wii           2006.0        Sports  Nintendo   
1         Super Mario Bros.      NES           1985.0      Platform  Nintendo   
2            Mario Kart Wii      Wii           2008.0        Racing  Nintendo   
3         Wii Sports Resort      Wii           2009.0        Sports  Nintendo   
4  Pokemon Red/Pokemon Blue       GB           1996.0  Role-Playing  Nintendo   

   NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales  Critic_Score  \
0     41.36     28.96      3.77         8.45         82.53          76.0   
1     29.08      3.58      6.81         0.77         40.24           NaN   
2     15.68     12.76      3.79         3.29         35.52          82.0   
3     15.61     10.93      3.28         2.95         32.77          80.0   
4     11.27      8.89     10.22         1.00         31.37           NaN   

   Critic_Count User_Score  User_Count Developer Rating 

In [7]:
# Clean the frame in a single chained expression that rebinds `data` instead of
# mutating the loaded frame in place (no inplace=True, no column overwrite).
data = (
    data
    # Convert User_Score to a numeric type; unparseable entries such as 'tbd' become NaN
    .assign(User_Score=lambda frame: pd.to_numeric(frame['User_Score'], errors='coerce'))
    # Drop rows where any of the key metrics is missing
    .dropna(subset=['Year_of_Release', 'Critic_Score', 'User_Score', 'Publisher'])
)

In [9]:
# Preview the first five rows of the current frame. Leaving it as the cell's
# last expression uses the notebook's rich table display instead of the wrapped
# plain-text chunks that print() produces for a wide frame.
data.head()

                    Name Platform  Year_of_Release     Genre Publisher  \
0             Wii Sports      Wii           2006.0    Sports  Nintendo   
2         Mario Kart Wii      Wii           2008.0    Racing  Nintendo   
3      Wii Sports Resort      Wii           2009.0    Sports  Nintendo   
6  New Super Mario Bros.       DS           2006.0  Platform  Nintendo   
7               Wii Play      Wii           2006.0      Misc  Nintendo   

   NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales  Critic_Score  \
0     41.36     28.96      3.77         8.45         82.53          76.0   
2     15.68     12.76      3.79         3.29         35.52          82.0   
3     15.61     10.93      3.28         2.95         32.77          80.0   
6     11.28      9.14      6.50         2.88         29.80          89.0   
7     13.96      9.18      2.93         2.84         28.92          58.0   

   Critic_Count  User_Score  User_Count Developer Rating  
0          51.0         8.0       322.0

In [11]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that emits a stray "None" after the summary).
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6893 entries, 0 to 16709
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             6893 non-null   object 
 1   Platform         6893 non-null   object 
 2   Year_of_Release  6893 non-null   float64
 3   Genre            6893 non-null   object 
 4   Publisher        6893 non-null   object 
 5   NA_Sales         6893 non-null   float64
 6   EU_Sales         6893 non-null   float64
 7   JP_Sales         6893 non-null   float64
 8   Other_Sales      6893 non-null   float64
 9   Global_Sales     6893 non-null   float64
 10  Critic_Score     6893 non-null   float64
 11  Critic_Count     6893 non-null   float64
 12  User_Score       6893 non-null   float64
 13  User_Count       6893 non-null   float64
 14  Developer        6889 non-null   object 
 15  Rating           6825 non-null   object 
dtypes: float64(10), object(6)
memory usage: 915.5+ KB
None


In [13]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
# Left as the cell's last expression so the notebook renders one rich table
# rather than print()'s wrapped plain-text chunks.
data.describe()

       Year_of_Release     NA_Sales     EU_Sales     JP_Sales  Other_Sales  \
count      6893.000000  6893.000000  6893.000000  6893.000000  6893.000000   
mean       2007.481793     0.390968     0.234536     0.063876     0.082012   
std           4.236497     0.963293     0.684262     0.286481     0.268637   
min        1985.000000     0.000000     0.000000     0.000000     0.000000   
25%        2004.000000     0.060000     0.020000     0.000000     0.010000   
50%        2007.000000     0.150000     0.060000     0.000000     0.020000   
75%        2011.000000     0.390000     0.210000     0.010000     0.070000   
max        2016.000000    41.360000    28.960000     6.500000    10.570000   

       Global_Sales  Critic_Score  Critic_Count   User_Score    User_Count  
count   6893.000000   6893.000000   6893.000000  6893.000000   6893.000000  
mean       0.771576     70.261860     28.842449     7.184985    174.366894  
std        1.954908     13.859256     19.195964     1.439028    58

In [15]:
# Columns summarised in this section
analysis_cols = ['Global_Sales', 'Critic_Score', 'User_Score', 'User_Count']


def _first_mode(series):
    """Return the smallest most-frequent value of `series`, or NaN if it has no non-null values."""
    modes = series.mode()  # sorted ascending; empty when every value is NaN
    return modes.iloc[0] if not modes.empty else np.nan


# Build the summary table in one construction: one column per metric, one row
# per statistic (rows follow the order of the `index` labels below).
stats_df = pd.DataFrame(
    {
        col: [
            data[col].mean(),
            data[col].median(),
            _first_mode(data[col]),
            data[col].std(),
        ]
        for col in analysis_cols
    },
    index=['mean', 'median', 'mode', 'std_dev'],
)

# Last expression of the cell, so the notebook shows it as a rich table
stats_df.round(2)

         Global_Sales  Critic_Score  User_Score  User_Count
mean             0.77         70.26        7.18      174.37
median           0.29         72.00        7.50       27.00
mode             0.02         80.00        7.80        6.00
std_dev          1.95         13.86        1.44      584.91


## Correlation Between Features

In [19]:
# Restrict to the numeric sales/score/count columns used for the correlation matrix
numerical_data = data[['Global_Sales', 'Critic_Score', 'Critic_Count', 'User_Score', 'User_Count']]

# Pairwise correlation matrix (DataFrame.corr defaults to Pearson). Left as the
# cell's last expression so the notebook renders it as a rich table instead of
# print()'s plain text.
numerical_data.corr()

              Global_Sales  Critic_Score  Critic_Count  User_Score  User_Count
Global_Sales      1.000000      0.237057      0.290678    0.088329    0.264008
Critic_Score      0.237057      1.000000      0.394976    0.579437    0.265660
Critic_Count      0.290678      0.394976      1.000000    0.194814    0.365507
User_Score        0.088329      0.579437      0.194814    1.000000    0.018643
User_Count        0.264008      0.265660      0.365507    0.018643    1.000000
