## Data Analysis with Pandas
## Bike Rental Stats 
### Nick Dorkins 1/20/2021

In [156]:
import pandas as pd
import numpy as np

In [157]:
df = pd.read_csv('./vgsales.csv')

In [158]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          16598 non-null  int64  
 1   Name          16598 non-null  object 
 2   Platform      16598 non-null  object 
 3   Year          16327 non-null  float64
 4   Genre         16598 non-null  object 
 5   Publisher     16540 non-null  object 
 6   NA_Sales      16598 non-null  float64
 7   EU_Sales      16598 non-null  float64
 8   JP_Sales      16598 non-null  float64
 9   Other_Sales   16598 non-null  float64
 10  Global_Sales  16598 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB


In [159]:
# Which company is the most common video game publisher?
most_published = df['Publisher'].mode()[0]
most_published


'Electronic Arts'

In [160]:
# What’s the most common platform?
most_common_platform = df['Platform'].mode()[0]
most_common_platform


'DS'

In [161]:
# What about the most common genre?
most_common_genre = df['Genre'].mode()[0]
most_common_genre


'Action'

In [162]:
# What are the top 20 highest grossing games?
df[['Global_Sales','Name']].set_index('Global_Sales').sort_values('Global_Sales', ascending=False).head(20)


Unnamed: 0_level_0,Name
Global_Sales,Unnamed: 1_level_1
82.74,Wii Sports
40.24,Super Mario Bros.
35.82,Mario Kart Wii
33.0,Wii Sports Resort
31.37,Pokemon Red/Pokemon Blue
30.26,Tetris
30.01,New Super Mario Bros.
29.02,Wii Play
28.62,New Super Mario Bros. Wii
28.31,Duck Hunt


In [163]:
# For North American video game sales, what’s the median?
median_sales = df['NA_Sales'].median(skipna = True)
median_sales


0.08

In [164]:
# Provide a secondary output showing ten games surrounding the median sales output
# assume that games with same median value are sorted in descending order


# median_ten = df['NA_Sales'].median(10)
# median_ten

# df[['NA_Sales','Name']].set_index('NA_Sales').sort_values('NA_Sales', ascending=False).head(10)

# below_median = df[df['NA_Sales'] < df['NA_Sales'].median()][['Name','NA_Sales']].sort_values('NA_Sales').tail(5)

north_american_sales = df.sort_values('NA_Sales', ascending = False)
median_ten = (north_american_sales.shape[0] +1) // 2
median_ten


8299

In [165]:
north_american_sales.iloc[median_ten -5: median_ten +6]


Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
10802,10804,Bladestorm: The Hundred Years' War,X360,2007.0,Action,Tecmo Koei,0.08,0.01,0.0,0.01,0.1
9569,9571,Batman: Arkham Knight,PC,2015.0,Action,Warner Bros. Interactive Entertainment,0.08,0.03,0.0,0.01,0.13
8550,8552,Wakeboarding Unleashed Featuring Shaun Murray,PS2,2003.0,Sports,Activision,0.08,0.06,0.0,0.02,0.16
10799,10801,Mind Quiz,PSP,2006.0,Action,Ubisoft,0.08,0.01,0.0,0.01,0.1
9197,9199,The Walking Dead: Season Two,XOne,2014.0,Adventure,Telltale Games,0.08,0.05,0.0,0.01,0.14
9192,9194,Dino Crisis 3,XB,2003.0,Action,Capcom,0.08,0.03,0.03,0.0,0.14
10181,10183,Smash Court Tennis 3,X360,2007.0,Sports,Atari,0.08,0.02,0.0,0.01,0.11
8802,8804,Disney's The Lion King: Simba's Mighty Adventure,PS,2000.0,Action,Activision,0.08,0.06,0.0,0.01,0.15
8724,8726,Driven,PS2,2001.0,Racing,BAM! Entertainment,0.08,0.06,0.0,0.02,0.15
10793,10795,Glory Days 2,DS,2007.0,Strategy,Ghostlight,0.08,0.0,0.0,0.01,0.1


In [181]:
# For the top-selling game of all time, how many standard deviations above/below the mean are its sales for North America?

top_seller = df.sort_values('Global_Sales', ascending=False).head(1)
global_sales_mean = df['Global_Sales'].mean()
global_stds = df['Global_Sales'].std()
top_game_sales = top_seller.loc[0]['Global_Sales']
total_stds = ((top_game_sales - global_sales_mean) / global_stds)


total_stds


52.862432541684456

In [183]:
# The Nintendo Wii seems to have outdone itself with games. How does its average number of sales compare with all of the other platforms?

get_wii_games = df[df['Platform'] == 'Wii']
get_wii_games

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
7,8,Wii Play,Wii,2006.0,Misc,Nintendo,14.03,9.20,2.93,2.85,29.02
8,9,New Super Mario Bros. Wii,Wii,2009.0,Platform,Nintendo,14.59,7.06,4.70,2.26,28.62
...,...,...,...,...,...,...,...,...,...,...,...
16517,16520,Teenage Mutant Ninja Turtles,Wii,2007.0,Action,Konami Digital Entertainment,0.00,0.01,0.00,0.00,0.01
16552,16555,Mahou Sensei Negima!? Neo-Pactio Fight!!,Wii,2007.0,Fighting,Marvelous Interactive,0.00,0.00,0.01,0.00,0.01
16573,16576,Mini Desktop Racing,Wii,2007.0,Racing,Popcorn Arcade,0.01,0.00,0.00,0.00,0.01
16574,16577,Yattaman Wii: BikkuriDokkiri Machine de Mou Ra...,Wii,2008.0,Racing,Takara Tomy,0.00,0.00,0.01,0.00,0.01


In [191]:
mean_wii_sales = get_wii_games['Global_Sales'].mean()
mean_global = df['Global_Sales'].mean()
wii_difference = mean_wii_sales - mean_global

wii_difference


0.1619631180842429

In [None]:
# Come up with 3 more questions that can be answered with this data set.
