## Heroes of Pymoli Video Game Demographics Analysis

In [1]:
import pandas as pd
import numpy as np

# pandas tools
from functools import reduce

### Import Data 

The Assumption is that the dataset contains only sales data and does not need to be adjusted for returns or refunds. 

In [2]:
filepath = './HeroesOfPymoli_Resources_purchase_data.csv'

In [3]:
df = pd.read_csv(filepath, encoding="ISO-8859-1")

In [4]:
# Preview of Purchase Analysis
df.head(10)

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,2,Ithergue48,24,Male,92,Final Critic,4.88
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,4,Iskosia90,23,Male,131,Fury,1.44
5,5,Yalae81,22,Male,81,Dreamkiss,3.61
6,6,Itheria73,36,Male,169,"Interrogator, Blood Blade of the Queen",2.18
7,7,Iskjaskst81,20,Male,162,Abyssal Shard,2.67
8,8,Undjask33,22,Male,21,Souleater,1.1
9,9,Chanosian48,35,Other / Non-Disclosed,136,Ghastly Adamantite Protector,3.58


In [5]:
# Identify incomplete rows
df.count()

Purchase ID    780
SN             780
Age            780
Gender         780
Item ID        780
Item Name      780
Price          780
dtype: int64

In [6]:
# Review Data Types
df.dtypes

Purchase ID      int64
SN              object
Age              int64
Gender          object
Item ID          int64
Item Name       object
Price          float64
dtype: object

### Data Analysis

In [7]:
# Calculate the Total Number of Players
playercount = df['SN'].nunique()
playercount

576

In [8]:
## To confirm a unique Item ID per Item Name
# count the number of unique id numbers per item name
num_of_ids_per_item_name = df.groupby('Item Name')['Item ID'].nunique().reset_index(name= "# of Item IDs")

# Only show values greater than 1
items_with_duplicate_ids  = num_of_ids_per_item_name.loc[num_of_ids_per_item_name['# of Item IDs'] > 1].reset_index(drop=True)
items_with_duplicate_ids

Unnamed: 0,Item Name,# of Item IDs
0,Crucifer,2
1,Final Critic,2
2,Persuasion,2
3,Stormcaller,2


In [9]:
## Purchase Analysis Subtotal Row
### Calculate Number of Unique Items, Avg purch Price, Total Num of Puchases & Total Revenue

# Due to the duplicate item ids, we will be using the item names to count unique purchases
num_of_unique_products = round(df['Item Name'].nunique(),1)

avg_purch_price = round(df['Price'].mean(),1)

purchases_qty = round(df['Purchase ID'].count(),1)

total_revenue = round(df['Price'].sum(),1)


# create a subtotal row
subtotal_row = pd.DataFrame({"Number of Unique Items": num_of_unique_products,
                                      "Average Purchase Price":avg_purch_price,
                                      "Total Number of Purchases": purchases_qty,
                                      "Total Revenue":total_revenue}, index=[0])
subtotal_row

Unnamed: 0,Number of Unique Items,Average Purchase Price,Total Number of Purchases,Total Revenue
0,179,3.1,780,2379.8


### Gender Demographics

In [10]:
# Gender demographics
# number of players per gender
players = df.groupby('Gender')['SN'].nunique()
players

Gender
Female                    81
Male                     484
Other / Non-Disclosed     11
Name: SN, dtype: int64

In [11]:
### Gender Summary Table

# to assign demographic gender values 
male_players = players['Male']
female_players = players['Female']
other_players = players['Other / Non-Disclosed']


# calculating the % Count per Gender Bracket
maleplayers_perc = (male_players /playercount)*100
femaleplayers_perc = (female_players /playercount)*100
otherplayers_perc = (other_players /playercount)*100

# summarize data into a table for review
gender_demographic = pd.DataFrame([
    {"Description":"Male Players","Count": male_players,"Percentage": maleplayers_perc},
    {"Description":"Female Players","Count": female_players,"Percentage": femaleplayers_perc},
    {"Description":"Other Players","Count": other_players,"Percentage": otherplayers_perc}
])

gender_demographic

Unnamed: 0,Description,Count,Percentage
0,Male Players,484,84.027778
1,Female Players,81,14.0625
2,Other Players,11,1.909722


### Gender Demographics Analysis

In [12]:
# to calculate purchasing behaviors by gender
gender_demo_analysis = df.groupby('Gender')['Price'].agg(['mean','sum']).reset_index()
gender_demo_analysis

Unnamed: 0,Gender,mean,sum
0,Female,3.203009,361.94
1,Male,3.017853,1967.64
2,Other / Non-Disclosed,3.346,50.19


In [13]:
# to rename column headers
gender_demo_analysis = gender_demo_analysis.rename(columns={'mean':'Avg Purchase Price',
                                                            'sum':'Total Purchase Value'})
gender_demo_analysis

Unnamed: 0,Gender,Avg Purchase Price,Total Purchase Value
0,Female,3.203009,361.94
1,Male,3.017853,1967.64
2,Other / Non-Disclosed,3.346,50.19


In [14]:
#extra analysis: which gender purchased at a higher magnitude?
qty_per_purchase_by_gender = df.groupby('Gender')['SN'].count().reset_index(name='Purchases per Player')

qty_per_purchase_by_gender

Unnamed: 0,Gender,Purchases per Player
0,Female,113
1,Male,652
2,Other / Non-Disclosed,15


In [15]:
# final table of gender demographic analysis

gender_demographic_analysis_final = gender_demo_analysis.merge(qty_per_purchase_by_gender, on='Gender', how='inner')
gender_demographic_analysis_final

Unnamed: 0,Gender,Avg Purchase Price,Total Purchase Value,Purchases per Player
0,Female,3.203009,361.94,113
1,Male,3.017853,1967.64,652
2,Other / Non-Disclosed,3.346,50.19,15


### Age Demographics Analysis

In [16]:
## create a new column that is filled with a description object based on the value in the age column

# create bins for the data to be held
age_bins = [0,10,14,19,24,29,34,39,100]

# create the names for the bins
brackets_descriptions = ['>10','10-14','15-19','20-24','25-29','30-34','35-39','40+']

In [17]:
# add age bracket to the original dataframe
df["Age Bracket"] = pd.cut(df["Age"], age_bins, labels= brackets_descriptions)

In [18]:
# calculate Purchase Count
age_demographic_analysis = df.groupby('Age Bracket')["Price"].agg(['count','mean','sum']).reset_index()
age_demographic_analysis

Unnamed: 0,Age Bracket,count,mean,sum
0,>10,32,3.405,108.96
1,10-14,19,2.681579,50.95
2,15-19,136,3.035956,412.89
3,20-24,365,3.052219,1114.06
4,25-29,101,2.90099,293.0
5,30-34,73,2.931507,214.0
6,35-39,41,3.601707,147.67
7,40+,13,2.941538,38.24


In [19]:
age_demographic_analysis = age_demographic_analysis.rename(columns={'count':"Purchase Count",
                                                                    'mean':"Average Purchase Price",
                                                                    'sum':"Total Purchase Value"})
age_demographic_analysis

Unnamed: 0,Age Bracket,Purchase Count,Average Purchase Price,Total Purchase Value
0,>10,32,3.405,108.96
1,10-14,19,2.681579,50.95
2,15-19,136,3.035956,412.89
3,20-24,365,3.052219,1114.06
4,25-29,101,2.90099,293.0
5,30-34,73,2.931507,214.0
6,35-39,41,3.601707,147.67
7,40+,13,2.941538,38.24


### Top 5 Spenders

In [20]:
# Top 5 Spenders: Identify Top Spenders:

# get total value of purchases by username
spend_by_username = df.groupby('SN')['Price'].sum().sort_values(ascending = False)

# extract top 5 spenders
top_5_spenders = spend_by_username.iloc[0:5].reset_index(name='Total Purchase Value')
top_5_spenders

Unnamed: 0,SN,Total Purchase Value
0,Lisosia93,18.96
1,Idastidru52,15.45
2,Chamjask73,13.83
3,Iral74,13.62
4,Iskadarya95,13.1


In [21]:
# List of top 5 spenders
listoftopspenders = list(top_5_spenders['SN'])
listoftopspenders

['Lisosia93', 'Idastidru52', 'Chamjask73', 'Iral74', 'Iskadarya95']

In [22]:
# Top 5 Spenders
top_5_spenders_df = df.loc[df['SN'].isin(listoftopspenders)]

In [23]:
#Top 5 spenders: Average Purchase Price
top5_analysis = top_5_spenders_df.groupby('SN')['Price'].agg(['count','mean','sum']).reset_index().sort_values(by='count',ascending = False)

top5_analysis

Unnamed: 0,SN,count,mean,sum
4,Lisosia93,5,3.792,18.96
1,Idastidru52,4,3.8625,15.45
2,Iral74,4,3.405,13.62
0,Chamjask73,3,4.61,13.83
3,Iskadarya95,3,4.366667,13.1


In [24]:
# finalize dataset format
top5_analysis = top5_analysis.rename(columns={'count':'Number of Purchases',
                                              'mean':'Avg. Purchase Price',
                                              'sum':'Total Purchase Value'})

top5_analysis

Unnamed: 0,SN,Number of Purchases,Avg. Purchase Price,Total Purchase Value
4,Lisosia93,5,3.792,18.96
1,Idastidru52,4,3.8625,15.45
2,Iral74,4,3.405,13.62
0,Chamjask73,3,4.61,13.83
3,Iskadarya95,3,4.366667,13.1


### Most Popular Items

In [25]:
# to calculate the number of times a game was purchased

popular_games = df.groupby('Item ID')['Price'].count().sort_values(ascending = False).reset_index(name= "Purchase Count")
popular_games

Unnamed: 0,Item ID,Purchase Count
0,178,12
1,82,9
2,108,9
3,145,9
4,92,8
...,...,...
178,42,1
179,118,1
180,104,1
181,27,1


In [26]:
# Top 5 gamer ids

top5_popular_game_id = popular_games['Item ID'][0:5]
top5_popular_game_id

0    178
1     82
2    108
3    145
4     92
Name: Item ID, dtype: int64

In [27]:
# Filter DF by popular games

popular_games_df = df.loc[df['Item ID'].isin(top5_popular_game_id)]
popular_games_df.head()

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price,Age Bracket
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53,20-24
2,2,Ithergue48,24,Male,92,Final Critic,4.88,20-24
18,18,Reunasu60,22,Female,82,Nirvana,4.9,20-24
25,25,Lisirra87,29,Male,178,"Oathbreaker, Last Hope of the Breaking Storm",4.23,25-29
56,56,Raesty92,12,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53,10-14


In [28]:
# perform standard agg metrics to dataset

popular_games_analysis = popular_games_df.groupby(['Item ID',"Item Name"])['Price'].agg(['count','mean','sum']).reset_index().sort_values(by='count',ascending = False)
popular_games_analysis

Unnamed: 0,Item ID,Item Name,count,mean,sum
4,178,"Oathbreaker, Last Hope of the Breaking Storm",12,4.23,50.76
0,82,Nirvana,9,4.9,44.1
2,108,"Extraction, Quickblade Of Trembling Hands",9,3.53,31.77
3,145,Fiery Glass Crusader,9,4.58,41.22
1,92,Final Critic,8,4.88,39.04


In [29]:
# final dataset format
popular_games_analysis = popular_games_analysis.rename(columns={'count':'Number of Purchases',
                                              'mean':'Avg. Purchase Price',
                                              'sum':'Total Purchase Value'})

popular_games_analysis

Unnamed: 0,Item ID,Item Name,Number of Purchases,Avg. Purchase Price,Total Purchase Value
4,178,"Oathbreaker, Last Hope of the Breaking Storm",12,4.23,50.76
0,82,Nirvana,9,4.9,44.1
2,108,"Extraction, Quickblade Of Trembling Hands",9,3.53,31.77
3,145,Fiery Glass Crusader,9,4.58,41.22
1,92,Final Critic,8,4.88,39.04


### Most Profitable Items

In [30]:
listofprofitgames_df = df.groupby('Item ID')['Price'].sum().sort_values(ascending = False).reset_index(name= "Total Purchase Value")
top5profitgames_df= listofprofitgames_df.iloc[0:5,0:2]

In [31]:
list_of_profit_games_df = list(top5profitgames_df.iloc[0:5,0])

In [32]:
profitgames_df = df.loc[df['Item ID'].isin(list_of_profit_games_df)]

In [33]:
profit_purchasecount_df = profitgames_df.groupby('Item ID')['Price'].count().reset_index(name ='Purchase Count')

In [34]:
itemnames_profit = profitgames_df[['Item ID', 'Item Name']].drop_duplicates()

In [35]:
itemprice_profit = profitgames_df.groupby('Item ID')['Price'].mean().reset_index(name ='Purchase Price')

In [36]:
#merge most popluar games dfs

profitable_itemnamesandpurchcount_df= pd.merge(itemnames_profit,profit_purchasecount_df, how = 'outer', on = "Item ID")
profitable_itemprice_merge_df = pd.merge(profitable_itemnamesandpurchcount_df, itemprice_profit, how = 'outer', on= "Item ID")
mostprofit_final_df = pd.merge(profitable_itemprice_merge_df,top5profitgames_df, how = 'outer', on = 'Item ID')
mostprofit_final_df.set_index("Item ID").reset_index()

Unnamed: 0,Item ID,Item Name,Purchase Count,Purchase Price,Total Purchase Value
0,92,Final Critic,8,4.88,39.04
1,82,Nirvana,9,4.9,44.1
2,178,"Oathbreaker, Last Hope of the Breaking Storm",12,4.23,50.76
3,145,Fiery Glass Crusader,9,4.58,41.22
4,103,Singed Scalpel,8,4.35,34.8


### <font color= green> Pandas Analysis: Conclusions </font>
#### Gender: 
- Men have more presence, yet woman pay more per order. 

#### Age: 
- Ages 20 through 24 are the most active demographic. 
- Ages 35 through 40 spend more per purchase. 

#### Profitability & Popularity: 
- 3 out of the 5 most propular games are the most profitable games, which means the product was well priced and marketed. 

###  Review Dataframes Integrity

Since the dataset provides more than 1 unique identifier to categorize the data, we finished our analysis without correcting the Item ID column. 

Typically, the Item ID issue would be brought to the product supervisor to see if the Item ID or the Item description needs to revised.

In [37]:
items_with_duplicate_ids

Unnamed: 0,Item Name,# of Item IDs
0,Crucifer,2
1,Final Critic,2
2,Persuasion,2
3,Stormcaller,2


#### Table of Item IDs for Item Name: Crucifer

In [38]:
#to create a dataframe to review the item Ids for "Crucifer"
cruciferIDs_df = df.loc[df['Item Name'] == 'Crucifer']
cruciferIDs_df

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price,Age Bracket
121,121,Ilosia37,23,Male,1,Crucifer,3.26,20-24
251,251,Iasur80,22,Male,23,Crucifer,1.99,20-24
454,454,Lassimla92,25,Male,1,Crucifer,3.26,25-29
697,697,Tyaelistidru84,19,Female,1,Crucifer,3.26,15-19


#### Troubleshooting Item ID
Using the game "Crucifer" as an example, there are two Item IDs. However, the product with the item id '23' was purchased at a discounted rate. 

To assist in the trouble shooting, we'll see if discounts are normal for products.

Also, if they are, are they given a new Item ID & Item Name or is there a techical error where they are being grouped together? 

#### Table of Items with an 'Item ID' of 23

In [39]:
#to review is Item ID # 23.0 is also for a another product
#maybe it was a misclassfication?

itemid23_df=df.loc[df['Item ID'] == 23]
itemid23_df

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price,Age Bracket
251,251,Iasur80,22,Male,23,Crucifer,1.99,20-24


Since Item ID '23' is only for Crucifer, it appears to be about a 40% discount value. <br>
Before I rename the Item Name for Item ID '23' I will check to see if this Item ID is unique to the product.<br>

#### Table of Discounted Items

In [41]:
# Create a stat table of the product names and prices
# -----------
## By using item name as oppose to id, we can see if the full priced and discounted items 
### are being grouped together by product id. 
# -----------
## If it was done with 'best practices' they would have different item ID and Item Names to 
## provide proper marketing performance tracking per promotional item.

std_of_price_per_item = round(df.groupby('Item Name')['Price'].describe().reset_index(),2).fillna(0)

#------------
# A STD value over 0 implies a variance in the purchase price & would confirm that marketing promotions
## are not being easily tracked separately

marketing_promo_products = std_of_price_per_item.loc[std_of_price_per_item['std'] > .01]
marketing_promo_products

Unnamed: 0,Item Name,count,mean,std,min,25%,50%,75%,max
28,Crucifer,4.0,2.94,0.63,1.99,2.94,3.26,3.26,3.26
56,Final Critic,13.0,4.61,0.35,4.19,4.19,4.88,4.88,4.88
98,Persuasion,9.0,3.22,0.06,3.19,3.19,3.19,3.19,3.33
137,Stormcaller,3.0,2.59,0.66,2.21,2.21,2.21,2.78,3.36


In [43]:
std_of_price_per_item_by_id = round(df.groupby('Item ID')['Price'].describe().reset_index(),2).fillna(0)

marketing_promo_products_by_id = std_of_price_per_item_by_id.loc[std_of_price_per_item_by_id['std'] > .01]
marketing_promo_products_by_id

# no result confirms that each item id has a unique price point

Unnamed: 0,Item ID,count,mean,std,min,25%,50%,75%,max


This table above shows us that there are unique IDs for promotional items, however, the Item Name is the same. 


The data is complete but a follow-up question would need to be asked of the manager to make them aware that they can review the marketing promotions separately or review the net result. 