In [1]:
import pandas as pd

In [2]:
pymoli_file = "./Resources/purchase_data.csv"
pymoli_file_df = pd.read_csv(pymoli_file, encoding="ISO-8859-1")
pymoli_file_df.head()

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,2,Ithergue48,24,Male,92,Final Critic,4.88
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,4,Iskosia90,23,Male,131,Fury,1.44


In [3]:
#add ranges to orignal dataframe (destructive edit, but this does not affect other tasks)
#binning class demo day 3 - ted talks)
bins = [0, 9.5, 14.5, 19.5, 24.5, 29.5, 34.5, 39.5, 100]
age_bins = ["<10", "10-14", "15-19", "20-24", "25-29", "30-34", "35-39", "40+"]
pymoli_file_df["Age Range"] = pd.cut(pymoli_file_df["Age"], bins, labels=age_bins, include_lowest=True)
pymoli_file_df.head()

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price,Age Range
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53,20-24
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56,40+
2,2,Ithergue48,24,Male,92,Final Critic,4.88,20-24
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27,20-24
4,4,Iskosia90,23,Male,131,Fury,1.44,20-24


In [4]:
pymoli_file_df["Price"] = pymoli_file_df["Price"].astype(float).map("${:,.2f}".format)

In [5]:
pymoli_file_df.head()

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price,Age Range
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",$3.53,20-24
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,$1.56,40+
2,2,Ithergue48,24,Male,92,Final Critic,$4.88,20-24
3,3,Chamassasya86,24,Male,100,Blindscythe,$3.27,20-24
4,4,Iskosia90,23,Male,131,Fury,$1.44,20-24


In [6]:
#identify incomplete rows - classdemo (day 2 - cleaning data)
#there are no incomplete rows (no cleaning!)
pymoli_file_df.count()

Purchase ID    780
SN             780
Age            780
Gender         780
Item ID        780
Item Name      780
Price          780
Age Range      780
dtype: int64

In [7]:
#data types look correct - classdemo (day 2 - cleaning data)
pymoli_file_df.dtypes

Purchase ID       int64
SN               object
Age               int64
Gender           object
Item ID           int64
Item Name        object
Price            object
Age Range      category
dtype: object

In [8]:
#value counts - classdemo (day 1 - training grounds)
#this shows that multiple SNs show up multiple times. Can't just count index, need to count unique SNs
pymoli_file_df["SN"].value_counts().head()

Lisosia93       5
Idastidru52     4
Iral74          4
Pheodaisun84    3
Saistyphos30    3
Name: SN, dtype: int64

In [9]:
#count unique screen names. pull unique screen names and find the length. 
#https://www.geeksforgeeks.org/how-to-count-distinct-values-of-a-pandas-dataframe-column/ (about halfway down on site)
sn_count = len(pymoli_file_df["SN"].unique())
print("Total Number of Players: " + str(sn_count))

Total Number of Players: 576


In [10]:
#Player Count 
#create a data frame from above calculation (create dictionary to dataframe)
#classdemo (day 2? - creating data frames)
player_count_dict = [{"Total Players": sn_count}]
player_count_df = pd.DataFrame(player_count_dict)
player_count_df

Unnamed: 0,Total Players
0,576


In [14]:
#count unique items. pull unique items and find the length
#https://www.geeksforgeeks.org/how-to-count-distinct-values-of-a-pandas-dataframe-column/ (about halfway down on site)
number_unique_items = len(pymoli_file_df["Item Name"].unique())
print("Number of Unique Items: " + str(number_unique_items))

#count and sum
num_purchases = pymoli_file_df["Price"].count()
sum_purchases = pymoli_file_df["Price"].sum()

#sum the purchase price divided by # of purchases
avg_purchase_price = pymoli_file_df["Price"].sum()/pymoli_file_df["Price"].count()
print("Average Purchase Price: $" + str(avg_purchase_price))

#total number of purchases and total revenue
print("Total Number of Purchases: " + str(num_purchases))
print("Total Revenue: $" + str(sum_purchases))

Number of Unique Items: 179


TypeError: ufunc 'true_divide' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
#Purchasing Analysis (Total)
#create a data frame from above calculations (create dictionary to dataframe)
#classdemo (day 2? - creating data frames)
purchasing_dict = [{"Number of Unique Items": number_unique_items, "Average Purchase Price": avg_purchase_price,
                    "Total Number of Purchases": num_purchases,"Total Revenue": sum_purchases}]
purchase_analysis_df = pd.DataFrame(purchasing_dict)
purchase_analysis_df

In [None]:
#this shows only three genders were listed (no need to merge any with mispellings, etc)
pymoli_file_df["Gender"].unique()

In [None]:
#drop duplicates of SN
gender_df = pymoli_file_df[["SN", "Gender"]].drop_duplicates()

In [None]:
gender_series = gender_df["Gender"].value_counts()
gender_series

In [None]:
#Gender Demographics
#group by gender and put into dataframe
#class demo (day 2 - UFO sightings)
gender_demographics_df = pd.DataFrame({"Total Count": gender_series, 
                                       "Percentage of Players": (gender_series/sn_count)*100})
gender_demographics_df

In [None]:
#purchase analysis
#dont want to remove duplicates for this (except for avg purchase by person)
print(sum_purchases)
print(num_purchases)
avg_purchase = sum_purchases/num_purchases
print(avg_purchase)
avg_person_purchase = sum_purchases/sn_count
print(avg_person_purchase)

In [None]:
grouped_gender_analysis = pymoli_file_df.groupby(["Gender"])
grouped_gender_analysis

In [None]:
gen_df = pymoli_file_df[["Gender", "SN"]].drop_duplicates()

In [None]:
SN_series = gen_df["Gender"].value_counts()
SN_series

In [None]:
gender_purchase_count = grouped_gender_analysis["Price"].count()
gender_purchase_count

In [None]:
gender_total_purchase = grouped_gender_analysis["Price"].sum()
gender_total_purchase

In [None]:
gender_avg_purchase_price = gender_total_purchase/gender_purchase_count
gender_avg_purchase_price

In [None]:
# avg_by_person = total_purchase/purchase_count
# avg_by_person

In [None]:
#Purchasing Analysis (Gender) - need to add avg total purchase per person
gender_analysis = pd.DataFrame({"Purchase Count": gender_purchase_count, 
                                "Average Purchase Price": gender_avg_purchase_price,
                                "Total Purchase Value" : gender_total_purchase,
                                "Avg Total Purchase per Person" : gender_total_purchase/SN_series})
gender_analysis

In [None]:
grouped_age_analysis = pymoli_file_df.groupby("Age Range")
grouped_age_analysis

In [None]:
age_df = pymoli_file_df[["SN", "Age Range"]].drop_duplicates()

In [None]:
age_series = age_df["Age Range"].value_counts()
age_series

In [None]:
age_percent_players = (age_series)/(sn_count)*100
age_percent_players

In [None]:
#Age Demographics - need to sort this by age group
#class demo (day 2 - UFO sightings)
age_group_df = pd.DataFrame({"Total Count": age_series, 
                            "Percentage of Players": age_percent_players})
age_group_df

In [None]:
age_purchase_count = grouped_age_analysis["Price"].count()
age_purchase_count

In [None]:
age_total_purchase = grouped_age_analysis["Price"].sum()
age_total_purchase

In [None]:
age_avg_purchase_price = age_total_purchase/age_purchase_count
age_avg_purchase_price

In [None]:
#Purchasing Analysis (Age)
age_analysis = pd.DataFrame({"Purchase Count": age_purchase_count, 
                             "Average Purchase Price": age_avg_purchase_price,
                             "Total Purchase Value" : age_total_purchase,
                             "Avg Total Purchase per Person" : age_total_purchase/age_series})
age_analysis

In [None]:
grouped_SN_analysis = pymoli_file_df.groupby("SN")
grouped_SN_analysis

In [None]:
SN_purchase_count = grouped_SN_analysis["Price"].count()
SN_purchase_count

In [None]:
SN_total_purchase = grouped_SN_analysis["Price"].sum()
SN_total_purchase

In [None]:
SN_avg_purchase_price = SN_total_purchase/SN_purchase_count

In [None]:
SN_analysis_df = pd.DataFrame({"Purchase Count": SN_purchase_count, 
                             "Average Purchase Price": SN_avg_purchase_price,
                             "Total Purchase Value" : SN_total_purchase})

In [None]:
#Top Spenders
SN_top_five_df = SN_analysis_df.sort_values("Total Purchase Value", ascending = False)
SN_top_five_df.head()

In [None]:
#create new dataframe with three columns
#https://stackoverflow.com/questions/34682828/extracting-specific-selected-columns-to-new-dataframe-as-a-copy
item_df = pymoli_file_df[['Item ID','Item Name','Price']]
item_df

In [None]:
#https://stackoverflow.com/questions/17679089/pandas-dataframe-groupby-two-columns-and-get-counts
grouped_item_analysis = item_df.groupby(["Item ID", "Item Name"])
grouped_item_analysis

In [None]:
item_purchase_count = grouped_item_analysis["Price"].count()
item_purchase_count

In [None]:
#add formating (class demo day three - already float so just add formating)
item_total_purchase = grouped_item_analysis["Price"].sum()
item_total_purchase

In [None]:
item_price = item_total_purchase/item_purchase_count
item_price

In [None]:
item_analysis_df = pd.DataFrame({"Purchase Count": item_purchase_count, 
                                 "Item Price" : item_price,
                                 "Total Purchase Value" : item_total_purchase})

In [None]:
#Most Popular Items - need to add Item Price
popular_top_five_df = item_analysis_df.sort_values("Purchase Count", ascending = False)
popular_top_five_df.head()

In [None]:
#Most Profitable Items - need to add item price
profitable_top_five_df = item_analysis_df.sort_values("Total Purchase Value", ascending = False)
profitable_top_five_df.head()