In [240]:
# Dependencies and Setup
import pandas as pd
import numpy as np

# File to Load (Remember to Change These)
file_to_load = "purchase_data.csv"

# Read Purchasing File and store into Pandas data frame
purchase_data = pd.read_csv(file_to_load)

#create variable for total number of players and use .count for one column to determine total number of players
Total_Players = purchase_data["Purchase ID"].count()
print(f"Total number of players is {Total_Players}")

#list column names
list(purchase_data)
purchase_data.head()

Total number of players is 780


Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,2,Ithergue48,24,Male,92,Final Critic,4.88
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,4,Iskosia90,23,Male,131,Fury,1.44


In [241]:
##Number of unique items

#unique function to determine number of unique Item ID entries
unique_items=purchase_data["Item ID"].unique()

##average purchase price and total revenue
avg_price = purchase_data["Price"].mean()
total_revenue = purchase_data["Price"].sum()

#create new data frame of summarized values:  needs formatting!
summary_df=pd.DataFrame({"Number of Unique Items": [len(unique_items)],
                        "Average Price": [avg_price],
                        "Number of Purchases": [Total_Players],
                        "Total Revenue": [total_revenue]  
})

##Change formatting for average price and total revenue to currency using map
summary_df["Average Price"]=summary_df["Average Price"].map("${:.2f}".format)
summary_df["Total Revenue"]=summary_df["Total Revenue"].map("${:.2f}".format)

#display summary data table
summary_df.head()



Unnamed: 0,Number of Unique Items,Average Price,Number of Purchases,Total Revenue
0,183,$3.05,780,$2379.77


In [242]:
#create new dataframe to drop duplicate screen names to perform gender count
gender_demo=purchase_data.drop_duplicates("SN")

#use value counts to count number of each gender and percentage of each
count=gender_demo["Gender"].value_counts()
count_per=gender_demo["Gender"].value_counts(normalize=True)


#create summary data frame with total counts and percentages from variables above
gender_summary=pd.DataFrame({"Total Count":count,
                             "Percentage of Players":count_per*100                          
                            })

#format percentage column to two decimal places
gender_summary["Percentage of Players"]=gender_summary["Percentage of Players"].map("{:.2f}".format)

#display gender summary table
gender_summary

Unnamed: 0,Total Count,Percentage of Players
Male,484,84.03
Female,81,14.06
Other / Non-Disclosed,11,1.91


In [243]:
#create object that is grouped by Gender
gender_df=purchase_data.groupby("Gender")

#total purchase value is sum of Price column
total_purchase_value=gender_df["Price"].sum()

#average per gender must be divided by the count defined above for unique counts of gender, not total appearances of each gender
avg_per_person=total_purchase_value/count
avg_per_person

#aveage purchase price is mean of price column
average_purchase_price=gender_df["Price"].mean()
average_purchase_price

#find total number of each gender from original data fram
total_count=purchase_data["Gender"].value_counts()

#create new summary data frame
gender_purchases=pd.DataFrame({ "Purchase Count": total_count,
                               "Average Purchase Price":average_purchase_price,
                               "Total Purchase Value":total_purchase_value,
                               "Average Per Person": avg_per_person,
                               })

#format summary table into
gender_purchases["Average Purchase Price"]=gender_purchases["Average Purchase Price"].map("${:.2f}".format)
gender_purchases["Total Purchase Value"]=gender_purchases["Total Purchase Value"].map("${:.2f}".format)
gender_purchases["Average Per Person"]=gender_purchases["Average Per Person"].map("${:.2f}".format)

gender_purchases


Unnamed: 0,Purchase Count,Average Purchase Price,Total Purchase Value,Average Per Person
Female,113,$3.20,$361.94,$4.47
Male,652,$3.02,$1967.64,$4.07
Other / Non-Disclosed,15,$3.35,$50.19,$4.56


In [244]:
#drop duplicates to get unique data set of total players
age_group_df=purchase_data.drop_duplicates("SN")

#identify max age to find highest level and spacing for bins
#max_age=age_group_df["Age"].max()
#print(max_age)

#create bins and group labels for the associated age ranges
bins = [0, 9, 14, 19, 24, 29, 34, 39, 50 ]
group_labels = ["<10", "10-14", "15-19", "20-24", "25-29", "30-34", "35-39", "40+"]

#create new column for the age range by cutting the ages into defined bins
age_group_df["Age Range"]=pd.cut(age_group_df["Age"], bins, labels=group_labels)

#create new object to group by age range
age_summary=age_group_df.groupby("Age Range")

#use Purchase ID column to count the number of items in each age bin
ID_counts_bins=age_summary["Purchase ID"].count()

#create a new data frame for the counts for each age bin
age_summary_df=pd.DataFrame({"Total Count":ID_counts_bins
    
    
})

#add a column to the new dataframe for the percentage of players, round to two decimal places
age_summary_df["Percentage of Players"]=round(age_summary_df["Total Count"]/(age_summary_df["Total Count"].sum())*100,2)

#display summary dataframe
age_summary_df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


Unnamed: 0_level_0,Total Count,Percentage of Players
Age Range,Unnamed: 1_level_1,Unnamed: 2_level_1
<10,17,2.95
10-14,22,3.82
15-19,107,18.58
20-24,258,44.79
25-29,77,13.37
30-34,52,9.03
35-39,31,5.38
40+,12,2.08


In [245]:
#create new data frame with only purchase ID, Age and Price to be analyzed
age_purchase_df=purchase_data[["Purchase ID", "Age", "Price"]]

#create new column for age range that is cut into defined bins and labels from part 4
age_purchase_df["Age Range"]=pd.cut(age_purchase_df["Age"], bins, labels=group_labels)

#create object to group by age range
age_purc_summary=age_purchase_df.groupby("Age Range")

#count total number of purchases in each age bin
ID_counts_bins_2=age_purc_summary["Purchase ID"].count()

#average purchase price is average of the prices for each age bin
avg_purc_price=age_purc_summary["Price"].mean()

#total purchase price is the sum of the prices for each age bin
total_purc_price=age_purc_summary["Price"].sum()

#create a new dataframe for total count, average purchase price and total purchase value
age_summary_purchase_df=pd.DataFrame({"Total Count":ID_counts_bins_2,
                                      "Average Purchase Price": avg_purc_price,
                                      "Total Purchase Value": total_purc_price,
                                      
    
    
})

#add final column for average purchase per person.  Denominator is the total count of people in each age range defined in part 4
age_summary_purchase_df["Avg Total Purchase per Person"]=age_summary_purchase_df["Total Purchase Value"]/age_summary_df["Total Count"]

#display summary
age_summary_purchase_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0_level_0,Total Count,Average Purchase Price,Total Purchase Value,Avg Total Purchase per Person
Age Range,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<10,23,3.353478,77.13,4.537059
10-14,28,2.956429,82.78,3.762727
15-19,136,3.035956,412.89,3.858785
20-24,365,3.052219,1114.06,4.318062
25-29,101,2.90099,293.0,3.805195
30-34,73,2.931507,214.0,4.115385
35-39,41,3.601707,147.67,4.763548
40+,13,2.941538,38.24,3.186667


In [246]:
#create smaller datagram with purchase ID, screenname and price
sn_purchase_df=purchase_data[["Purchase ID", "SN", "Price"]]

#create group by object by screen name
sn_group=sn_purchase_df.groupby("SN")

#determine total number of purchase as number of times unique screennames appear in the data set
sn_count=purchase_data["SN"].value_counts()

sn_avg_price=sn_group["Price"].mean()
sn_total_price=sn_group["Price"].sum()

sn_summary_df=pd.DataFrame({"Total Count":sn_count,
                            "Average Purchase Price": sn_avg_price,
                            "Total Purchase Value": sn_total_price
    
    
    
})

sn_ranked_df=sn_summary_df.sort_values("Total Purchase Value", ascending=False)

sn_ranked_df["Average Purchase Price"]=sn_ranked_df["Average Purchase Price"].map("${:.2f}".format)
sn_ranked_df["Total Purchase Value"]=sn_ranked_df["Total Purchase Value"].map("${:.2f}".format)

sn_ranked_df.head(5)

Unnamed: 0,Total Count,Average Purchase Price,Total Purchase Value
Lisosia93,5,$3.79,$18.96
Idastidru52,4,$3.86,$15.45
Chamjask73,3,$4.61,$13.83
Iral74,4,$3.40,$13.62
Iskadarya95,3,$4.37,$13.10


In [271]:
#create object to group by item ID and item name
grouped_items= purchase_data.groupby(["Item ID", "Item Name"])

#create empty data fram for popular items
popular_items=pd.DataFrame()

#purchase count column is number of purchase ID's for grouped items
popular_items["Purchase Count"]=grouped_items["Purchase ID"].count()

#item price is total price divided by number of items in grouped object
popular_items["Item Price"]=grouped_items["Price"].sum()/grouped_items["Purchase ID"].count()

#total purchase value is sum of grouped prices
popular_items["Total Purchase Value"]=grouped_items["Price"].sum()

#create new dataframe of ranked items and sort by Purchase count in descending order
ranked_items=popular_items.sort_values("Purchase Count", ascending=False)

#format ranked items columns as currency
ranked_items["Item Price"]=ranked_items["Item Price"].map("${:.2f}".format)
ranked_items["Total Purchase Value"]=ranked_items["Total Purchase Value"].map("${:.2f}".format)

#display top 5 items
ranked_items.head(5)


Unnamed: 0_level_0,Unnamed: 1_level_0,Purchase Count,Item Price,Total Purchase Value
Item ID,Item Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
178,"Oathbreaker, Last Hope of the Breaking Storm",12,$4.23,$50.76
145,Fiery Glass Crusader,9,$4.58,$41.22
108,"Extraction, Quickblade Of Trembling Hands",9,$3.53,$31.77
82,Nirvana,9,$4.90,$44.10
19,"Pursuit, Cudgel of Necromancy",8,$1.02,$8.16


In [276]:
#ranked items by total purchase value uses data frame from previous exercise and simply sorts
ranked_items_2=popular_items.sort_values("Total Purchase Value", ascending=False)

#formatting item price and total purchase value columns
ranked_items_2["Item Price"]=ranked_items_2["Item Price"].map("${:.2f}".format)
ranked_items_2["Total Purchase Value"]=ranked_items_2["Total Purchase Value"].map("${:.2f}".format)

#display top 5 items
ranked_items_2.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Purchase Count,Item Price,Total Purchase Value
Item ID,Item Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
178,"Oathbreaker, Last Hope of the Breaking Storm",12,$4.23,$50.76
82,Nirvana,9,$4.90,$44.10
145,Fiery Glass Crusader,9,$4.58,$41.22
92,Final Critic,8,$4.88,$39.04
103,Singed Scalpel,8,$4.35,$34.80
