In [4]:
import pandas as pd
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array

# Import Data

In [15]:
adclicksDF = pd.read_csv('./big_data_capstone_datasets_and_scripts/flamingo-data/ad-clicks.csv')

In [18]:
adclicksDF = adclicksDF.rename(columns=lambda x: x.strip()) #remove whitespaces from headers

# Display Data

In [19]:
adclicksDF.head()

Unnamed: 0,timestamp,txId,userSessionId,teamId,userId,adId,adCategory
0,2016-05-26 15:13:22,5974,5809,27,611,2,electronics
1,2016-05-26 15:17:24,5976,5705,18,1874,21,movies
2,2016-05-26 15:22:52,5978,5791,53,2139,25,computers
3,2016-05-26 15:22:57,5973,5756,63,212,10,fashion
4,2016-05-26 15:22:58,5980,5920,9,1027,20,clothing


In [20]:
adclicksDF['adCount'] = 1

In [21]:
adclicksDF.head(n=5)

Unnamed: 0,timestamp,txId,userSessionId,teamId,userId,adId,adCategory,adCount
0,2016-05-26 15:13:22,5974,5809,27,611,2,electronics,1
1,2016-05-26 15:17:24,5976,5705,18,1874,21,movies,1
2,2016-05-26 15:22:52,5978,5791,53,2139,25,computers,1
3,2016-05-26 15:22:57,5973,5756,63,212,10,fashion,1
4,2016-05-26 15:22:58,5980,5920,9,1027,20,clothing,1


In [22]:
buyclicksDF = pd.read_csv('./big_data_capstone_datasets_and_scripts/flamingo-data/buy-clicks.csv')
buyclicksDF = buyclicksDF.rename(columns=lambda x: x.strip()) #removes whitespaces from

In [26]:
buyclicksDF.head(n=5)

Unnamed: 0,timestamp,txId,userSessionId,team,userId,buyId,price
0,2016-05-26 15:36:54,6004,5820,9,1300,2,3.0
1,2016-05-26 15:36:54,6005,5775,35,868,4,10.0
2,2016-05-26 15:36:54,6006,5679,97,819,5,20.0
3,2016-05-26 16:36:54,6067,5665,18,121,2,3.0
4,2016-05-26 17:06:54,6093,5709,11,2222,5,20.0


# Feature Selection

In [27]:
userPurchases = buyclicksDF[['userId','price']] #select only userid and price
userPurchases.head(n=5)

Unnamed: 0,userId,price
0,1300,3.0
1,868,10.0
2,819,20.0
3,121,3.0
4,2222,20.0


In [28]:
useradClicks = adclicksDF[['userId','adCount']]

In [29]:
useradClicks.head(n=5) #as we saw before, this line displays first five lines

Unnamed: 0,userId,adCount
0,611,1
1,1874,1
2,2139,1
3,212,1
4,1027,1


# Aggregation

In [32]:
# aggregate ads count by users

In [34]:
adsPerUser = useradClicks.groupby('userId').sum()
adsPerUser = adsPerUser.reset_index()
adsPerUser.columns = ['userId', 'totalAdClicks'] #rename the columns

In [35]:
adsPerUser.head(n=5)

Unnamed: 0,userId,totalAdClicks
0,1,44
1,8,10
2,9,37
3,10,19
4,12,46


In [None]:
# aggregate revenue by users

In [39]:
revenuePerUser = userPurchases.groupby('userId').sum()
revenuePerUser = revenuePerUser.reset_index()
revenuePerUser.columns = ['userId', 'revenue'] #rename the columns

In [40]:
revenuePerUser.head(n=5)

Unnamed: 0,userId,revenue
0,1,21.0
1,8,53.0
2,9,80.0
3,10,11.0
4,12,215.0


# Merge

In [46]:
combinedDF = adsPerUser.merge(revenuePerUser, on='userId') #userid, adCount, price

In [47]:
combinedDF.head(n=5) #display how the merged table looks

Unnamed: 0,userId,totalAdClicks,revenue
0,1,44,21.0
1,8,10,53.0
2,9,37,80.0
3,10,19,11.0
4,12,46,215.0


In [48]:
trainingDF = combinedDF[['totalAdClicks','revenue']]

In [49]:
trainingDF.head(n=5)

Unnamed: 0,totalAdClicks,revenue
0,44,21.0
1,10,53.0
2,37,80.0
3,19,11.0
4,46,215.0


In [50]:
trainingDF.shape

(543, 2)

The following two commands convert the tables we created into a format that can be understood by the KMeans.train function.
- line[0] refers to the first column. 
- line[1] refers to the second column. 
- Note: If you have more than 2 columns in your training table, modify this command by adding line[2], line[3], line[4] ...

In [54]:
pDF = sqlContext.createDataFrame(trainingDF)

In [55]:
parsedData = pDF.rdd.map(lambda line: array([line[0], line[1]])) #totalAdClicks, revenue

# Train KMeans model

In [59]:
my_kmmodel = KMeans.train(parsedData, 2, maxIterations=10, runs=10, initializationMode="random")

In [60]:
print(my_kmmodel.centers)

[array([  39.07608696,  115.26086957]), array([ 27.39467849,  23.86474501])]


## Conclusion

First number (field1) in each array refers to mean number of ad­clicks and the second number
(field2) is the mean revenue per user for users in that cluster. Compare the 1st number of each
cluster to see how differently users in each cluster behave when it comes to clicking ads.
Compare the 2nd number of each cluster to see how differently users in each cluster behave
when it comes to buying stuff.

In one cluster, in general, players click on ads much more often (~1.6 times) and spend more
money (~4 times) on in app purchases. Assuming that Eglence Inc. gets paid for showing ads
and for hosting in app purchase items, we can use this information to increase game's revenue
by increasing the prices for ads we show to the frequent­clickers, and charge higher fees for
hosting the in app purchase items shown to the higher revenue generating buyers

Note: This analysis requires you to compare the cluster centers and find any ‘significant’
differences in the corresponding feature values of the centers. The answer to this question will
depend on the features you have chosen. Some features help distinguish the clusters
remarkably while others may not tell you much. At this point, if you don’t find clear distinguishing
patterns, perhaps rerunning the clustering model with different numbers of clusters and revising
the features you picked would be a good idea.