In [1]:
import os
#execfile(os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py'))

In [2]:
from pyspark.sql import SparkSession

# Hive-enabled session running against the "local" master; getOrCreate()
# is what actually materialises the session from the builder settings.
sparkSession = (
    SparkSession.builder
    .enableHiveSupport()
    .master("local")
    .getOrCreate()
)

In [3]:
# Parquet dataset read into playHistoryGraph (the play-history table used by
# the rest of the notebook).
playHistoryPath = '/data/sample264'
playHistoryGraph = sparkSession.read.format("parquet").load(playHistoryPath)

# Print the column names/types that the later cells refer to.
playHistoryGraph.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- trackId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- timestamp: long (nullable = true)



In [4]:
# Parquet dataset read into metaDataGraph (the metadata table).
metaDataPath = '/data/meta'
metaDataGraph = sparkSession.read.format("parquet").load(metaDataPath)

# Print the column names/types of the metadata table.
metaDataGraph.printSchema()

root
 |-- type: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Artist: string (nullable = true)
 |-- Id: integer (nullable = true)



In [5]:
# Register the play history under two SQL view names so that a query can join
# the table with itself (once as history1, once as history2).
#
# createOrReplaceTempView is used rather than createTempView so that this cell
# can be executed more than once in the same session: createTempView raises
# when a view with the given name is already registered.
playHistoryGraph.createOrReplaceTempView("history1")
playHistoryGraph.createOrReplaceTempView("history2")

In [6]:
from pyspark.sql.functions import count, col

# Largest allowed difference between the timestamps of two plays by the same
# user for the two tracks to be counted as a pair.
# NOTE(review): presumably seconds (420 s = 7 minutes) — confirm the unit of
# the `timestamp` column.
MAX_PLAY_GAP = 420

# Self-join of the play history (views history1 / history2): one row per pair
# of plays by the same user, of two different tracks, whose timestamps differ
# by at most MAX_PLAY_GAP.  Because the condition uses abs(), every qualifying
# pair appears in both directions, (a, b) and (b, a).
trackPairsQuery = (
    "select h1.trackId as track1, h2.trackId as track2, h1.userId as user "
    "from history1 h1, history2 h2 "
    "where h1.userId = h2.userId "
    "and h1.trackId != h2.trackId "
    "and abs(h2.timestamp - h1.timestamp) <= {max_gap} "
).format(max_gap=MAX_PLAY_GAP)

# Count how often each (track1, track2) pair occurs.  groupBy().count() already
# names its result column "count"; the previous `.alias("count")` call only
# attached an alias to the DataFrame itself (not to a column), so it was removed.
consecutiveTracksForUser = (
    sparkSession.sql(trackPairsQuery)
    .groupBy("track1", "track2")
    .count()
    .orderBy(col("track1"), col("track2"))
    .cache()
)

In [23]:
#consecutiveTracksForUser.show()

+------+------+-----+
|track1|track2|count|
+------+------+-----+
|798256|923706|    1|
|798258|808254|    1|
|798258|810685|    1|
|798261|911939|    3|
|798261|916840|    1|
|798261|943188|    1|
|798290|880442|    1|
|798290|906999|    1|
|798302|836228|    1|
|798302|893311|    1|
|798311|864601|    1|
|798311|903496|    1|
|798319|837992|    1|
|798322|876562|    1|
|798331|827364|    1|
|798335|840741|    1|
|798372|815418|    1|
|798372|820446|    1|
|798372|823661|    1|
|798372|907153|    1|
+------+------+-----+
only showing top 20 rows



In [29]:
from pyspark.sql import Window
# NOTE: `sum` imported here is pyspark's column aggregate; from this cell on it
# shadows Python's builtin sum() in the notebook namespace.
from pyspark.sql.functions import col, row_number, sum

# Rank every track1's neighbours by pair count, highest first.  track2 is a
# secondary sort key so that rows with equal counts always receive the same
# rank: ordering by count alone lets row_number() break ties arbitrarily,
# which makes the cut-off below non-reproducible between runs.
window = Window.partitionBy("track1").orderBy(col("count").desc(), col("track2"))

# Keep at most the 40 best-ranked neighbours of each track1, then drop the
# helper ranking column again.
topsDF = (
    consecutiveTracksForUser
    .withColumn("row_number", row_number().over(window))
    .filter(col("row_number") <= 40)
    .drop("row_number")
    .orderBy(col("track1"), col("track2"))
    .cache()
)

In [30]:
#topsDF.show()

+------+------+-----+
|track1|track2|count|
+------+------+-----+
|798256|923706|    1|
|798258|808254|    1|
|798258|810685|    1|
|798261|911939|    3|
|798261|916840|    1|
|798261|943188|    1|
|798290|880442|    1|
|798290|906999|    1|
|798302|836228|    1|
|798302|893311|    1|
|798311|864601|    1|
|798311|903496|    1|
|798319|837992|    1|
|798322|876562|    1|
|798331|827364|    1|
|798335|840741|    1|
|798372|815418|    1|
|798372|820446|    1|
|798372|823661|    1|
|798372|907153|    1|
+------+------+-----+
only showing top 20 rows



In [36]:

# Per track1: total of the pair counts over the neighbours kept in topsDF.
# (`sum` is the pyspark aggregate imported in an earlier cell.)
sumsDF = (
    topsDF
    .groupBy("track1")
    .agg(sum("count").alias("sum_weights"))
    .orderBy(col("track1"))
    .cache()
)

In [37]:
#sumsDF.show()

+------+-----------+
|track1|sum_weights|
+------+-----------+
|798256|          1|
|798258|          2|
|798261|          5|
|798290|          2|
|798302|          2|
|798311|          2|
|798319|          1|
|798322|          1|
|798331|          1|
|798335|          1|
|798372|          5|
|798374|          1|
|798375|          2|
|798376|          2|
|798377|          5|
|798379|          1|
|798380|          1|
|798396|          2|
|798398|          1|
|798403|          2|
+------+-----------+
only showing top 20 rows



In [38]:
# Attach each track1's total to its rows and turn the raw pair count into a
# share of that total ("weight").
pairShare = col("count") / col("sum_weights")
normalized_count = (
    topsDF
    .join(sumsDF, on="track1", how="inner")
    .withColumn("weight", pairShare)
    .cache()
)


In [39]:
#normalized_count.show()

+------+------+-----+-----------+------+
|track1|track2|count|sum_weights|weight|
+------+------+-----+-----------+------+
|798256|923706|    1|          1|   1.0|
|798258|808254|    1|          2|   0.5|
|798258|810685|    1|          2|   0.5|
|798261|911939|    3|          5|   0.6|
|798261|916840|    1|          5|   0.2|
|798261|943188|    1|          5|   0.2|
|798290|880442|    1|          2|   0.5|
|798290|906999|    1|          2|   0.5|
|798302|836228|    1|          2|   0.5|
|798302|893311|    1|          2|   0.5|
|798311|864601|    1|          2|   0.5|
|798311|903496|    1|          2|   0.5|
|798319|837992|    1|          1|   1.0|
|798322|876562|    1|          1|   1.0|
|798331|827364|    1|          1|   1.0|
|798335|840741|    1|          1|   1.0|
|798372|815418|    1|          5|   0.2|
|798372|820446|    1|          5|   0.2|
|798372|823661|    1|          5|   0.2|
|798372|907153|    1|          5|   0.2|
+------+------+-----+-----------+------+
only showing top

In [40]:
# The 40 rows with the largest weight; equal weights are ordered by track1,
# then track2.
results = (
    normalized_count
    .orderBy(col("weight").desc(), col("track1"), col("track2"))
    .limit(40)
)
#results.show()

+------+------+-----+-----------+------+
|track1|track2|count|sum_weights|weight|
+------+------+-----+-----------+------+
|798256|923706|    1|          1|   1.0|
|798319|837992|    1|          1|   1.0|
|798322|876562|    1|          1|   1.0|
|798331|827364|    1|          1|   1.0|
|798335|840741|    1|          1|   1.0|
|798374|816874|    1|          1|   1.0|
|798375|810685|    2|          2|   1.0|
|798379|812055|    1|          1|   1.0|
|798380|840113|    1|          1|   1.0|
|798396|817687|    2|          2|   1.0|
|798398|926302|    1|          1|   1.0|
|798405|867217|    1|          1|   1.0|
|798443|905923|    1|          1|   1.0|
|798457|918918|    1|          1|   1.0|
|798460|891840|    1|          1|   1.0|
|798461|940379|    1|          1|   1.0|
|798470|840814|    1|          1|   1.0|
|798474|963162|    1|          1|   1.0|
|798477|883244|    1|          1|   1.0|
|798485|955521|    1|          1|   1.0|
+------+------+-----+-----------+------+
only showing top

In [42]:
# Reduce the result to the two track ids and print one tab-separated pair per line.
results = results.select("track1", "track2")
for pair in results.collect():
    print("{}\t{}".format(pair[0], pair[1]))

798256	923706
798319	837992
798322	876562
798331	827364
798335	840741
798374	816874
798375	810685
798379	812055
798380	840113
798396	817687
798398	926302
798405	867217
798443	905923
798457	918918
798460	891840
798461	940379
798470	840814
798474	963162
798477	883244
798485	955521
798505	905671
798545	949238
798550	936295
798626	845438
798691	818279
798692	898823
798702	811440
798704	937570
798725	933147
798738	894170
798745	799665
798782	956938
798801	950802
798820	890393
798833	916319
798865	962662
798931	893574
798946	946408
799012	809997
799024	935246


Task 2

In [8]:
from pyspark.sql.functions import count, col

# Number of rows in the play history per (user, track) pair, largest count
# first; equal counts are ordered by user, then track.
#
# groupBy().count() already names its result column "count"; the previous
# `.alias("count")` call only attached an alias to the DataFrame itself (not to
# a column), so it was removed.
tracksPerUser = (
    sparkSession.sql(
        "select userId as user, trackId as track "
        "from history1 "
    )
    .groupBy("user", "track")
    .count()
    .orderBy(col("count").desc(), col("user"), col("track"))
    .cache()
)

In [9]:
# Preview the first rows of the per-user play counts (20 is show()'s default).
tracksPerUser.show(n=20)

+------+------+-----+
|  user| track|count|
+------+------+-----+
|668849|817132|  277|
|560428|950984|   94|
|767478|870292|   94|
|278647|940362|   87|
|770607|830615|   76|
| 20167|857441|   75|
|343313|967768|   69|
|408783|830615|   67|
|714890|915690|   67|
|525436|954967|   66|
|590037|885769|   62|
|607295|878795|   62|
|511305|858940|   60|
|755028|870923|   60|
|590037|825997|   54|
|277727|830874|   52|
|226263|841697|   51|
|381214|862698|   48|
|478110|824455|   48|
|273636|967639|   47|
+------+------+-----+
only showing top 20 rows



In [11]:
from pyspark.sql import Window
# NOTE: `sum` imported here is pyspark's column aggregate; it shadows Python's
# builtin sum() in the notebook namespace.
from pyspark.sql.functions import col, row_number, sum

# Rank every user's tracks by play count, highest first.  track is a secondary
# sort key so that rows with equal counts always receive the same rank:
# ordering by count alone lets row_number() break ties arbitrarily, which makes
# the cut-off below non-reproducible between runs.
window = Window.partitionBy("user").orderBy(col("count").desc(), col("track"))

# Keep at most the 1000 best-ranked tracks of each user, then drop the helper
# ranking column again.
topsTracksPerUser = (
    tracksPerUser
    .withColumn("row_number", row_number().over(window))
    .filter(col("row_number") <= 1000)
    .drop("row_number")
    .cache()
)

In [12]:
# Per user: total play count over the tracks kept in topsTracksPerUser.
# (`sum` is the pyspark aggregate imported in an earlier cell.)
sumsTopsTracksPerUser = (
    topsTracksPerUser
    .groupBy("user")
    .agg(sum("count").alias("sum_weights"))
    .orderBy(col("user"))
    .cache()
)

In [14]:
# Attach each user's total to their rows and express every track's play count
# as a share of that total ("norm_weight").
playShare = col("count") / col("sum_weights")
normalized_topsTracksPerUser = (
    topsTracksPerUser
    .join(sumsTopsTracksPerUser, on="user", how="inner")
    .withColumn("norm_weight", playShare)
    .cache()
)

In [17]:
# The 40 (user, track) rows with the largest norm_weight; equal weights are
# ordered by user, then track.  Only the two id columns are kept.
rankedUserTracks = normalized_topsTracksPerUser.orderBy(
    col("norm_weight").desc(), col("user"), col("track")
)
results = rankedUserTracks.limit(40).select("user", "track")

In [18]:
# Print every selected (user, track) pair on its own line, space-separated.
for row in results.collect():
    print("{} {}".format(row[0], row[1]))

66 965774
116 867268
128 852564
131 880170
195 946408
215 860111
235 897176
300 857973
321 915545
328 943482
333 818202
346 864911
356 961308
428 943572
431 902497
445 831381
488 841340
542 815388
617 946395
649 901672
658 937522
662 881433
698 935934
708 952432
746 879259
747 879259
776 946408
784 806468
806 866581
811 948017
837 799685
901 871513
923 879322
934 940714
957 945183
989 878364
999 967768
1006 962774
1049 849484
1057 920458
