In [1]:
import os
#execfile(os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py'))

In [2]:
from pyspark.sql import SparkSession
# Build (or reuse, if one already exists) a Hive-enabled session with master "local".
sparkSession = (
    SparkSession.builder
    .enableHiveSupport()
    .master("local")
    .getOrCreate()
)

In [3]:
# Play-history data set, stored as Parquet; the schema is printed so the column
# names used by the later cells (userId, trackId, artistId, timestamp) are visible.
playHistoryPath = '/data/sample264'
playHistoryGraph = sparkSession.read.format("parquet").load(playHistoryPath)
playHistoryGraph.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- trackId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- timestamp: long (nullable = true)



In [4]:
# Metadata data set, stored as Parquet; the schema is printed for reference.
metaDataPath = '/data/meta'
metaDataGraph = sparkSession.read.format("parquet").load(metaDataPath)
metaDataGraph.printSchema()

root
 |-- type: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Artist: string (nullable = true)
 |-- Id: integer (nullable = true)



In [5]:
# Expose the play history to Spark SQL under two names so a query can self-join it
# (`from history1 h1, history2 h2`).
# createOrReplaceTempView keeps this cell re-runnable: plain createTempView raises
# TempTableAlreadyExistsException when the view name is already registered in the
# session, which happens as soon as the cell is executed a second time.
playHistoryGraph.createOrReplaceTempView("history1")
playHistoryGraph.createOrReplaceTempView("history2")

# Task 1

In [None]:
from pyspark.sql.functions import count, col

# Self-join the play history on userId and keep pairs of plays of two different
# tracks whose timestamps differ by at most 420; then count the joined rows per
# (track1, track2) pair. The result is cached because several later cells reuse it.
consecutiveTracksForUser = (
    sparkSession.sql(
        "select h1.trackId as track1, h2.trackId as track2, h1.userId as user "
        "from history1 h1, history2 h2 "
        "where h1.userId = h2.userId "
        "and h1.trackId != h2.trackId "
        "and abs(h2.timestamp - h1.timestamp) <= 420 "
    )
    .groupBy("track1", "track2")
    .count()
    # DataFrame-level alias only; the counter column is already named "count".
    .alias("count")
    .orderBy("track1", "track2")
    .cache()
)

In [None]:
#consecutiveTracksForUser.show()

In [None]:
from pyspark.sql import Window
from pyspark.sql.functions import col, row_number, sum

# Keep, for every track1, the 40 track2 partners with the highest pair count.
#
# row_number() is only reproducible when the window ordering is total. Ordering by
# "count" alone leaves ties unresolved, so which of several equally frequent
# partners made the cut at rank 40 could change between runs. (track1, track2) is
# unique after the upstream groupBy, so adding track2 as a secondary key makes the
# ranking deterministic.
window = Window.partitionBy("track1").orderBy(col("count").desc(), col("track2").asc())

topsDF = (
    consecutiveTracksForUser
    .withColumn("row_number", row_number().over(window))
    .filter(col("row_number") <= 40)
    .drop("row_number")  # helper rank column is not needed downstream
    .orderBy(col("track1"), col("track2"))
    .cache()
)

In [None]:
#topsDF.show()

In [None]:

# Total of the retained pair counts for each track1.
# NOTE: `sum` here is pyspark.sql.functions.sum (imported in an earlier cell),
# not the Python builtin.
sumsDF = (
    topsDF
    .groupBy("track1")
    .agg(sum(col("count")).alias("sum_weights"))
    .orderBy("track1")
    .cache()
)

In [None]:
#sumsDF.show()

In [None]:
# Attach each track1's total to its rows and turn the raw pair count into a
# fraction of that total ("weight").
normalized_count = (
    topsDF
    .join(sumsDF, on="track1", how="inner")
    .withColumn("weight", col("count") / col("sum_weights"))
    .cache()
)


In [None]:
#normalized_count.show()

In [None]:
# The 40 rows with the largest weight; equal weights are ordered by ascending
# track1, then track2.
results = (
    normalized_count
    .sort(col("weight").desc(), col("track1"), col("track2"))
    .limit(40)
)
#results.show()

In [None]:
# Reduce the ranking to the two track ids and print them tab-separated, one pair per line.
results = results.select("track1", "track2")
for firstTrack, secondTrack in results.collect():
    print("{}\t{}".format(firstTrack, secondTrack))

# Task 2

In [None]:
from pyspark.sql.functions import count, col

# Number of play-history rows per (user, track), highest counts first;
# equal counts are ordered by ascending user, then track.
tracksPerUser = (
    sparkSession.sql(
        "select userId as user, trackId as track "
        "from history1 "
    )
    .groupBy("user", "track")
    .count()
    # DataFrame-level alias only; the counter column is already named "count".
    .alias("count")
    .orderBy(col("count").desc(), col("user"), col("track"))
    .cache()
)

In [None]:
# Preview the first 20 rows (the default row count of show()).
tracksPerUser.show(20)

In [None]:
from pyspark.sql import Window
from pyspark.sql.functions import col, row_number, sum

# Keep, for every user, the 1000 tracks with the highest play count.
#
# row_number() is only reproducible when the window ordering is total. Ordering by
# "count" alone leaves ties unresolved, so which equally played tracks made the cut
# at rank 1000 could change between runs. (user, track) is unique after the upstream
# groupBy, so adding track as a secondary key makes the ranking deterministic.
window = Window.partitionBy("user").orderBy(col("count").desc(), col("track").asc())

topsTracksPerUser = (
    tracksPerUser
    .withColumn("row_number", row_number().over(window))
    .filter(col("row_number") <= 1000)
    .drop("row_number")  # helper rank column is not needed downstream
    .cache()
)

In [None]:
# Total of the retained track counts for each user.
# NOTE: `sum` here is pyspark.sql.functions.sum (imported in an earlier cell),
# not the Python builtin.
sumsTopsTracksPerUser = (
    topsTracksPerUser
    .groupBy("user")
    .agg(sum(col("count")).alias("sum_weights"))
    .orderBy("user")
    .cache()
)

In [None]:
# Attach each user's total to their rows and express every track count as a
# fraction of that total ("norm_weight").
normalized_topsTracksPerUser = (
    topsTracksPerUser
    .join(sumsTopsTracksPerUser, on="user", how="inner")
    .withColumn("norm_weight", col("count") / col("sum_weights"))
    .cache()
)

In [None]:
# The 40 (user, track) rows with the largest norm_weight; equal weights are
# ordered by ascending user, then track. Only the two ids are kept.
results = (
    normalized_topsTracksPerUser
    .sort(col("norm_weight").desc(), col("user"), col("track"))
    .limit(40)
    .select("user", "track")
)

In [None]:
# Print one "user track" pair per line, space-separated.
for userId, trackId in results.collect():
    print("{} {}".format(userId, trackId))

# Task 3

In [6]:
from pyspark.sql.functions import count, col

# Number of play-history rows per (user, artist), highest counts first;
# equal counts are ordered by ascending user, then artist.
artistsPerUser = (
    sparkSession.sql(
        "select userId as user, artistId as artist "
        "from history1 "
    )
    .groupBy("user", "artist")
    .count()
    # DataFrame-level alias only; the counter column is already named "count".
    .alias("count")
    .orderBy(col("count").desc(), col("user"), col("artist"))
    .cache()
)

In [7]:
# Preview the first 20 rows (the default row count of show()).
artistsPerUser.show(20)

+------+-------+-----+
|  user| artist|count|
+------+-------+-----+
|668849| 994686|  277|
|436158|1003021|  142|
|442306|1001300|  107|
|560428| 975695|   94|
|767478| 991179|   94|
|278647| 981306|   87|
|770607| 978956|   76|
| 20167| 978800|   75|
| 26976|1003021|   75|
|637446| 978288|   71|
|397054| 974777|   70|
|407962| 968823|   70|
|201788| 970240|   69|
|343313| 968823|   69|
|510688| 976986|   69|
|408783| 978956|   67|
|714890| 985755|   67|
|525436| 976384|   66|
|607295| 993789|   63|
|300275| 989189|   62|
+------+-------+-----+
only showing top 20 rows



In [8]:
from pyspark.sql import Window
from pyspark.sql.functions import col, row_number, sum

# Keep, for every user, the 100 artists with the highest play count.
#
# row_number() is only reproducible when the window ordering is total. Ordering by
# "count" alone leaves ties unresolved, so which equally played artists made the cut
# at rank 100 could change between runs. (user, artist) is unique after the upstream
# groupBy, so adding artist as a secondary key makes the ranking deterministic.
window = Window.partitionBy("user").orderBy(col("count").desc(), col("artist").asc())

topsArtistsPerUser = (
    artistsPerUser
    .withColumn("row_number", row_number().over(window))
    .filter(col("row_number") <= 100)
    .drop("row_number")  # helper rank column is not needed downstream
    .cache()
)

In [9]:
# Total of the retained artist counts for each user.
# NOTE: `sum` here is pyspark.sql.functions.sum (imported in an earlier cell),
# not the Python builtin.
sumsTopsArtistsPerUser = (
    topsArtistsPerUser
    .groupBy("user")
    .agg(sum(col("count")).alias("sum_weights"))
    .orderBy("user")
    .cache()
)

In [10]:
# Attach each user's total to their rows and express every artist count as a
# fraction of that total ("norm_weight").
normalized_topsArtistsPerUser = (
    topsArtistsPerUser
    .join(sumsTopsArtistsPerUser, on="user", how="inner")
    .withColumn("norm_weight", col("count") / col("sum_weights"))
    .cache()
)

In [11]:
# The 40 (user, artist) rows with the largest norm_weight; equal weights are
# ordered by ascending user, then artist. Only the two ids are kept.
results = (
    normalized_topsArtistsPerUser
    .sort(col("norm_weight").desc(), col("user"), col("artist"))
    .limit(40)
    .select("user", "artist")
)

In [12]:
# Print one "user artist" pair per line, space-separated.
for userId, artistId in results.collect():
    print("{} {}".format(userId, artistId))

66 993426
116 974937
128 1003021
131 983068
195 997265
215 991696
235 990642
288 1000564
300 1003362
321 986172
328 967986
333 1000416
346 982037
356 974846
374 1003167
428 993161
431 969340
445 970387
488 970525
542 969751
612 987351
617 970240
649 973851
658 973232
662 975279
698 995788
708 968848
746 972032
747 972032
776 997265
784 969853
806 995126
811 996436
837 989262
901 988199
923 977066
934 990860
957 991171
989 975339
999 968823
