In [1]:
import os
#execfile(os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py'))

In [2]:
from pyspark.sql import SparkSession
# Obtain a Spark session (getOrCreate) configured for local mode with Hive
# support enabled.
sparkSession = (
    SparkSession.builder
    .enableHiveSupport()
    .master("local")
    .getOrCreate()
)

In [3]:
# Parquet location of the play-history data set.
playHistoryPath = '/data/sample264'

# Load the data set into a DataFrame and print its schema for reference.
playHistoryGraph = (
    sparkSession
    .read
    .parquet(playHistoryPath)
)
playHistoryGraph.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- trackId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- timestamp: long (nullable = true)



In [4]:
# Parquet location of the metadata data set.
metaDataPath = '/data/meta'

# Load the metadata into a DataFrame and print its schema for reference.
metaDataGraph = (
    sparkSession
    .read
    .parquet(metaDataPath)
)
metaDataGraph.printSchema()

root
 |-- type: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Artist: string (nullable = true)
 |-- Id: integer (nullable = true)



In [None]:
# Register the play history under two view names so that SQL queries can
# self-join it (as h1 / h2).
# createOrReplaceTempView is used instead of createTempView so that re-running
# this cell in the same session does not fail because the views already exist.
playHistoryGraph.createOrReplaceTempView("history1")
playHistoryGraph.createOrReplaceTempView("history2")

# Task 1

In [None]:
from pyspark.sql.functions import count, col

# Self-join the play history to find pairs of different tracks played by the
# same user whose timestamps differ by at most 420 (units are not visible
# here - presumably seconds; confirm), then count rows per (track1, track2).
# NOTE(review): .alias("count") is applied to the DataFrame returned by
# count(), not to a column - confirm whether it is needed.
consecutiveTracksForUser = (
    sparkSession.sql(
        "select h1.trackId as track1, h2.trackId as track2, h1.userId as user "
        "from history1 h1, history2 h2 "
        "where h1.userId = h2.userId "
        "and h1.trackId != h2.trackId "
        "and abs(h2.timestamp - h1.timestamp) <= 420 "
    )
    .groupBy("track1", "track2")
    .count()
    .alias("count")
    .orderBy("track1", "track2")
    .cache()
)

In [None]:
#consecutiveTracksForUser.show()

In [None]:
from pyspark.sql import Window
from pyspark.sql.functions import col, row_number, sum

# Rank the neighbours of each track1 by pair count, highest first.
# track2 is a secondary sort key so that ties in count are broken
# deterministically; ordering by count alone lets row_number() pick an
# arbitrary winner among tied rows, so the top-40 cut could differ between runs.
window = Window.partitionBy("track1").orderBy(col("count").desc(), col("track2"))

# Keep at most 40 neighbours per track1 and discard the helper rank column.
topsDF = (
    consecutiveTracksForUser
    .withColumn("row_number", row_number().over(window))
    .filter(col("row_number") <= 40)
    .drop("row_number")
    .orderBy(col("track1"), col("track2"))
    .cache()
)

In [None]:
#topsDF.show()

In [None]:

# Per-track1 total of the retained pair counts, used as the normalisation
# denominator. `sum` is the pyspark.sql.functions aggregate imported in an
# earlier cell, not the Python builtin.
sumsDF = (
    topsDF
    .groupBy("track1")
    .agg(sum("count").alias("sum_weights"))
    .orderBy("track1")
    .cache()
)

In [None]:
#sumsDF.show()

In [None]:
# Attach each track1's total to its rows and divide to obtain the
# normalised weight of every (track1, track2) pair.
normalized_count = (
    topsDF
    .join(sumsDF, on="track1", how="inner")
    .withColumn("weight", col("count") / col("sum_weights"))
    .cache()
)


In [None]:
#normalized_count.show()

In [None]:
# The 40 pairs with the largest weight; track1 and track2 are secondary sort
# keys after the weight.
results = (
    normalized_count
    .orderBy(col("weight").desc(), col("track1"), col("track2"))
    .limit(40)
)
#results.show()

In [None]:
# Reduce the result to the two track columns and print one tab-separated
# pair per line.
results = results.select("track1", "track2")
for pair in results.collect():
    print("{}\t{}".format(pair[0], pair[1]))

# Task 2

In [None]:
from pyspark.sql.functions import count, col

# Count play-history rows per (user, track), sorted by that count (descending)
# and then by user and track.
# NOTE(review): .alias("count") is applied to the DataFrame returned by
# count(), not to a column - confirm whether it is needed.
tracksPerUser = (
    sparkSession.sql(
        "select userId as user, trackId as track "
        "from history1 "
    )
    .groupBy("user", "track")
    .count()
    .alias("count")
    .orderBy(col("count").desc(), col("user"), col("track"))
    .cache()
)

In [None]:
# Preview the per-(user, track) counts; the frame was sorted by count descending.
tracksPerUser.show()

In [None]:
from pyspark.sql import Window
from pyspark.sql.functions import col, row_number, sum

# Rank each user's tracks by count, highest first.
# track is a secondary sort key so that ties in count are broken
# deterministically; ordering by count alone lets row_number() pick an
# arbitrary winner among tied rows, so the top-1000 cut could differ between runs.
window = Window.partitionBy("user").orderBy(col("count").desc(), col("track"))

# Keep at most 1000 tracks per user and discard the helper rank column.
topsTracksPerUser = (
    tracksPerUser
    .withColumn("row_number", row_number().over(window))
    .filter(col("row_number") <= 1000)
    .drop("row_number")
    .cache()
)

In [None]:
# Per-user total of the retained track counts, used as the normalisation
# denominator. `sum` is the pyspark.sql.functions aggregate imported in an
# earlier cell, not the Python builtin.
sumsTopsTracksPerUser = (
    topsTracksPerUser
    .groupBy("user")
    .agg(sum("count").alias("sum_weights"))
    .orderBy("user")
    .cache()
)

In [None]:
# Attach each user's total to their rows and divide to obtain the
# normalised weight of every (user, track) pair.
normalized_topsTracksPerUser = (
    topsTracksPerUser
    .join(sumsTopsTracksPerUser, on="user", how="inner")
    .withColumn("norm_weight", col("count") / col("sum_weights"))
    .cache()
)

In [None]:
# The 40 (user, track) pairs with the largest normalised weight; user and
# track are secondary sort keys after the weight.
results = (
    normalized_topsTracksPerUser
    .orderBy(col("norm_weight").desc(), col("user"), col("track"))
    .limit(40)
    .select("user", "track")
)

In [None]:
# Print one space-separated "user track" pair per line.
for pair in results.collect():
    print("{} {}".format(pair[0], pair[1]))

# Task 3

In [None]:
from pyspark.sql.functions import count, col

# Count play-history rows per (user, artist), sorted by that count
# (descending) and then by user and artist.
# NOTE(review): .alias("count") is applied to the DataFrame returned by
# count(), not to a column - confirm whether it is needed.
artistsPerUser = (
    sparkSession.sql(
        "select userId as user, artistId as artist "
        "from history1 "
    )
    .groupBy("user", "artist")
    .count()
    .alias("count")
    .orderBy(col("count").desc(), col("user"), col("artist"))
    .cache()
)

In [None]:
# Preview the per-(user, artist) counts; the frame was sorted by count descending.
artistsPerUser.show()

In [None]:
from pyspark.sql import Window
from pyspark.sql.functions import col, row_number, sum

# Rank each user's artists by count, highest first.
# artist is a secondary sort key so that ties in count are broken
# deterministically; ordering by count alone lets row_number() pick an
# arbitrary winner among tied rows, so the top-100 cut could differ between runs.
window = Window.partitionBy("user").orderBy(col("count").desc(), col("artist"))

# Keep at most 100 artists per user and discard the helper rank column.
topsArtistsPerUser = (
    artistsPerUser
    .withColumn("row_number", row_number().over(window))
    .filter(col("row_number") <= 100)
    .drop("row_number")
    .cache()
)

In [None]:
# Per-user total of the retained artist counts, used as the normalisation
# denominator. `sum` is the pyspark.sql.functions aggregate imported in an
# earlier cell, not the Python builtin.
sumsTopsArtistsPerUser = (
    topsArtistsPerUser
    .groupBy("user")
    .agg(sum("count").alias("sum_weights"))
    .orderBy("user")
    .cache()
)

In [None]:
# Attach each user's total to their rows and divide to obtain the
# normalised weight of every (user, artist) pair.
normalized_topsArtistsPerUser = (
    topsArtistsPerUser
    .join(sumsTopsArtistsPerUser, on="user", how="inner")
    .withColumn("norm_weight", col("count") / col("sum_weights"))
    .cache()
)

In [None]:
# The 40 (user, artist) pairs with the largest normalised weight; user and
# artist are secondary sort keys after the weight.
results = (
    normalized_topsArtistsPerUser
    .orderBy(col("norm_weight").desc(), col("user"), col("artist"))
    .limit(40)
    .select("user", "artist")
)

In [None]:
# Print one space-separated "user artist" pair per line.
for pair in results.collect():
    print("{} {}".format(pair[0], pair[1]))

# Task 4

In [None]:
from pyspark.sql.functions import count, col

# Count play-history rows per (artist, track), sorted by that count
# (descending) and then by artist and track.
# NOTE(review): .alias("count") is applied to the DataFrame returned by
# count(), not to a column - confirm whether it is needed.
tracksPerArtist = (
    sparkSession.sql(
        "select artistId as artist, trackId as track "
        "from history1 "
    )
    .groupBy("artist", "track")
    .count()
    .alias("count")
    .orderBy(col("count").desc(), col("artist"), col("track"))
    .cache()
)

In [None]:
# Preview the per-(artist, track) counts; the frame was sorted by count descending.
tracksPerArtist.show()

In [None]:
from pyspark.sql import Window
from pyspark.sql.functions import col, row_number, sum

# Rank each artist's tracks by count, highest first.
# track is a secondary sort key so that ties in count are broken
# deterministically; ordering by count alone lets row_number() pick an
# arbitrary winner among tied rows, so the top-100 cut could differ between runs.
window = Window.partitionBy("artist").orderBy(col("count").desc(), col("track"))

# Keep at most 100 tracks per artist and discard the helper rank column.
topsTracksPerArtist = (
    tracksPerArtist
    .withColumn("row_number", row_number().over(window))
    .filter(col("row_number") <= 100)
    .drop("row_number")
    .cache()
)

In [None]:
# Per-artist total of the retained track counts, used as the normalisation
# denominator. `sum` is the pyspark.sql.functions aggregate imported in an
# earlier cell, not the Python builtin.
sumsTracksPerArtist = (
    topsTracksPerArtist
    .groupBy("artist")
    .agg(sum("count").alias("sum_weights"))
    .orderBy("artist")
    .cache()
)

In [None]:
# Attach each artist's total to its rows and divide to obtain the
# normalised weight of every (artist, track) pair.
normalized_topsTracksPerArtist = (
    topsTracksPerArtist
    .join(sumsTracksPerArtist, on="artist", how="inner")
    .withColumn("norm_weight", col("count") / col("sum_weights"))
    .cache()
)

In [None]:
# The 40 (artist, track) pairs with the largest normalised weight; artist and
# track are secondary sort keys after the weight.
results = (
    normalized_topsTracksPerArtist
    .orderBy(col("norm_weight").desc(), col("artist"), col("track"))
    .limit(40)
    .select("artist", "track")
)

In [None]:
# Print one space-separated "artist track" pair per line.
for pair in results.collect():
    print("{} {}".format(pair[0], pair[1]))

# Task 5

In [5]:
from pyspark.sql.functions import col

# Play-history rows of user 776748; cached because both selections below
# read from it.
user = playHistoryGraph.filter("userId == 776748").cache()

# Distinct track ids and distinct artist ids of that user, both exposed
# under the common column name "Id" so they can be unioned.
tracks = (
    user
    .select(col("trackId").alias("Id"))
    .distinct()
)
artists = (
    user
    .select(col("artistId").alias("Id"))
    .distinct()
)

# Look up the metadata of every collected id and keep the first 40
# (Artist, Name) rows in Artist/Name order.
# NOTE(review): the join matches on Id only and ignores the metadata `type`
# column - if track ids and artist ids can overlap this may pull in
# unrelated rows; confirm.
result = (
    tracks.union(artists)
    .join(metaDataGraph, on="Id")
    .orderBy("Artist", "Name")
    .select("Artist", "Name")
    .limit(40)
    .cache()
)

In [6]:
# Print one space-separated "Artist Name" pair per line.
for entry in result.collect():
    print("{} {}".format(entry[0], entry[1]))

Artist: 3 Doors Down Artist: 3 Doors Down
Artist: 3 Doors Down Kryptonite
Artist: 311 Artist: 311
Artist: 311 Beautiful disaster
Artist: Blur Artist: Blur
Artist: Blur Girls and Boys
Artist: Clawfinger Artist: Clawfinger
Artist: Clawfinger Nothing Going On
Artist: Disturbed Artist: Disturbed
Artist: Disturbed The Vengeful One
Artist: Gotthard Artist: Gotthard
Artist: Gotthard Eagle
Artist: Green Day 21 Guns
Artist: Green Day Artist: Green Day
Artist: Green Day Kill The DJ
Artist: Iggy Pop Artist: Iggy Pop
Artist: Iggy Pop Sunday
Artist: Korn Artist: Korn
Artist: Korn Here To Stay
Artist: Linkin Park Artist: Linkin Park
Artist: Linkin Park In The End
Artist: Linkin Park Numb
Artist: Lordi Artist: Lordi
Artist: Lordi Hard Rock Hallelujah
Artist: Nickelback Artist: Nickelback
Artist: Nickelback She Keeps Me Up
Artist: Nomy Artist: Nomy
Artist: Nomy Cocaine
Artist: Papa Roach Artist: Papa Roach
Artist: Papa Roach Getting Away With Murder
Artist: Rise Against Artist: Rise Against
Artist: Ri