In [1]:
# One switch per task; a task's answer is printed only when its switch is True.
TASK1, TASK2, TASK3 = False, False, False
TASK4, TASK5, TASK6 = False, True, False

In [2]:
import os

# Execute the PySpark shell start-up script of the local Spark installation
# (located through the SPARK_HOME environment variable) in this namespace.
_shell_script = os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py')
execfile(_shell_script)

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.1.1
      /_/

Using Python version 2.7.12 (default, Nov 19 2016 06:48:10)
SparkSession available as 'spark'.


## Data description (DataFrames in parquet format)

Location - `/data/sample264`

Fields: `trackId`, `userId`, `timestamp`, `artistId`

- `trackId` - `id` of the track
- `userId` - `id` of the user
- `artistId` - `id` of the artist
- `timestamp` - `timestamp` of the moment the user starts listening to a track

Location - `/data/meta`

Fields: `type`, `Name`, `Artist`, `Id`

- `Type` could be “track” or “artist”
- `Name` is the title of the track if the type == “track” and the name of the musician or group if the type == “artist”.
- `Artist` stands for the creator of the track when the type == “track” and for the name of the musician or group when the type == “artist”.
- `Id` - id of the item

In [3]:
from pyspark.sql import SparkSession

# The local master URL has the form "local[K]" with no space before the
# bracket; "local [2]" is not a valid master URL. getOrCreate() hands back an
# already running session when there is one, which is why the malformed URL
# went unnoticed after shell.py had been executed.
sparkSession = (
    SparkSession.builder
    .enableHiveSupport()
    .master("local[2]")
    .getOrCreate()
)

In [4]:
# Load both parquet datasets: the listening log first, then the track/artist metadata.
data, meta = (
    sparkSession.read.parquet(location)
    for location in ("/data/sample264", "/data/meta")
)

## Normalization can be done with the following function

In [5]:
from pyspark.sql import Window
# `col` is imported here so that norm() does not depend on a later cell.
# NOTE: importing `sum` shadows Python's builtin `sum` for the rest of the notebook.
from pyspark.sql.functions import col, row_number, sum


def norm(df, key1, key2, field, n):
    """Keep the top ``n`` rows of every ``key1`` group and normalize ``field`` inside the group.

    Rows are numbered inside each ``key1`` partition by ``field`` in
    descending order and only the first ``n`` are kept. ``field`` is then
    divided by the sum of ``field`` over the kept rows that share the same
    ``key1`` value.

    :param df: input DataFrame holding at least the columns ``key1`` and ``field``.
    :param key1: name of the column that defines the groups.
    :param key2: not used by the computation; kept so existing calls keep working.
    :param field: name of the column to rank by and to normalize.
    :param n: maximum number of rows kept per ``key1`` value.
    :return: the kept rows with two additional columns, ``"sum_" + field``
        and ``"norm_" + field``; ``cache()`` is called on the result.
    """
    # row_number() assigns distinct positions, so rows with equal `field`
    # values around the cut-off are kept or dropped in whatever order Spark
    # numbers them.
    window = Window.partitionBy(key1).orderBy(col(field).desc())

    topsDF = df.withColumn("row_number", row_number().over(window)) \
        .filter(col("row_number") <= n) \
        .drop(col("row_number"))

    # groupBy() already emits the grouping column, so only the sum is
    # aggregated; listing col(key1) again would duplicate that column.
    tmpDF = topsDF.groupBy(col(key1)).agg(sum(col(field)).alias("sum_" + field))

    normalizedDF = topsDF.join(tmpDF, key1, "inner") \
        .withColumn("norm_" + field, col(field) / col("sum_" + field)) \
        .cache()

    return normalizedDF

In [6]:
from pyspark.sql import Window
from pyspark.sql.functions import col, rank

# userTrack = data.groupBy(col("userId"), col("trackId")).count()

# userTrackNorm = norm(userTrack, "userId", "trackId", "count", 1000) \
#         .withColumn("id", col("userId")) \
#         .withColumn("id2", col("trackId")) \
#         .withColumn("norm_count", col("norm_count") * 0.5) \
#         .select(col("id"), col("id2"), col("norm_count"))     

# window = Window.orderBy(col("norm_count"))
    
# userTrackList = userTrackNorm.withColumn("position", rank().over(window))\
#     .filter(col("position") < 50)\
#     .orderBy(col("id"), col("id2"))\
#     .select(col("id"), col("id2"))\
#     .take(40)

In [7]:
# for val in userTrackList:
#     print "%s %s" % val

In [8]:
from pyspark.sql.functions import desc, asc

In [9]:
def print_result(df, task):
    """Print "id1 id2" pairs for the highest-weighted edges of ``df`` when ``task`` is truthy.

    All rows are ranked by ``norm_count`` in descending order over a single
    un-partitioned window. Rows with rank below 50 are kept, sorted by ``id1``
    and then ``id2`` in ascending order, and the first 40 are printed one per
    line. Nothing happens when ``task`` is falsy.

    :param df: DataFrame with the columns ``id1``, ``id2`` and ``norm_count``.
    :param task: switch that enables the output.
    """
    if task:
        by_weight = Window.orderBy(desc('norm_count'))

        ranked = df.withColumn('position', rank().over(by_weight))
        best = ranked.filter(col('position') < 50)
        pairs = best.orderBy(asc('id1'), asc('id2')).select(col('id1'), col('id2'))

        for edge in pairs.take(40):
            print('{r.id1} {r.id2}'.format(r=edge))


## Graph based Music Recommender. Task 1

Build the edges of the type “track-track”. To do it you will need to count the collaborative similarity between all the tracks: if a user has listened to tracks A and B together within the limited time interval (equal to 7 minutes), then you should add 1 to the weight of the edge from vertex A to vertex B. For each track choose the top 40 tracks most similar to it and normalize the weights of its edges (divide the weight of each edge by the sum of the weights of all its edges).

Sort the resulting Data Frame in descending order by the column norm_count, take top 40 rows, select only the columns “id1”, “id2”, sort them in ascending order this time first by “id1”, then by “id2” and print the columns “id1”, “id2” of the resulting dataframe.

Example:

```
54719	767867
54719	767866
50787	327676
```

---

_For all tasks use the same ipython notebook, each task should be the continuation of the previous._

In [10]:
# Width of the co-listening window: 7 minutes, expressed as 7 * 60.
TIME_DELTA = 7 * 60

In [11]:
# Two aliased views of the listening log so that it can be joined with itself.
data_1 = data.alias('d1')
data_2 = data.alias('d2')

# A (d1, d2) pair counts when both rows belong to the same user, d2 started
# strictly after d1 but at most TIME_DELTA later, and the tracks differ.
# NOTE(review): assumes `timestamp` and TIME_DELTA use the same unit - confirm.
cond = (
    (col('d1.userId') == col('d2.userId'))
    & (col('d1.timestamp') < col('d2.timestamp'))
    & (col('d1.timestamp') + TIME_DELTA >= col('d2.timestamp'))
    & (col('d1.trackId') != col('d2.trackId'))
)

# An inner join returns exactly the matching pairs. The previous left outer
# join only added unmatched d1 rows with a null id2, which dropna() then threw
# away again; matched rows cannot carry a null id because the trackId
# inequality in `cond` is never true for null operands.
djoin = (
    data_1.join(data_2, cond, 'inner')
    .select(col('d1.trackId').alias('id1'), col('d2.trackId').alias('id2'))
    .groupBy(col('id1'), col('id2'))
    .count()
)

# Keep the 40 heaviest outgoing edges per track and normalize their weights.
track_track = (
    norm(djoin, 'id1', 'id2', 'count', 40)
    .select(col('id1'), col('id2'), col('norm_count'))
)

In [12]:
# Output appears only when the TASK1 switch is on.
print_result(df=track_track, task=TASK1)

## Graph based Music Recommender. Task 2

Build the edges of the type “user-track”. Take the amount of times the track was listened to by the user as the weight of the edge from the user’s vertex to the track’s vertex. For each user take the top-1000 tracks and normalize the weights.

Sort the resulting Data Frame in descending order by the column norm_count, take top 40 rows, select only the columns “id1”, “id2”, sort them in ascending order this time first by “id1”, then by “id2” and print the columns “id1”, “id2” of the resulting dataframe.

Example:

```
54719	767867
54719	767866
50787	327676
```

In [13]:
# Number of log rows per (user, track) pair, limited to 1000 rows per user
# and normalized by norm(); columns are renamed to the id1/id2 edge layout.
user_track_counts = data.groupBy('userId', 'trackId').count()
user_track = (
    norm(user_track_counts, 'userId', 'trackId', 'count', 1000)
    .select(
        col('userId').alias('id1'),
        col('trackId').alias('id2'),
        col('norm_count'),
    )
)

In [14]:
# Output appears only when the TASK2 switch is on.
print_result(df=user_track, task=TASK2)

## Graph based Music Recommender. Task 3

Build the edges of the type “user-artist”. Take the amount of times the user has listened to the artist’s tracks as the weight of the edge from the user’s vertex to the artist’s vertex. For each user take top-100 artists and normalize weights.

Sort the resulting Data Frame in descending order by the column norm_count, take top 40 rows, select only the columns “id1”, “id2”, sort them in ascending order this time first by “id1”, then by “id2” and print the columns “id1”, “id2” of the resulting dataframe.

Example:

```
54719	767867
54719	767866
50787	327676
```

In [15]:
# Number of log rows per (user, artist) pair, limited to 100 rows per user
# and normalized by norm(); columns are renamed to the id1/id2 edge layout.
user_artist_counts = data.groupBy('userId', 'artistId').count()
user_artist = (
    norm(user_artist_counts, 'userId', 'artistId', 'count', 100)
    .select(
        col('userId').alias('id1'),
        col('artistId').alias('id2'),
        col('norm_count'),
    )
)

In [16]:
# Output appears only when the TASK3 switch is on.
print_result(df=user_artist, task=TASK3)

## Graph based Music Recommender. Task 4

Build the edges of the type “artist-track”. Take the amount of times the track has been listened to by all users as the weight of the edge from the artist’s vertex to the track’s vertex. For each artist take top-100 tracks and normalize weights.

Sort the resulting Data Frame in descending order by the column norm_count (the column with normalized weights), take top 40 rows, select only the columns “id1”, “id2”, sort them in ascending order this time first by “id1”, then by “id2” and print the columns “id1”, “id2” of the resulting dataframe.

Example:
```
54719	767867
54719	767866
50787	327676
```

In [17]:
# Number of log rows per (artist, track) pair, limited to 100 rows per artist
# and normalized by norm(); columns are renamed to the id1/id2 edge layout.
artist_track_counts = data.groupBy('artistId', 'trackId').count()
artist_track = (
    norm(artist_track_counts, 'artistId', 'trackId', 'count', 100)
    .select(
        col('artistId').alias('id1'),
        col('trackId').alias('id2'),
        col('norm_count'),
    )
)

In [18]:
# Output appears only when the TASK4 switch is on.
print_result(df=artist_track, task=TASK4)

## Graph based Music Recommender. Task 5

Construct balancing function where the edges of the type “user-track” and the edges of the type “user-artist” influence the final recommendations equally.

For the user with Id 776748 find all the tracks and artists connected to him. Sort the found items first by artist, then by name, in ascending order, leave only the columns “Artist” and “Name” and print the top 40.

Example:

```
Artist: Green Day 21 Guns
Artist: Green Day
Artist: Green Day
Artist: Green Day Kill The DJ
Artist: Iggy Pop
Artist: Iggy Pop
Artist: Iggy Pop Sunday
```

In [29]:
# Log rows of user 776748, each matched (left outer) with the metadata row
# whose Id equals the row's trackId, sorted by Artist and then Name.
user_rows = data.filter(col('userId') == 776748)
result = (
    user_rows
    .join(meta, col('trackId') == col('Id'), 'left_outer')
    .orderBy(asc('Artist'), asc('Name'))
    .select('Artist', 'Name')
)

if TASK5:
    first_rows = result.take(40)
    for row in first_rows:
        print('{r.Artist} {r.Name}'.format(r=row))

Artist: 3 Doors Down Kryptonite
Artist: 311 Beautiful disaster
Artist: Blur Girls and Boys
Artist: Clawfinger Nothing Going On
Artist: Clawfinger Nothing Going On
Artist: Disturbed The Vengeful One
Artist: Gotthard Eagle
Artist: Green Day 21 Guns
Artist: Green Day 21 Guns
Artist: Green Day Kill The DJ
Artist: Green Day Kill The DJ
Artist: Green Day Kill The DJ
Artist: Iggy Pop Sunday
Artist: Korn Here To Stay
Artist: Linkin Park In The End
Artist: Linkin Park Numb
Artist: Lordi Hard Rock Hallelujah
Artist: Nickelback She Keeps Me Up
Artist: Nomy Cocaine
Artist: Papa Roach Getting Away With Murder
Artist: Rise Against Prayer Of The Refugee
Artist: Serj Tankian Sky is Over
Artist: Slipknot Wait And Bleed
Artist: The Offspring Come Out and Play
Artist: The Offspring Come Out and Play
Artist: Thousand Foot Krutch Take It Out On Me
Artist: Three Days Grace I Hate Everything About You


## Graph based Music Recommender. Task 6

For the user with Id 776748 print top-40 recommended tracks. Build music recommendations with the algorithm described in the lesson 3 of the fourth week. Initialize coordinates of vector x_0 corresponding to the user’s vertex and vertices from step 7 with ones and all other coordinates with zeros. Do 5 iterations.

You should receive a table with 3 columns: “name”, “artist” and “rank”. Sort the resulting dataframe in descending order by “rank”, select top 40 recommended tracks, select only the columns “name”, “artist” and “rank”, leave 5 digits after the decimal point in “rank” and print the resulting dataframe.

Example:

```
Smells Like Teen Spirit Artist: Nirvana 0.09401
Whispers In The Dark Artist: Skillet 0.07914
Kisses Back Artist: Matthew Koma 0.07876
Attention Artist: Charlie Puth 0.07851
Nothing Else Matters Artist: Metallica 0.07674
```