In [1]:
import os
# Bootstrap PySpark by running its interactive-shell startup script in this
# notebook's namespace (the banner it prints reports a SparkSession named `spark`).
# NOTE: `execfile` exists only in Python 2.
execfile(os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.1.1
      /_/

Using Python version 2.7.12 (default, Nov 19 2016 06:48:10)
SparkSession available as 'spark'.


## Data description (DataFrames in parquet format)

Location - `/data/sample264`

Fields: `trackId`, `userId`, `timestamp`, `artistId`

- `trackId` - `id` of the track
- `userId` - `id` of the user
- `artistId` - `id` of the artist
- `timestamp` - `timestamp` of the moment the user starts listening to a track

Location - `/data/meta`

Fields: `type`, `Name`, `Artist`, `Id`

- `Type` could be “track” or “artist”
- `Name` is the title of the track if the type == “track” and the name of the musician or group if the type == “artist”.
- `Artist` stands for the creator of the track if the type == “track” and for the name of the musician or group if the type == “artist”.
- `Id` - id of the item

In [2]:
from pyspark.sql import SparkSession
# Reuse the already-running SparkSession if there is one, otherwise start a
# local session with two worker threads. The master URL must be written
# without whitespace ("local[2]"); "local [2]" is not a parseable master URL
# and would fail as soon as a fresh session actually had to be created.
sparkSession = SparkSession.builder.enableHiveSupport().master("local[2]").getOrCreate()

In [3]:
# Listening log: trackId, userId, timestamp, artistId (see the data description).
data = sparkSession.read.format("parquet").load("/data/sample264")
# Track / artist metadata.
meta = sparkSession.read.format("parquet").load("/data/meta")

## Normalization can be done with the following function

In [4]:
from pyspark.sql import Window
from pyspark.sql.functions import row_number, sum, desc, asc, udf
from pyspark.sql import types


def norm_pair(value, summ):
    """Return ``value`` as a fraction of ``summ`` (a float).

    The inputs are edge weights (counts) and their per-key total. The
    numerator is converted to ``float`` first because this notebook runs on
    Python 2, where ``int / int`` is floor division and would turn every
    fraction below 1 into 0.

    :param value: weight of a single edge.
    :param summ: sum of the weights of all edges sharing the same key.
    :return: ``value / summ`` as a float; ``0.0`` when ``summ`` is zero
        (nothing to distribute), so the function never divides by zero.
    """
    if not summ:
        return 0.0
    return float(value) / summ
# Expose norm_pair as a Spark SQL UDF whose declared return type is FloatType.
udf_norm_pair = udf(norm_pair, types.FloatType())


def norm(df, key1, key2, field, n):
    """Keep the top-``n`` rows per ``key1`` and add a normalized weight column.

    Steps:
      1. Within each ``key1`` group, order rows by ``field`` descending and
         keep the first ``n`` (``row_number`` breaks ties arbitrarily).
      2. Sum ``field`` over the kept rows of each group into ``sum_<field>``.
      3. Join the sums back and add ``norm_<field>`` computed by
         ``udf_norm_pair(field, sum_<field>)``.

    :param df: input DataFrame containing at least ``key1`` and ``field``.
    :param key1: name of the column that defines the groups.
    :param key2: name of the second key column; accepted for call-site
        symmetry but not referenced by this function.
    :param field: name of the numeric weight column.
    :param n: maximum number of rows to keep per group.
    :return: a cached DataFrame with the columns of ``df`` plus
        ``sum_<field>`` and ``norm_<field>``.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having imported `col` before the first call.
    from pyspark.sql.functions import col

    window = Window.partitionBy(key1).orderBy(col(field).desc())

    topsDF = (
        df
        .withColumn("row_number", row_number().over(window))
        .filter(col("row_number") <= n)
        .drop(col("row_number"))
    )

    # `sum` here is pyspark.sql.functions.sum (imported at the top of this
    # cell), not the Python builtin.
    tmpDF = topsDF.groupBy(col(key1)).agg(col(key1), sum(col(field)).alias("sum_" + field))

    normalizedDF = (
        topsDF
        .join(tmpDF, key1, "inner")
        .withColumn("norm_" + field, udf_norm_pair(col(field), col("sum_" + field)))
        .cache()
    )

    return normalizedDF

In [5]:
from pyspark.sql import Window
from pyspark.sql.functions import col, rank

# userTrack = data.groupBy(col("userId"), col("trackId")).count()

# userTrackNorm = (
#     norm(userTrack, "userId", "trackId", "count", 1000)
#     .withColumn("id", col("userId"))
#     .withColumn("id2", col("trackId"))
#     .withColumn("norm_count", col("norm_count") * 0.5)
#     .select(col("id"), col("id2"), col("norm_count"))
# )

# window = Window.orderBy(col("norm_count"))
    
# userTrackList = (
#     userTrackNorm.withColumn("position", rank().over(window))
#     .filter(col("position") < 50)
#     .orderBy(col("id"), col("id2"))
#     .select(col("id"), col("id2"))
#     .take(40)
# )

In [6]:
# for val in userTrackList:
#     print "%s %s" % val

## Graph based Music Recommender. Task 1

Build the edges of the type “track-track”. To do this, compute the collaborative similarity between all tracks: if a user has listened to tracks A and B within a limited time interval (7 minutes), add 1 to the weight of the edge from vertex A to vertex B. For each track, choose the top 40 tracks most similar to it and normalize the weights of its edges (divide the weight of each edge by the sum of the weights of all of that track's edges).

Sort the resulting Data Frame in ascending order by the column norm_count, take top 40 rows, select only the columns “id1”, “id2”, sort them in descending order this time first by “id1”, then by “id2” and print the columns “id1”, “id2” of the resulting dataframe. Example:

```
54719	767867
54719	767866
50787	327676
```

---

_For all tasks use the same ipython notebook, each task should be the continuation of the previous._

In [7]:
# Co-listening window of 7 minutes (assumes timestamps are in seconds - TODO confirm).
TIME_DELTA = 7 * 60

In [8]:
# Self-join the listening log to find pairs of plays by the same user.
data_1 = data.alias('d1')
data_2 = data.alias('d2')

# A pair (d1, d2) qualifies when it is the same user, d2 started strictly
# after d1 but no more than TIME_DELTA later, and the tracks differ.
cond = (
    (col('d1.userId') == col('d2.userId'))
    & (col('d1.timestamp') < col('d2.timestamp'))
    & (col('d1.timestamp') + TIME_DELTA >= col('d2.timestamp'))
    & (col('d1.trackId') != col('d2.trackId'))
)

# An inner join keeps exactly the matching pairs. (A left outer join followed
# by dropna() yields the same rows, but first materializes every unmatched
# play only to throw it away.)
djoin = (
    data_1.join(data_2, cond, 'inner')
    .select(col('d1.trackId').alias('id1'), col('d2.trackId').alias('id2'))
    .groupBy(col('id1'), col('id2'))
    .count()
)

# For every track keep its 40 heaviest outgoing edges and normalize them.
track_track = (
    norm(djoin, 'id1', 'id2', 'count', 40)
    .select(col('id1'), col('id2'), col('norm_count'))
)

In [9]:
# Global ordering: heaviest normalized weight first, ids ascending as tie-breakers.
window = Window.orderBy(
    col('norm_count').desc(),
    col('id1').asc(),
    col('id2').asc(),
)

# Keep the 40 best-ranked track-track edges, then list them by id.
result = (
    track_track
    .withColumn('position', rank().over(window))
    .where(col('position') <= 40)
    .orderBy('id1', 'id2')
    .select('id1', 'id2')
)

for row in result.take(40):
    print('{r.id1} {r.id2}'.format(r=row))

798256 923706
798258 808254
798302 836228
798322 876562
798331 827364
798335 840741
798376 888871
798379 812055
798398 926302
798403 868805
798405 867217
798426 910880
798447 832635
798457 918918
798471 801831
798474 963162
798475 827475
798505 905671
798508 810743
798516 860347
798526 937573
798542 946408
798544 841232
798550 936295
798552 830267
798618 930224
798667 874844
798682 934393
798704 937570
798707 839389
798720 958333
798725 933147
798731 853117
798782 956938
798801 950802
798820 890393
798821 883244
798827 908022
798851 801321
798978 854212


## Graph based Music Recommender. Task 2

Build the edges of the type “user-track”. Take the number of times the user has listened to the track as the weight of the edge from the user’s vertex to the track’s vertex. For each user take the top 1000 tracks and normalize the weights.

Sort the resulting Data Frame in ascending order by the column norm_count, take top 40 rows, select only the columns “id1”, “id2”, sort them in descending order this time first by “id1”, then by “id2” and print the columns “id1”, “id2” of the resulting dataframe.

Example:

```
54719	767867
54719	767866
50787	327676
```

In [10]:
# user -> track edges: play counts per (user, track), top 1000 per user, normalized.
user_track = (
    norm(data.groupBy('userId', 'trackId').count(), 'userId', 'trackId', 'count', 1000)
    .select(
        col('userId').alias('id1'),
        col('trackId').alias('id2'),
        col('norm_count'),
    )
)

In [12]:
# Global ordering: heaviest normalized weight first, ids ascending as tie-breakers.
window = Window.orderBy(
    col('norm_count').desc(),
    col('id1').asc(),
    col('id2').asc(),
)

# Keep the 40 best-ranked user-track edges, then list them by id.
result = (
    user_track
    .withColumn('position', rank().over(window))
    .where(col('position') <= 40)
    .orderBy('id1', 'id2')
    .select('id1', 'id2')
)

for row in result.take(40):
    print('{r.id1} {r.id2}'.format(r=row))

66 965774
116 867268
128 852564
131 880170
195 946408
215 860111
235 897176
300 857973
321 915545
328 943482
333 818202
346 864911
356 961308
428 943572
431 902497
445 831381
488 841340
542 815388
617 946395
649 901672
658 937522
662 881433
698 935934
708 952432
746 879259
747 879259
776 946408
784 806468
806 866581
811 948017
837 799685
901 871513
923 879322
934 940714
957 945183
989 878364
999 967768
1006 962774
1049 849484
1057 920458


## Graph based Music Recommender. Task 3

Build the edges of the type “user-artist”. Take the amount of times the user has listened to the artist’s tracks as the weight of the edge from the user’s vertex to the artist’s vertex. For each user take top-100 artists and normalize weights.

Sort the resulting Data Frame in ascending order by the column norm_count, take top 40 rows, select only the columns “id1”, “id2”, sort them in descending order this time first by “id1”, then by “id2” and print the columns “id1”, “id2” of the resulting dataframe.

Example:

```
54719	767867
54719	767866
50787	327676
```

In [17]:
# user -> artist edges: play counts per (user, artist), top 100 per user, normalized.
user_artist = (
    norm(data.groupBy('userId', 'artistId').count(), 'userId', 'artistId', 'count', 100)
    .select(
        col('userId').alias('id1'),
        col('artistId').alias('id2'),
        col('norm_count'),
    )
)

In [18]:
# Global ordering: heaviest normalized weight first, ids ascending as tie-breakers.
window = Window.orderBy(
    col('norm_count').desc(),
    col('id1').asc(),
    col('id2').asc(),
)

# Keep the 40 best-ranked user-artist edges, then list them by id.
result = (
    user_artist
    .withColumn('position', rank().over(window))
    .where(col('position') <= 40)
    .orderBy('id1', 'id2')
    .select('id1', 'id2')
)

for row in result.take(40):
    print('{r.id1} {r.id2}'.format(r=row))

66 993426
116 974937
128 1003021
131 983068
195 997265
215 991696
235 990642
288 1000564
300 1003362
321 986172
328 967986
333 1000416
346 982037
356 974846
374 1003167
428 993161
431 969340
445 970387
488 970525
542 969751
612 987351
617 970240
649 973851
658 973232
662 975279
698 995788
708 968848
746 972032
747 972032
776 997265
784 969853
806 995126
811 996436
837 989262
901 988199
923 977066
934 990860
957 991171
989 975339
999 968823
