# **WELCOME TO THIS NOTEBOOK**

In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Let's install pyspark

In [None]:
!pip install pyspark



Importing the modules

In [32]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, desc , col, max
from pyspark.ml.feature import  StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.recommendation import ALS # Alternating least square  
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

Creating the spark session


In [33]:
# first we will start the spark session
spark = SparkSession.builder.appName("lastfm").getOrCreate()


# Loading the dataset

In [None]:
file_path = '/content/drive/MyDrive/dataset/listenings.csv'

#Header is to infer column type and inferschema is for column name
df_music = spark.read.format('csv').option('header',True).option('inferSchema',True).load(file_path)
df_music.show()

+-----------+-------------+--------------------+---------------+--------------------+
|    user_id|         date|               track|         artist|               album|
+-----------+-------------+--------------------+---------------+--------------------+
|000Silenced|1299680100000|           Price Tag|       Jessie J|         Who You Are|
|000Silenced|1299679920000|Price Tag (Acoust...|       Jessie J|           Price Tag|
|000Silenced|1299679440000|Be Mine! (Ballad ...|          Robyn|            Be Mine!|
|000Silenced|1299679200000|            Acapella|          Kelis|            Acapella|
|000Silenced|1299675660000|   I'm Not Invisible|      The Tease|   I'm Not Invisible|
|000Silenced|1297511400000|Bounce (Feat NORE...|       MSTRKRFT|         Fist of God|
|000Silenced|1294498440000|Don't Stop The Mu...|        Rihanna|Addicted 2 Bassli...|
|000Silenced|1292438340000|               ObZen|      Meshuggah|               ObZen|
|000Silenced|1292437740000|   Yama's Messengers|      


# Cleaning tables 

In [None]:
df_music = df_music.drop('date')
df_music.show()

+-----------+--------------------+---------------+--------------------+
|    user_id|               track|         artist|               album|
+-----------+--------------------+---------------+--------------------+
|000Silenced|           Price Tag|       Jessie J|         Who You Are|
|000Silenced|Price Tag (Acoust...|       Jessie J|           Price Tag|
|000Silenced|Be Mine! (Ballad ...|          Robyn|            Be Mine!|
|000Silenced|            Acapella|          Kelis|            Acapella|
|000Silenced|   I'm Not Invisible|      The Tease|   I'm Not Invisible|
|000Silenced|Bounce (Feat NORE...|       MSTRKRFT|         Fist of God|
|000Silenced|Don't Stop The Mu...|        Rihanna|Addicted 2 Bassli...|
|000Silenced|               ObZen|      Meshuggah|               ObZen|
|000Silenced|   Yama's Messengers|         Gojira|The Way of All Flesh|
|000Silenced|On the Brink of E...|   Napalm Death|Time Waits For No...|
|000Silenced|On the Brink of E...|   Napalm Death|Time Waits For

In [None]:
# now we can see that there are some NAN values that we will drop

df_music = df_music.na.drop()
df_music.show()

+-----------+--------------------+---------------+--------------------+
|    user_id|               track|         artist|               album|
+-----------+--------------------+---------------+--------------------+
|000Silenced|           Price Tag|       Jessie J|         Who You Are|
|000Silenced|Price Tag (Acoust...|       Jessie J|           Price Tag|
|000Silenced|Be Mine! (Ballad ...|          Robyn|            Be Mine!|
|000Silenced|            Acapella|          Kelis|            Acapella|
|000Silenced|   I'm Not Invisible|      The Tease|   I'm Not Invisible|
|000Silenced|Bounce (Feat NORE...|       MSTRKRFT|         Fist of God|
|000Silenced|Don't Stop The Mu...|        Rihanna|Addicted 2 Bassli...|
|000Silenced|               ObZen|      Meshuggah|               ObZen|
|000Silenced|   Yama's Messengers|         Gojira|The Way of All Flesh|
|000Silenced|On the Brink of E...|   Napalm Death|Time Waits For No...|
|000Silenced|On the Brink of E...|   Napalm Death|Time Waits For

In [None]:
rows = df_music.count()
columns = len(df_music.columns)
print(rows, columns)

13758905 4



# Performing some aggregation
to see how many times each user has listened to specific track


In [None]:
df_music_agg = df_music.select('user_id','track').groupby('user_id', 'track').agg(count('*')).orderBy('user_id')
df_music_agg.show()

+-------+--------------------+--------+
|user_id|               track|count(1)|
+-------+--------------------+--------+
| --Seph|Chelsea Hotel - L...|       1|
| --Seph|               Leloo|       1|
| --Seph|          Paris 2004|       7|
| --Seph|     The Way We Were|       1|
| --Seph|        Window Blues|       1|
| --Seph|   Summa for Strings|       1|
| --Seph|         The Embrace|       1|
| --Seph|       Life On Mars?|       1|
| --Seph|Hungarian Rhapsod...|       1|
| --Seph| Air on the G String|       1|
| --Seph|Vestido Estampado...|       1|
| --Seph|Belina (Original ...|       1|
| --Seph|Hungarian Dance No 5|       1|
| --Seph|       Phantom Pt II|       1|
| --Seph|              Monday|       1|
| --Seph| White Winter Hymnal|       3|
| --Seph|Airplanes [feat H...|       1|
| --Seph|  California Waiting|       1|
| --Seph|      Hour for magic|       2|
| --Seph|Virus (Luke Fair ...|       1|
+-------+--------------------+--------+
only showing top 20 rows



In [None]:
rows2 = df_music_agg.count()
columns2 = len(df_music_agg.columns)
print(rows2, columns2)

9930128 3


In [None]:
# before the next step we will decrease the volumne of our data as it will take a lot of time.
# therefore for efficiency

#Lets select first 20k rows

df_music_agg = df_music_agg.limit(20000)

# Let's convert the user id and track columns into unique integers




In [None]:
# now we wanna use string indexer to convert user_id column and track column into unique integer values
# from pyspark.ml.feature import  StringIndexer
# from pyspark.ml import Pipeline

indexer = [StringIndexer(inputCol = col, outputCol=col+'_index').fit(df_music_agg) for col in list(set(df_music_agg.columns) - set(['count']))]

pipeline = Pipeline(stages=indexer)

df_data = pipeline.fit(df_music_agg).transform(df_music_agg)
df_data.show()

+-------+--------------------+--------+-------------+-----------+--------------+
|user_id|               track|count(1)|user_id_index|track_index|count(1)_index|
+-------+--------------------+--------+-------------+-----------+--------------+
| --Seph| White Winter Hymnal|       3|      32095.0|      365.0|           2.0|
| --Seph|Virus (Luke Fair ...|       1|      32095.0|   285449.0|           0.0|
| --Seph|Airplanes [feat H...|       1|      32095.0|    21512.0|           0.0|
| --Seph|Belina (Original ...|       1|      32095.0|   207900.0|           0.0|
| --Seph|              Monday|       1|      32095.0|     2049.0|           0.0|
| --Seph|Hungarian Dance No 5|       1|      32095.0|    23260.0|           0.0|
| --Seph|       Life On Mars?|       1|      32095.0|      674.0|           0.0|
| --Seph|  California Waiting|       1|      32095.0|     5001.0|           0.0|
| --Seph|       Phantom Pt II|       1|      32095.0|     7907.0|           0.0|
| --Seph|   Summa for String

In [None]:
data = df_data.select('user_id_index','track_index','count(1)').orderBy('user_id_index')
data.show()

+-------------+-----------+--------+
|user_id_index|track_index|count(1)|
+-------------+-----------+--------+
|          0.0|   332232.0|       1|
|          0.0|   501305.0|       1|
|          0.0|    17414.0|       2|
|          0.0|    99207.0|       1|
|          0.0|      770.0|       1|
|          0.0|      447.0|       1|
|          0.0|   908489.0|       1|
|          0.0|     9728.0|       1|
|          0.0|   341246.0|       1|
|          0.0|   326910.0|       1|
|          0.0|   140078.0|       1|
|          0.0|   510823.0|       1|
|          0.0|     8489.0|       1|
|          0.0|   435641.0|       1|
|          0.0|   326252.0|       1|
|          0.0|   754139.0|       1|
|          0.0|   112292.0|       1|
|          0.0|    29033.0|       1|
|          0.0|    37045.0|       1|
|          0.0|    10583.0|       1|
+-------------+-----------+--------+
only showing top 20 rows



# Train and Test data

In [41]:
(training, test) =  data.randomSplit([0.5,0.5])

# Let's Create our Model

In [None]:
USERID = 'user_id_index'
TRACK = 'track_index'
COUNT = 'count(1)'

als = ALS(maxIter=5 , regParam=0.01, userCol=USERID, itemCol=TRACK, ratingCol=COUNT)

model = als.fit(training)

predictions = model.transform(test)

Py4JJavaError: ignored


# Generate top 10 Track recommendations for each user

In [None]:
recs = model.recommendForAllUsers(10)

In [None]:
recs.show()

In [None]:
# We can see user item index and a number of recommendations
recs.take(1)