# **WELCOME TO THIS NOTEBOOK**

In [None]:
# Mount Google Drive into the Colab filesystem; the dataset path used later in
# this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Let's install pyspark

In [None]:

!pip install pyspark==3.0.1





Importing the modules

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, desc , col, max
from pyspark.ml.feature import  StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder



```
# This is formatted as code
```

Creating the spark session


In [None]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('lastfm2')
    .getOrCreate()
)

# Loading the dataset

In [None]:
# Location of the listenings CSV on the mounted Drive.
listening_data_path = '/content/drive/MyDrive/dataset/listenings.csv'

# Read the CSV with a header row and let Spark infer the column types.
listening_df = (
    spark.read.format('csv')
    .option('inferSchema', True)
    .option('header', True)
    .load(listening_data_path)
)


# Cleaning tables 

In [None]:
# The 'date' column is not used by the aggregation or the model below, so drop it.
listening_df = listening_df.drop('date')

In [None]:
# Remove every row that contains a null in any column.
listening_df = listening_df.dropna()

In [None]:
# Number of listening events left after the cleaning steps.
listening_df.count()

13758905


# Let's perform some aggregation
to see how many times each user has listened to each track


In [None]:
# Count how many times each user played each track; sort by user for inspection.
df_listening_agg = (
    listening_df
    .select('user_id', 'track')
    .groupBy('user_id', 'track')
    .agg(count('*').alias('count'))
    .orderBy('user_id')
)

In [None]:
# Preview the first rows of the per-user, per-track play counts.
df_listening_agg.show()

+-------+--------------------+-----+
|user_id|               track|count|
+-------+--------------------+-----+
| --Seph|Chelsea Hotel - L...|    1|
| --Seph|        Window Blues|    1|
| --Seph|          Paris 2004|    7|
| --Seph|     The Way We Were|    1|
| --Seph|Vestido Estampado...|    1|
| --Seph|               Leloo|    1|
| --Seph|         The Embrace|    1|
| --Seph|      Hour for magic|    2|
| --Seph|Hungarian Rhapsod...|    1|
| --Seph| Air on the G String|    1|
| --Seph|       Life On Mars?|    1|
| --Seph|Belina (Original ...|    1|
| --Seph|   Summa for Strings|    1|
| --Seph|       Phantom Pt II|    1|
| --Seph|              Monday|    1|
| --Seph|Hungarian Dance No 5|    1|
| --Seph|Airplanes [feat H...|    1|
| --Seph|  California Waiting|    1|
| --Seph|Virus (Luke Fair ...|    1|
| --Seph| White Winter Hymnal|    3|
+-------+--------------------+-----+
only showing top 20 rows



In [None]:
# Number of distinct (user_id, track) pairs produced by the aggregation.
df_listening_agg.count()

9930128

In [None]:
# Number of columns in the aggregated frame.
len(df_listening_agg.columns)

3

In [None]:
# Keep only 5000 aggregated rows for the rest of the notebook.
# NOTE(review): presumably a sample to keep indexing and ALS training fast on
# Colab - confirm. The frame is ordered by user_id only, so exactly which rows
# are kept within a user may not be stable between evaluations - verify.
df_listening_agg = df_listening_agg.limit(5000)



```
# This is formatted as code
```

# Let's convert the user id and track columns into unique integers




In [None]:

# Index every column except the numeric play count. Taking the names from
# `columns` (a list) keeps the stage order - and so the order of the generated
# *_index columns - the same on every run; iterating over a set does not.
index_cols = [name for name in df_listening_agg.columns if name != 'count']

# Leave the StringIndexer stages unfitted: Pipeline.fit() fits each stage, so
# fitting them by hand first would scan the data an extra time per column.
indexer = [
    StringIndexer(inputCol=name, outputCol=name + '_index')
    for name in index_cols
]

pipeline = Pipeline(stages=indexer)
data = pipeline.fit(df_listening_agg).transform(df_listening_agg)

In [None]:

# Preview the frame with the added integer-valued *_index columns.
data.show()

+-------+--------------------+-----+-----------+-------------+
|user_id|               track|count|track_index|user_id_index|
+-------+--------------------+-----+-----------+-------------+
| --Seph|   Summa for Strings|    1|     3770.0|         16.0|
| --Seph|      Hour for magic|    2|     1909.0|         16.0|
| --Seph| White Winter Hymnal|    3|     4587.0|         16.0|
| --Seph|Virus (Luke Fair ...|    1|     4425.0|         16.0|
| --Seph|Airplanes [feat H...|    1|      358.0|         16.0|
| --Seph|Belina (Original ...|    1|      604.0|         16.0|
| --Seph|              Monday|    1|     2663.0|         16.0|
| --Seph|Hungarian Dance No 5|    1|     1933.0|         16.0|
| --Seph|       Life On Mars?|    1|     2402.0|         16.0|
| --Seph|  California Waiting|    1|      801.0|         16.0|
| --Seph|       Phantom Pt II|    1|     3062.0|         16.0|
| --Seph|Hungarian Rhapsod...|    1|     1934.0|         16.0|
| --Seph|     The Way We Were|    1|     4116.0|       

In [None]:
# Lookup table pairing each track title with its index, used further down to
# turn recommended indices back into titles.
dict_track = (
    data
    .select('track', 'track_index')
    .distinct()
)
print(dict_track.count())

4811


In [None]:
# Keep only the columns ALS needs, sorted by user index from highest to lowest.
data1 = (
    data
    .select('count', 'user_id_index', 'track_index')
    .orderBy('user_id_index', ascending=False)
)

In [None]:
# Check the column types of the frame that will be fed to ALS.
data1.printSchema()

root
 |-- count: long (nullable = false)
 |-- user_id_index: double (nullable = false)
 |-- track_index: double (nullable = false)



In [None]:
# Preview the model input (play count, user index, track index).
data1.show()

+-----+-------------+-----------+
|count|user_id_index|track_index|
+-----+-------------+-----------+
|    1|         37.0|      261.0|
|    1|         37.0|     4713.0|
|    1|         37.0|     2921.0|
|    1|         36.0|      876.0|
|    1|         36.0|     4106.0|
|    1|         36.0|     4592.0|
|    1|         36.0|     3818.0|
|    1|         36.0|     3719.0|
|    1|         36.0|     2003.0|
|    2|         36.0|     3226.0|
|    1|         36.0|     1419.0|
|    1|         36.0|     4288.0|
|    1|         36.0|     4646.0|
|    1|         36.0|     1632.0|
|    1|         36.0|      666.0|
|    3|         36.0|     2673.0|
|    2|         35.0|     3184.0|
|    1|         35.0|     2049.0|
|    1|         35.0|      170.0|
|    1|         35.0|     2878.0|
+-----+-------------+-----------+
only showing top 20 rows



# Train and Test data

In [None]:
# Split the interactions into a training part and a held-out part. A fixed seed
# makes the split repeatable for the same input, so the trained model and the
# recommendations below can be reproduced on a fresh run.
# NOTE(review): the weights give 40% of the rows to training and 60% to testing,
# which is an unusual ratio - confirm that this is intentional.
(train , test)  = data1.randomSplit([0.4 , 0.6], seed=42)

# Let's Create our Model

In [None]:
# Column names handed to ALS. Do not call the rating constant `count`: that
# would rebind the `count` aggregate imported from pyspark.sql.functions and
# break the aggregation cell if it is run again.
USER_COL = 'user_id_index'
ITEM_COL = 'track_index'
RATING_COL = 'count'

# NOTE(review): play counts look like implicit feedback; consider
# implicitPrefs=True - confirm before changing, as it alters the model.
als = ALS(
    maxIter=5,
    rank=10,
    regParam=0.01,
    userCol=USER_COL,
    itemCol=ITEM_COL,
    ratingCol=RATING_COL,
    # Users or tracks that appear only in the test split have no learned
    # factors; 'drop' removes those rows instead of emitting NaN predictions.
    coldStartStrategy='drop',
)

# Fit on the training split, then score the held-out split.
model = als.fit(train)
predictions = model.transform(test)



# Generate top 10 Track recommendations for each user

In [None]:
# Ask the fitted model for the 10 highest-scoring tracks for every user.
recommendations = model.recommendForAllUsers(10)

In [None]:
# Preview the per-user recommendation lists.
recommendations.show()

+-------------+--------------------+
|user_id_index|     recommendations|
+-------------+--------------------+
|           31|[[1589, 14.994965...|
|           34|[[154, 19.994606]...|
|           28|[[1381, 6.988539]...|
|           26|[[2478, 22.97975]...|
|           27|[[1663, 6.9820185...|
|           12|[[2478, 10.678355...|
|           22|[[2478, 7.4050927...|
|            1|[[1789, 3.9897099...|
|           13|[[154, 6.7562275]...|
|            6|[[1414, 4.029411]...|
|           16|[[1789, 4.449752]...|
|            3|[[2675, 2.6989505...|
|           20|[[2478, 14.691655...|
|            5|[[3483, 7.555993]...|
|           19|[[1663, 4.399854]...|
|           15|[[3483, 4.261537]...|
|           37|[[154, 8.208038],...|
|           17|[[1256, 3.9761558...|
|            9|[[2478, 4.5652537...|
|           35|[[563, 6.8483744]...|
+-------------+--------------------+
only showing top 20 rows



In [None]:
# Bring a single row to the driver so one user's recommendations can be inspected.
rec1 = recommendations.take(1)

In [None]:
# Keep only the 'recommendations' field of each fetched row.
rec2 = [fetched_row['recommendations'] for fetched_row in rec1]

In [None]:
# Take the last recommendation list by index instead of pop(): pop() empties
# rec2, so running this cell a second time would raise IndexError.
rec3 = rec2[-1]

In [None]:
# Extract the recommended track indices from the selected recommendation list.
tracks = [recommendation['track_index'] for recommendation in rec3]
tracks

[1589, 260, 3496, 1139, 201, 526, 2175, 154, 687, 2056]

In [None]:
# Translate the recommended track indices back into track titles.
t = (
    dict_track
    .select('track')
    .filter(col("track_index").isin(tracks))
)



In [None]:
# Spot-check the lookup table for one index (1589 is the first recommended
# index in the recorded run).
dict_track.where(col("track_index").isin(1589)).show()

+-----+-----------+
|track|track_index|
+-----+-----------+
|Fugue|     1589.0|
+-----+-----------+



In [None]:
# Show the titles of the recommended tracks.
t.show()

+--------------------+
|               track|
+--------------------+
|            Shock Me|
|   Desperation Burns|
|               Fugue|
|Blue Day for Croatoa|
|1000 Points of Light|
|                Ayil|
|        9mm Solution|
|If I Have to Wake...|
|Je N'en Connais P...|
|Tokyo (Vampires &...|
+--------------------+



# Generate top 3 user recommendations for each track

In [None]:
# Ask the fitted model for the 3 highest-scoring users for every track.
recc  = model.recommendForAllItems(3)

In [None]:
# Preview the per-track user recommendations.
recc.show()

+-----------+--------------------+
|track_index|     recommendations|
+-----------+--------------------+
|       1580|[[6, 0.9923823], ...|
|       4101|[[29, 0.99651045]...|
|       2122|[[6, 0.9923823], ...|
|       3175|[[1, 0.9905428], ...|
|       2366|[[18, 0.9944534],...|
|        148|[[29, 0.99651045]...|
|       3918|[[33, 0.9987579],...|
|       4519|[[15, 1.9885733],...|
|       1460|[[13, 0.9940494],...|
|       2580|[[17, 0.99403894]...|
|       4190|[[29, 0.99651045]...|
|       2811|[[3, 0.9914764], ...|
|       1483|[[33, 0.9987579],...|
|       1025|[[28, 1.9967256],...|
|       2235|[[9, 1.985991], [...|
|       3475|[[9, 0.9929955], ...|
|        897|[[7, 0.9911765], ...|
|       1507|[[8, 0.99335194],...|
|        858|[[10, 0.9944298],...|
|       4158|[[27, 0.9974312],...|
+-----------+--------------------+
only showing top 20 rows

