In [1]:
## This project uses implicit feedback from users
### Implicit feedback: Number of times viewed, length of session, browsing history, mouse movements...

## The major advantage of implicit feedback is its availability.
## The disadvantages of implicit feedback include:
### a. No negative feedback
### b. Likely to be noisy
### c. No preference or order (can only be used to model confidence)

## A model using implicit feedback can use RMSE for optimization, but can't use RMSE for evaluation (no negative feedback).
## The metrics used to evaluate ALS models with implicit feedback are Mean Average Precision (MAP)
## and Normalized Discounted Cumulative Gain (NDCG), which are not implemented in Spark yet. (Available in scikit-learn.)

## 1. Initiate App and Load Raw Data

In [2]:
## SparkSession has to be imported explicitly so this cell works on a fresh kernel
from pyspark.sql import SparkSession

## create (or reuse) the Spark session that every later cell relies on
spark=SparkSession.builder\
                  .appName('Collaborative Filtering Music Recommendation System with Implicit Feedbacks')\
                  .getOrCreate()

In [3]:
## load the raw user-artist interactions (tab-separated file with a header row);
## 'weight' is the number of times the user listened to the artist
rawData=spark.read.format('csv').option('delimiter','\t').option('header','true')\
            .load('02/demos/datasets/lastfm/user_artists.dat')

## preview five rows only; limit() avoids collecting the whole dataset to the driver
rawData.limit(5).toPandas()

Unnamed: 0,userID,artistID,weight
0,2,51,13883
1,2,52,11690
2,2,53,11351
3,2,54,10300
4,2,55,8983


## 2. Feature Engineering 

In [4]:
from pyspark.sql.functions import col

## cast each of the three columns to int, keeping the original column order
intColumns=[col(name).cast('int') for name in ('userID','artistID','weight')]
DF=rawData.select(*intColumns)

In [5]:
## summary statistics (count, mean, stddev, min, max) of the weight column

DF.describe('weight').toPandas()

Unnamed: 0,summary,weight
0,count,92834.0
1,mean,745.2439300256372
2,stddev,3751.32208038768
3,min,1.0
4,max,352698.0


In [6]:
## standardize the weight column: (weight - mean) / stddev

from pyspark.sql.functions import stddev,mean,col

## one-row DataFrame holding the global mean and standard deviation of 'weight'
weightStats=DF.agg(mean('weight').alias('mean_weight'),
                   stddev('weight').alias('stddev_weight'))

## the cross join attaches that single row of statistics to every interaction,
## so the scaled value can be computed with plain column arithmetic
zScore=(col('weight') - col('mean_weight')) / col('stddev_weight')
standDF=weightStats.crossJoin(DF).withColumn('weight_scaled', zScore)

standDF.toPandas().head(5)

Unnamed: 0,mean_weight,stddev_weight,userID,artistID,weight,weight_scaled
0,745.24393,3751.32208,2,51,13883,3.502167
1,745.24393,3751.32208,2,52,11690,2.917573
2,745.24393,3751.32208,2,53,11351,2.827205
3,745.24393,3751.32208,2,54,10300,2.547037
4,745.24393,3751.32208,2,55,8983,2.195961


In [7]:
## split into training (80%) and testing (20%) datasets;
## the fixed seed makes the split, and everything trained on it, repeatable across runs

(trainingData,testData)=standDF.randomSplit([0.8,0.2],seed=42)

## 3. Build ALS model

In [8]:
from pyspark.ml.recommendation import ALS

## ALS in implicit-feedback mode, using 'weight_scaled' as the feedback strength for
## each (userID, artistID) pair. coldStartStrategy='drop' removes rows whose
## prediction would be NaN (users/artists that were not seen during training).
## NOTE(review): 'weight_scaled' is a z-score, so below-average play counts are
## negative. With implicitPrefs=True Spark appears to treat non-positive values as
## "no preference" -- confirm this is intended, or train on a non-negative signal.
als=ALS(maxIter=10,
        regParam=0.1,
        userCol='userID',
        itemCol='artistID',
        implicitPrefs=True,
        ratingCol='weight_scaled',
        coldStartStrategy='drop',
        seed=42  ## factor initialisation is random; pin it so results are repeatable
       )

model=als.fit(trainingData)

In [9]:
## score the held-out interactions with the trained model and preview five rows
predictions = model.transform(testData)
predictions.toPandas().head(5)

Unnamed: 0,mean_weight,stddev_weight,userID,artistID,weight,weight_scaled,prediction
0,745.24393,3751.32208,3,148,66,-0.181068,0.0
1,745.24393,3751.32208,1137,463,77,-0.178136,0.0
2,745.24393,3751.32208,850,463,784,0.010331,0.001447
3,745.24393,3751.32208,2079,471,282,-0.123488,0.007455
4,745.24393,3751.32208,1145,471,129,-0.164274,0.007188


In [10]:
## examine the distribution of the scaled weights next to the model's predictions

comparisonColumns=['weight_scaled','prediction']
predictionsPandas=predictions.select(comparisonColumns).toPandas()
predictionsPandas.describe()

Unnamed: 0,weight_scaled,prediction
count,16007.0,16007.0
mean,0.007888,0.04268
std,0.797481,0.099586
min,-0.198395,-0.345408
25%,-0.168006,0.0
50%,-0.124554,0.002539
75%,-0.023657,0.036606
max,34.917758,0.925803


## 4. Music Recommendation Engine

In [11]:
## Load artist info (tab-separated file with a header row)

artistData=spark.read.format('csv').option('delimiter', '\t').option('header', 'true')\
                     .load('02/demos/datasets/lastfm/artists.dat')

## preview five rows only; limit() avoids collecting the whole dataset to the driver
artistData.limit(5).toPandas()

Unnamed: 0,id,name,url,pictureURL
0,1,MALICE MIZER,http://www.last.fm/music/MALICE+MIZER,http://userserve-ak.last.fm/serve/252/10808.jpg
1,2,Diary of Dreams,http://www.last.fm/music/Diary+of+Dreams,http://userserve-ak.last.fm/serve/252/3052066.jpg
2,3,Carpathian Forest,http://www.last.fm/music/Carpathian+Forest,http://userserve-ak.last.fm/serve/252/40222717...
3,4,Moi dix Mois,http://www.last.fm/music/Moi+dix+Mois,http://userserve-ak.last.fm/serve/252/54697835...
4,5,Bella Morte,http://www.last.fm/music/Bella+Morte,http://userserve-ak.last.fm/serve/252/14789013...


In [12]:
## Define a function to get artist recommendations

from pyspark.sql.types import IntegerType

def musicRecommendationsForUser(userId,numRecs):
    """Return the top artist recommendations for a single user.

    Parameters
    ----------
    userId : int
        Value of the 'userID' column identifying the user.
    numRecs : int
        Maximum number of artists to recommend.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns 'name', 'url' and 'rating', ordered by 'rating' descending.
        The DataFrame is empty when the model has no recommendations for
        ``userId`` (for example, the user was not in the training data).
    """
    ## ask the model for this user only, rather than scoring every user and
    ## filtering afterwards
    singleUserDF=spark.createDataFrame([(int(userId),)], schema='userID int')
    userRecRows=model.recommendForUserSubset(singleUserDF, numRecs)\
                     .select('recommendations')\
                     .collect()

    ## an unknown user yields no rows; fall back to an empty list instead of
    ## failing with IndexError on collect()[0]
    recArtistList=userRecRows[0].recommendations if userRecRows else []

    ## explicit schema so that an empty recommendation list still builds a DataFrame
    recArtistDF=spark.createDataFrame(recArtistList, schema='artistID int, rating float')

    ## attach artist metadata and return the highest-scored artists first
    recArtistFinalDF=artistData.join(recArtistDF, artistData.id==recArtistDF.artistID)\
                    .orderBy('rating',ascending=False)\
                    .select('name','url','rating')

    return recArtistFinalDF

In [13]:
## top five recommended artists for user 250, shown as a pandas table
musicRecommendationsForUser(userId=250, numRecs=5).toPandas()

Unnamed: 0,name,url,rating
0,The Beatles,http://www.last.fm/music/The+Beatles,0.238277
1,Pink Floyd,http://www.last.fm/music/Pink+Floyd,0.202532
2,Led Zeppelin,http://www.last.fm/music/Led+Zeppelin,0.134917
3,Radiohead,http://www.last.fm/music/Radiohead,0.097381
4,Queen,http://www.last.fm/music/Queen,0.081031


In [14]:
## Actual data for User 250
## Use the integer-typed DF rather than rawData: rawData's columns were read from
## the CSV as strings, so ordering by its 'weight' sorts lexicographically
## (e.g. '10000' before '999') instead of numerically.
userActualRaw=DF.filter(DF.userID==250)

## join in the artist names and order by listen count (ascending)
userActual=artistData.join(userActualRaw,userActualRaw.artistID==artistData.id).orderBy('weight')\
            .select('userID','name','weight')

userActual.toPandas().head()

Unnamed: 0,userID,name,weight
0,250,The Mothers of Invention,1119
1,250,Pink Floyd,1132
2,250,Aesop Rock,1633
3,250,Frank Zappa,2285
4,250,GG Allin,2336
