In [1]:
import os
from google.colab import drive
drive.mount('/content/drive')
import warnings
warnings.filterwarnings("ignore")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

In [3]:
!pip install pyspark



In [4]:
from pyspark.sql import SparkSession, functions
from pyspark.ml import Pipeline
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [5]:
def createDataframe(path):
    ''' 
    create Dataframe using csv path
    '''
    raw_data = spark.read.csv(path,header=True,inferSchema=True)
    df = pd.DataFrame(raw_data.toPandas())
    data_values=df.values.tolist()
    data_columns=list(df.columns)
    df = spark.createDataFrame(data_values,data_columns)
    return df

In [6]:
def getSongid(df):
    indexer = StringIndexer(inputCol="name", outputCol="songid")
    indexed = indexer.fit(df).transform(df)
    return indexed

In [7]:
# reference: https://spark.apache.org/docs/2.2.0/ml-collaborative-filtering.html
def alsRecall(df):
    (training, test) = df.randomSplit([0.8, 0.2])
    # rating is inferred from other signals, set implicitPrefs to True to get better results
    als = ALS(maxIter=15, regParam=0.01,implicitPrefs=True,userCol="userid", itemCol="songid", ratingCol="rating",
            coldStartStrategy="drop")
    model = als.fit(training)
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("Root-mean-square error = " + str(rmse))

    # Generate top 10 song recommendations for each user
    userRecs = model.recommendForAllUsers(10)
    # Generate top 10 user recommendations for each song
    songRecs = model.recommendForAllItems(3)

    return userRecs, songRecs, rmse

In [8]:
if __name__ == '__main__':
    data_path = "/content/drive/My Drive/6893project/data/merged.csv"
    spark = SparkSession.builder.appName('ALS').getOrCreate()
    columns = ['name','userid','rating']
    df = createDataframe(data_path).select(columns).cache()
    df = getSongid(df)

In [9]:
df.show(10)

+--------------------+------+-----------+------+
|                name|userid|     rating|songid|
+--------------------+------+-----------+------+
|             abcdefu|     0|5.021052507| 243.0|
|    Champagne Poetry|     0|9.979441337|  75.0|
|Chamber Of Reflec...|     0|1.128520011| 146.0|
|          In My Room|     0|1.128520011| 213.0|
|           Adore You|     0|3.622152942| 129.0|
|          Heat Waves|     0|9.979441337|  11.0|
|SLOW DANCING IN T...|     0|5.021052507|  13.0|
|     White Christmas|     0|3.817198528| 123.0|
|Bubbly (with Drak...|     0|8.899590206|  73.0|
|     Flashing Lights|     0|9.979441337| 176.0|
+--------------------+------+-----------+------+
only showing top 10 rows



In [10]:
userRecs, songRecs,rmse = alsRecall(df)

Root-mean-square error = 6.480136951395033


In [12]:
4userRecs.show()

+------+--------------------+
|userid|     recommendations|
+------+--------------------+
|     0|[{16, 1.0863357},...|
|     1|[{8, 1.1126817}, ...|
|     2|[{37, 1.0071075},...|
|     3|[{0, 1.0800748}, ...|
|     4|[{0, 1.0663652}, ...|
|     5|[{27, 1.0933261},...|
|     6|[{23, 1.0569347},...|
|     7|[{7, 1.0091039}, ...|
|     8|[{50, 1.010134}, ...|
|     9|[{75, 1.005351}, ...|
+------+--------------------+



In [13]:
songRecs.show()

+------+--------------------+
|songid|     recommendations|
+------+--------------------+
|     0|[{3, 1.0800748}, ...|
|     1|[{6, 1.0527464}, ...|
|     2|[{2, 1.0002174}, ...|
|     3|[{7, 0.99460363},...|
|     4|[{2, 0.9973035}, ...|
|     5|[{5, 1.0005875}, ...|
|     6|[{5, 1.0787176}, ...|
|     7|[{7, 1.0091039}, ...|
|     8|[{1, 1.1126817}, ...|
|     9|[{2, 0.9954386}, ...|
|    10|[{6, 0.9947193}, ...|
|    11|[{4, 0.9934742}, ...|
|    12|[{1, 0.9981467}, ...|
|    13|[{1, 1.0027547}, ...|
|    14|[{1, 0.9994098}, ...|
|    15|[{0, 1.0077391}, ...|
|    16|[{0, 1.0863357}, ...|
|    17|[{5, 0.9988396}, ...|
|    18|[{7, 1.0011135}, ...|
|    19|[{3, 0.99926937},...|
+------+--------------------+
only showing top 20 rows



In [14]:
rmse

6.480136951395033

In [15]:
model = alsRecall(df)

Root-mean-square error = 6.47418705368279
