In [1]:
from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS, Rating
from random import randrange

sc = SparkContext.getOrCreate()

In [2]:
def title(s):
    print("---- %s -----" %s)    
    
def see(s, v):
    print("---- %s -----" %s)
    print(v)

# Implement the AreaUnderCurve function
## Requirements:
You are required to build the function that calculates the area under the curve (AUC). This function is used to measure the quality of the recommneder model. The idea is to simply measure the probability that a randomly chosen good (we know the user played) recommendation ranks above a randomly chosen bad (we know the user didnt' play) recommendation.

AUC accepts the CV dataset as the “positive” or “good” artists for each user, and a prediction function. This function translates each user-artist pair into a prediction as a Rating containing the user, artist, and a number wherein higher values mean higher rank in the recommendations.


Fill all the code under lines like this:

<font color="red">### ---CODE HERE --- ###</font>



### Helper functions

#### `isInt`
- Checks if a string is an integer

#### `buildArtistAlias`
- Map artist alias ID to a unique artist ID
- Handle corrupt non-convertable to int ids by ignoring them

####  `buildRatings`
`Rating(user, product, rating)`
- Represents a (user, product, rating) tuple.

In [3]:
def isInt(s):
    try:
        int(s)
        return True
    except ValueError:
        return False
    
def buildArtistAlias(rawArtistAlias):
    # Convert ther rawArtistData into tuples of (aliasID, artistID)
    # Filter all bad lines
    return rawArtistAlias \
        .map(lambda line: line.split('\t')) \
        .filter(lambda artist: artist[0] and isInt(artist[0])) \
        .map(lambda artist: (int(artist[0]), int(artist[1]))) \
        .collectAsMap()

        
def getArtistRating(line):
    # Parse the line to extract the 3 fields
    (userID, artistID, count) = map(lambda x: int(x), line.split(' '))
    try:
        # Lookup if the current artistID is an just an alias to an artist
        # The lookup is done from a the broadcast RDD
        finalArtistID = bArtistAlias.value[artistID]
    except KeyError:
        # if the lookup failed, then we have a new artist
        finalArtistID = artistID
    #Finally, create a new rating Object
    return Rating(userID, finalArtistID, count)


# Go over all User-Artist data and convert each line to a Rating object
def buildRatings(rawUserArtistData, bArtistAlias):
    return rawUserArtistData.map(lambda line: getArtistRating(line))


### Reading and caching the files RDDs

In [5]:
# --Replace the base location with your local path--
base = "./profiledata_06-May-2005/"
rawArtistAlias = sc.textFile(base + "artist_alias.txt").cache()
rawUserArtistData = sc.textFile(base + "user_artist_data.txt").cache()

### Preparing the artist aliases and user ratings

In [7]:
artistAlias = buildArtistAlias(rawArtistAlias)
bArtistAlias = sc.broadcast(artistAlias)

allData = buildRatings(rawUserArtistData, bArtistAlias)
allItemIDs = allData.map(lambda item: item.product).distinct().collect()
bAllItemIDs = sc.broadcast(allItemIDs)

### Split the data into trainData (90%) and cvData (10%)

In [8]:
(trainData, cvData) = allData.randomSplit(weights=[0.9, 0.1])

### Building the model

In [10]:
model = ALS.trainImplicit(ratings=trainData, rank=10, iterations=5, lambda_=0.01, alpha=1.0)

## Implement the `areaUnderCurve` function

In [29]:
def areaUnderCurve(positiveData, bAllItemIDs, predictFunction):
    
    
    ### ---CODE HERE (2 vars below)--- ###
    # positiveData contains information about artists who were listend to by certain users.
    # Create a positiveUserProducts that maps the positiveDate to (userID, artistID) tuple.
    positiveUserProducts = positiveData.map(lambda x:(x[0],x[1]))
    
    # We need to create a negativeUserProducts that contains artists the user did not listen to.
    ## - Group the artists of each user in a (userID1, [artistID1, artistID12,.....]) tuple.
    groupedPositiveUserProducts = positiveUserProducts.groupByKey().map(lambda x:(x[0],list(x[1])))
    
    ## - For each user, map their artist list to a new randomly selected unseed list of artists 
    ## - To do this, you need to implement a pos2neg function that takes the user and list of positive artists as parameters 
    ### - Selects a random artists from all artists (hint: bAllItemIDs)
    ### - Checks if the selected random artists is not in the positive artisits list
    ### - If not then add it to a negative list
    ### - Stop when the negative artists list is as big as the positive artists list.
    ### - return a tuple that contains that userID and the new negative artists list
    def pos2neg(userID, positiveArtists):
        ### ---CODE HERE--- ###
        import random
        negativeArtisits = []
        temp = bAllItemIDs.value
        temp = list(set(temp) - set(positiveArtists))
        for i in range(len(positiveArtists)):
            t=random.choice(temp)
            temp.remove(t)
            negativeArtisits.append(t)
        return (userID, negativeArtisits)
    
    
    ### ---CODE HERE (5 vars below)--- ###
    groupedNegativeUserProducts = groupedPositiveUserProducts.map(lambda x:pos2neg(x[0],x[1]))
    
    # - Map (userID1, [artistID1, artistID12,.....]) tuple to [(userID1, artistID1), (user1, artistID2), ...]
    negativeUserProducts = groupedNegativeUserProducts.flatMapValues(lambda x:x)

    # Use the predictFunction with positiveUserProducts and negativeUserProductsto get the rating value for each user, artist tuple and group each by the user
    positivePredictions = predictFunction(positiveUserProducts).map(lambda x:(x[0],(x[1],x[2]))).groupByKey().map(lambda x:(x[0],list(x[1])))
    negativePredictions = predictFunction(negativeUserProducts).map(lambda x:(x[0],(x[1],x[2]))).groupByKey().map(lambda x:(x[0],list(x[1])))
    
    # Join the predicted output of the positive and negative RDD and get their value only (drop the key)
    posAndNegRatingsJoined = positivePredictions.join(negativePredictions).map(lambda x:x[1])
    
    # Map the joined RDD to calcualte the probability of true positive
    ## To do this, you need to create a function probabilityOfTruePositive that takes the postive and negative artists ratings list of a user as a parameter
    ### - For each positive artist, count the number of times the positive rating was higher than the negative rating
    ### - Divid the count produced from the previous step with the total count of items in the positive and negative lists
    ### - return the final ouptut
    def probabilityOfTruePositive(positiveRatings, negativeRatings):
        ### ---CODE HERE--- ###
        total = 0
        correct=0
        for i in positiveRatings:
            for j in negativeRatings:
                total=total+1
                if i[1] >= j[1]:
                    correct=correct+1
        return float(correct)/total
    
    ### ---CODE HERE (2 vars below)--- ###
    probabilities = posAndNegRatingsJoined.map(lambda x:probabilityOfTruePositive(x[0],x[1]))
    mean = probabilities.mean()
    
    # Return the mean of the RDD producted above.
    return mean


areaUnderCurve(cvData, bAllItemIDs, model.predictAll)

0.4633431085043988

### Expected Output:
~*0.9659*