# PART1: Getting and processing the data

## Download the dataset from the GroupLens url

In [1]:
#complete_dataset_url = 'http://files.grouplens.org/datasets/movielens/ml-latest.zip'
small_dataset_url = 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'

In [2]:
import os

#complete_dataset_path = os.path.join('data', 'ml-latest.zip')
small_dataset_path = os.path.join('data', 'ml-latest-small.zip')

In [3]:
datasets_path = 'data'

ratings_file = os.path.join(datasets_path, 'ml-latest-small', 'ratings.csv')
movies_file = os.path.join(datasets_path, 'ml-latest-small', 'movies.csv')


In [4]:
ratings_raw_data = sc.textFile(ratings_file)
movies_raw_data = sc.textFile(movies_file)


# Parse the data

In [5]:
def removeHeader(data):
    """ Remove the header of the input raw data
        Args:
            data (RDD): input data from the dataset
        Returns:
            same data without header
    """
    header = data.take(1)[0]
    
    return data.filter(lambda line: line!=header)

In [6]:
movies_data = removeHeader(movies_raw_data)
ratings_data = removeHeader(ratings_raw_data)

In [7]:
def parseData(data_no_header):

    """ Parses a data file by commas
        Args:
            data without header
        Returns:
            parsed data
    """

    return data_no_header.map(lambda line: line.split(",")).map(lambda tokens: (tokens[0],tokens[1],tokens[2])).cache()

In [8]:
def parseData2(data_no_header):

    """ Parses a data file by commas
        Args:
            data without header
        Returns:
            parsed data
    """
    # Firstly, we parse the data by commas
    data_split = data_no_header.map(lambda line: line.split(",")).collect() 
    # Then we check if the division returns more than 3 columns, as is expected.
    # If there are more than 3, the first column is supposed to be the movie id, the last column is considered the genre
    # and all the columns in between are joined and considered the title
    data1 = []
    data2 = []
    for i in data_split:
        if len(i)>3:
            data1.append(i) # These are the films with more than one column for the title
        else:
            data2.append(i) # These films are correctly divided 
    for i in data1:
        i[1:-1] = [''.join(i[1:-1])] # Columns between id and genre are joined
    
    data = data2 + data1 # Both lists of movies are merged
    # Finally, the list is sorted by the id of the movies
    indices = []
    for idx in data:
        indices.append(idx[0])
    sorted_indices = sorted(range(len(indices)), key=lambda k: int(indices[k]))
    split_sorted_data = []
    for i in range(0,len(sorted_indices)):
        split_sorted_data.append(data[sorted_indices[i]])
    return split_sorted_data

In [9]:
# Parse the data by commas applying the corresponding function
movies = parseData2(movies_data)
ratings = parseData(ratings_data)

# Divide movies matrix in train and test
movies = sc.parallelize(movies)
moviesTrain, moviesTest = movies.randomSplit([7, 3], seed=0L)

# id's of the movies
MovieTrainId = moviesTrain.map(lambda line: line[0]).collect()
MovieTrainId = [int(float(row)) for row in MovieTrainId]

MovieTestId = moviesTest.map(lambda line: line[0]).collect()
MovieTestId = [int(float(row)) for row in MovieTestId]


In [45]:
print 'User Movies Example: ' + str(userMovies)

User Movies Example:[16, 32, 47, 50, 150, 204, 223, 256, 318, 380, 457, 480, 527, 589, 590, 608, 648, 719, 724, 780, 858, 912, 1061, 1089, 1136, 1198, 1210, 1220, 1222, 1243, 1265, 1270, 1287, 1580, 1721, 2028, 2105, 2161, 2194, 2407, 2571, 2858, 2947, 2959, 3256, 3421, 3578, 4011, 4027, 4033, 4085, 4262, 4306, 4963, 4993, 5349, 5418, 5445, 5952, 6365, 6807, 7153, 8825, 33794, 45950, 48516, 48780, 49272, 54286, 57949]


In [46]:
print 'User Rates Example: ' + str(userRates)

User Rates Example: [4, 4, 4, 4, 3, 0.5, 4, 0.5, 4, 3, 4, 3.5, 4.5, 3.5, 3.5, 3.5, 3.5, 0.5, 3.5, 3.5, 5, 5, 4, 4.5, 5, 4, 4.5, 4, 5, 3.5, 3, 3, 4.5, 3.5, 1.5, 4.5, 1.5, 1.5, 4.5, 2.5, 4.5, 4, 3.5, 5, 5, 4.5, 4, 4.5, 3.5, 3.5, 3.5, 5, 4, 3.5, 4.5, 3, 4, 3.5, 4.5, 4.5, 4, 4.5, 2.5, 4.5, 0.5, 5, 4, 3.5, 4, 0.5]


In [10]:
# Split the matrix depending on the indexes to select
def splitMatrix(originalMatrix, idtoSelect):
    
    """ Select the movies contained in a vector of id's
        Args:
           originalMatrix: complete matrix of data
           idtoSelect: ids of movies to select
        Returns:
            matrix containing only the movies of interest
    """
    originalMatrix = originalMatrix.collect()
    newMatrix = []
    for i in range(0,len(originalMatrix)):
        if int(float(originalMatrix[i][1])) in idtoSelect:
            newMatrix.append(originalMatrix[i])

    return newMatrix


# Divide the original rates matrix into train, validation and test
ratesTrain = splitMatrix(ratings,MovieTrainId)
ratesTest = splitMatrix(ratings,MovieTestId)


In [11]:
# Store genres of train and test movies in a separated variable
MovieId =  MovieTrainId
movieGenre =  [row[2] for row in moviesTrain.collect()]
movieTestGenre =  [row[2] for row in moviesTest.collect()]

# Separate the genres that are joint with a |
movieGenreList =[row.split("|") for row in movieGenre]
movieTestGenreList = [row.split("|") for row in movieTestGenre]

# Create the genre matrix parametrization. (Films x Genre) 

In [14]:
# Complete list of possible movie genres

genreList = ['Action', 'Adventure','Animation','Children','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi', 'Thriller','War','Western']
import numpy

def createGenreMatrix(movieGenreList):
    
  
    genreMatrix = numpy.zeros(shape=(len(movieGenreList),len(genreList)))
    
    for i in range(0,len(movieGenreList)-1):
        #print i
        filmGenre = movieGenreList[i]
        for j in range(0,len(filmGenre)):
            auxGenre = filmGenre[j]
            for k in range(0,len(genreList)-1):
                #print k
                if auxGenre == genreList[k]:
                    genreMatrix[i][k] = 1
                    
    return genreMatrix

In [15]:
# Create matrix of genres for train and test movies
genreMatrix = createGenreMatrix(movieGenreList)
genreTestMatrix = createGenreMatrix(movieTestGenreList)

# Store in a variable the vector of different users in the training data (repeated users appear)
userId = [row[0] for row in ratesTrain]
# Counts the number of different users in the previous vector
numUniqueUsers = int(max(set(userId))) 
# Counts the number of movies for training
numMovies = int(max(MovieId))

In [16]:
# Specify the value of the masses it has two columns representing the two clusters of the model. Column 0 has a "1" if user liked
# the movie and column 1 has a "1" if user did not like the movie

def massInitialSpecification(ratingsPunctuation):
    """ Create the initial matrix of masses for one user, based on rated movies only
        Args:
            ratingsPunctuation: vector of ratings of the user
        Returns:
            two column matrix of masses. The first column corresponds to the 'like' cluster, and has a "1" whenever
            the user liked the movie- The second column corresponds to the 'dislike' cluster, and has a "1" whenever
            the user disliked the movie
    """ 
    
    # The masses matrix is initialized to zeros
    mass = numpy.zeros(shape=(len(ratingsPunctuation),2))
                       
    for i in range(0,len(mass)):
        # It is considered that a user liked a movie if the rate is higher than 3
        if int(ratingsPunctuation[i]) > 3:
            mass[i][0] = 1 # like
            mass[i][1] = 0
        else:
            mass[i][0] = 0
            mass[i][1] = 1 #"dont like"
            # For the rest of the ratings, it remains as 0 "dont like"
        
            
    return mass

In [83]:
print masses[:10]

[[ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 0.  1.]
 [ 1.  0.]
 [ 0.  1.]
 [ 1.  0.]
 [ 0.  1.]]


In [17]:
def convertRatestoIntList(userRates):

    newList = []

    for i in range(0,len(userRates)):
        if int(userRates[i][2]) == 5: # Check the decimal part of the rate
            fractionalPart = str(userRates[i][0])
            decimalPart = str(userRates[i][2])
            number = fractionalPart + '.' + decimalPart
            
            newList.append(float(number))
        else:
            number = int(userRates[i][0])
            newList.append(number)

            
    return newList

In [18]:
def createTotalRates(userRates,MovieId,userMovies):
    """ Given the rates evaluated by the user, create a new vector with all the movies. The evaluated ones have the rate
        given by the user and the rest of the films have a 6
        Args:
            userRates: vector of ratings of the user
            MovieId: complete vector of training movies id's
            userMovies: vector of id's of the movies rated by the user
        Returns:
            a vector containing the rates for all the movies, including those not evaluated by the user
    """
    # The total vector of rates is initialized to 6
    totalUserRates = numpy.zeros(shape=(len(MovieId),1)) + 6
    
    # For each movie, it checks if it is already rated by the user. If it is, the rate is updated to its real value; if not,
    # the rate is left as 6    
    for i in range(0,len(MovieId)):
        if MovieId[i] in userMovies:
            totalUserRates[i] = userRates[userMovies.index(MovieId[i])]

    return totalUserRates

In [87]:
print userTrainRates[:10]

[[ 6.]
 [ 6.]
 [ 6.]
 [ 6.]
 [ 6.]
 [ 6.]
 [ 6.]
 [ 4.]
 [ 6.]
 [ 6.]]


In [19]:
def massSpecification(ratingsPunctuation):
    
    """ Create the matrix of masses for one user, based on all movies
        Args:
            ratingsPunctuation: vector of total rates of the user, obtainde by the 'createTotalRates' function
        Returns:
            two column matrix of masses. The first column corresponds to the 'like' cluster, and has a "1" whenever
            the user liked the movie. The second column corresponds to the 'dislike' cluster, and has a "1" whenever
            the user disliked the movie. When the movie has not been rated by the user, its mass is initialized as 0.5 
            for both clusters.
    """   
    # The masses matrix is initialized to zeros
    mass = numpy.zeros(shape=(len(ratingsPunctuation),2))
                       
    for i in range(0,len(mass)):
        # It is considered that a user liked a movie if the rate is higher than 3
        if ratingsPunctuation[i] > 3 and ratingsPunctuation[i]<6 :
            mass[i][0] = 1 # like
            mass[i][1] = 0
        else:
                mass[i][0] = 0
                mass[i][1] = 1 #"dont like"
                # For the rest of the ratings, it remains as 0 "dont like"
        if ratingsPunctuation[i] == 6:  
            mass[i][0] = 0.5 # like
            mass[i][1] = 0.5
            
    return mass

In [20]:
def getUserMoviesGenre(userMovies,genreMatrix,MovieId):
    """ Create the matrix of genres rated by one user
        Args:
            userMovies: vector of id's of the movies rated by the user
            genreMatrix: complete matrix of genres obtained by the function 'createGenreMatrix'
            MovieId: complete vector of training movies id's  
        Returns:
            List of arrays. Each array corresponds to one of the movies rated by the user.
            It contains 1 and 0 depending if that movie belongsmto that genre or not.
    """    
    userMovieGenre = []
    userMovies = [int(float(aux)) for aux in userMovies]
    
    for i in range(0,len(MovieId)):
        if MovieId[i] in userMovies:
            userMovieGenre.append(genreMatrix[int(i)])
            
            
    return userMovieGenre
        

In [21]:
def getMoviesGenre(genreMatrix,MovieId):

    movieGenre = []
    
    for i in range(0,len(genreMatrix)):
        if i in MovieId:
            movieGenre.append(genreMatrix[int(i)])
            
            
    return movieGenre

In [22]:
def cosine_similarity(v1,v2):
    "compute cosine similarity of v1 to v2: (v1 dot v2)/{||v1||*||v2||)"
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(0,len(v1)):
        x = v1[i]
        y = v2[i]
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    return sumxy/numpy.sqrt(sumxx*sumyy)



In [23]:
def selectCluster(Force):
    """ Determine if the movie corresponds to the 'like' or 'dislike' cluster, based on the force exerted by each cluster 
        center.
        Args:
            Force: matrix of forces exerted by the cluster centers. Obtained by function 'clusterData'
        Returns:
            vector of 0 and 1, which determines whether the film is classified as 'liked' (1) or 'dislike' (0) 
    """    
    cluster = numpy.zeros(shape=(len(Force),1))
    # The force is checked for each film. If the force corresponding to the cluster 'like' is higher than the force 
    # corresponding to the cluser 'dislike', the value is set to 1. When the contrary occurs, the value is set to 0.    
    for i in range(0,len(Force)):
        if Force[i][0] > Force[i][1]:
            cluster[i] = 1
        else:
            cluster[i] = 0
            
    return cluster
    

# MK-means Clustering Algorithm

In [24]:
def MKmeans(masses,userMoviesGenre,variant,userTotalRates): 
    """ Algorithm that takes the labeled data (masses with a 1 or a 0) and initialize the K-means variables
        Args:
            masses: matrix of masses obtained by function 'massSpecification' or 'massInitialSpecification'
            userMoviesGenre: list of arrays of genres obtained by 'getUserMoviesGenre'
            variant: indicates the variant of the algorithm to apply
            userTotalRates: a vector containing the rates for all the movies obtained by the function 'createTotalRates'
        Returns:
            M: the mass of the gravitational center
            Z: the center of the cluster
    """
    Z = []
    # Initialize mass of the gravitational center 
    if variant == 0:
        # Standard Scheme
        M = [sum(x) for x in zip(*masses)]
    else:
        # Variant 1: check if the user has less than 40 labeled movies in the data set
        if len(userTotalRates) < 40:
            M = [1, 1]
        else: 
            M = [sum(x) for x in zip(*masses)]

    for j in range(0,2):
        aux = numpy.zeros(shape=(1,18))     

        for i in range(0,len(masses)-1):
            # Get the vector Xi belonging to the data (genre)
            X = map(float,userMoviesGenre[i])
            X = [int(float(x)) for x in X]
            # Initialize cluster center
            if masses[i][j] == 0:
                aux = aux + numpy.zeros(shape=(1,len(X)))       
            else:
                aux2 = [x*masses[i][j] for x in X]
                aux = aux + aux2
               
        
        Z.append([x / M[j] for x in aux])
           
    return M,Z

In [93]:
print 'Initial Mass of Gravitational Center: ' + str(Minit)

print 'Initial Cluster Centers: ' + str(Zinit)


Initial Mass of Gravitational Center: [39.0, 31.0]
Initial Cluster Centers: [[array([ 0.41025641,  0.25641026,  0.02564103,  0.02564103,  0.17948718,
        0.33333333,  0.        ,  0.43589744,  0.12820513,  0.        ,
        0.        ,  0.02564103,  0.15384615,  0.07692308,  0.12820513,
        0.38461538,  0.07692308,  0.        ])], [array([ 0.41935484,  0.41935484,  0.        ,  0.03225806,  0.38709677,
        0.16129032,  0.03225806,  0.25806452,  0.09677419,  0.        ,
        0.06451613,  0.        ,  0.09677419,  0.09677419,  0.32258065,
        0.38709677,  0.03225806,  0.        ])]]


In [25]:
def biasMechanism(masses,userMovies,bias):
    """ Algorithm that calculates the centers of the two clusters we are considering
        Args:
            masses: matrix of masses obtained by function 'massSpecification' or 'massInitialSpecification'
            userMovies: vector of id's of the movies rated by one user
            bias: value that is going to be added/substracted from the masses
        Returns:
            matrix of masses after applying the bias
    """    # Increase or decrease the masses of the unlabeled data by a certain fixed quantity
    userMovies = [int(value) for value in userMovies]
    for i in range(0,len(masses)):
        if i not in userMovies:
            masses[i][0] = masses[i][0] - bias
            masses[i][1] = masses[i][1] + bias
            
            #masses[i][1] = masses[i][1] + 0.3
    return masses

In [26]:
def recommendMovies(selectedCluster,userTestMovies,MovieTestId,userTestRates):
    """  From all the movies clustered as “like”, it selects the ones that belong to the test set of the user.
        Args:
            selectedCluster: vector obtained by the function 'selectCluster'
            userTestMovies: vector of id's of test movies for one user
            MovieTestId: complete matrix of test movies
            userTestRates: vector of test rates for one user
        Returns:
            list of movies of test clustered as 'like'
    """
    recommendedMovies = []
    recommendationRates = []
        # Get all the recommended Movies -> those classified in cluster 0
    idFilmRecom,b = numpy.where(clusterSelection==0)

    for i in range(0,len(idFilmRecom)):
        if MovieTestId[idFilmRecom[i]] in userTestMovies:
            recommendedMovies.append(MovieTestId[idFilmRecom[i]])
            #recommendationRates.append(userTestRates[idFilmRecom[i]])
    return recommendedMovies

In [27]:
def clusterData(massesTrain,M,Z,userMoviesGenre):
    """  Calculates the force exerted by each cluster over each movie
        Args:
            massesTrain: matrix of masses for one user
            M: the mass of the gravitational center obtained by the function 'MKmeans'
            Z: the center of the cluster obtained by the function 'MKmeans'
            userMoviesGenre: list of arrays of genres obtained by 'getUserMoviesGenre'
        Returns:
            two column matrix of forces. The first column contains the forces exerted by the cluster 'like' over each movie.
            The second one, the forces exerted by the cluster 'dislike'
    """    
    # The matrix of forces is initialized to zeros
    F = numpy.zeros(shape=(len(masses),2))
    # For each mass of each film, the force is calculated
    for j in range(0,2):
        for i in range(0,len(masses)):
            X = map(float,userMoviesGenre[i])
            F[i][j] = (masses[i][j] * M[j]) / cosine_similarity(X,Z[j][0])
    return F

In [28]:
def calculateAccuracy(recomRates,accuracy,numNoSucessRate):
    """  Counter of true positives: counts the number of movies recommended, and rated as liked by the user in the test set
        Args:
            recomRates: vector of rates for the movies in the test set that have been recommended to the user
            accuracy: value of the accuracy in the previous iteration
        Returns:
            number of movies well recommended
    """    
    individualAccuracy = 0;
    for i in range(0,len(recomRates)):
        if recomRates[i] > 2.5:
            accuracy = accuracy + 1
            individualAccuracy = individualAccuracy + 1
    if len(recomRates) == 0: 
        individualAccuracy = 0
    else:
        individualAccuracy = individualAccuracy / len(recomRates)
    if individualAccuracy == 0:
        numNoSucessRate = numNoSuccessRate + 1
    return accuracy,numNoSucessRate

In [29]:
def distanceTestCluster(Z,userTestMoviesGenre):
    """  Calculates the cosine distance from the test movie set to the center of the clusters obtained from the MKmeans
        Args:
            Z: the center of the cluster obtained by the function 'MKmeans'
            userTestMoviesGenre: list of genres of the test movies obtained by the function 'getUserMoviesGenre'
        Returns:
            number of movies well recommended
    """
    distanceCluster = numpy.zeros(shape=(len(userTestMoviesGenre),2))

    for j in range(0,2):

        for i in range(0,len(userTestMoviesGenre)):
            X = map(float,userTestMoviesGenre[i])
            distanceCluster[i][j] = cosine_similarity(X,Z[j][0])
    clusterSelection = selectCluster(distanceCluster)
    return clusterSelection,distanceCluster

In [30]:
def calculateCorrelation(userX,userY,ratesTrain):
    """  Calculates the correlation between users
        Args:
            userX and user Y: users for which the correlation is going to be calculated
            ratesTrain: the matrix of rates for training
        Returns:
            corr: correlation between the users
            commonMovies: vector of id's of movies in common
            positionsX:
            positionsY:
            ratesY:
    """
    # User X
    ratesX = [int(float(row[2])) for row in ratesTrain if int(row[0]) == userX and row[1]]
    moviesX = [int(float(row[1])) for row in ratesTrain if int(row[0]) == userX and row[1]]

    # User Y
    ratesY = [int(float(row[2])) for row in ratesTrain if int(row[0]) == userY and row[1]]
    moviesY = [int(float(row[1])) for row in ratesTrain if int(row[0]) == userY and row[1]]
    commonMovies = []
    positionsX = []
    positionsY = []
    for i in range(0 ,len(moviesX)):
         if moviesX[i] in moviesY:
                positionsX.append(int(i))
                commonMovies.append(moviesX[i])
    for i in range(0,len(moviesY)):
        if moviesY[i] in moviesX:
            positionsY.append(i)

        # Calculate mean
    meanX = numpy.mean(ratesX)
    meanY = numpy.mean(ratesY)


    corr = sum([(ratesX[positionsX[f]]-meanX)*(ratesY[positionsY[f]]-meanY) for f in range(0,len(commonMovies))]) / numpy.sqrt(sum([(ratesX[positionsX[f]]-meanX)**2 for f in range(0,len(commonMovies))])*sum([(ratesY[positionsY[f]]-meanY)**2 for f in range(0,len(commonMovies))]))

    return corr,commonMovies,positionsX,positionsY,ratesY


In [31]:
def collaborativeFiltering(userX,userY,ratesTrain,moviesCounter):
    """  Calculates the correlation between users
        Args:
            userX and user Y: users for which the correlation is going to be calculated
            ratesTrain: the matrix of rates for training
            moviesCounter:
        Returns:
            moviesCounter:
    """
    userCorr,commonMovies,posX,posY,userYrates = calculateCorrelation(userX,userY,ratesTrain)
    for i in range(0,len(commonMovies)):
        if userCorr > 0.1: # high correlation: similar users    
            if userYrates[posY[i]] > 3:
                moviesCounter[posX[i]][0] = moviesCounter[posX[i]][0] + 1
            else:
                moviesCounter[posX[i]][1] = moviesCounter[posX[i]][1] - 1
        else:
            if userCorr < -0.1: # low correlation: different users
                if userYrates[posY[i]] > 3:
                    moviesCounter[posX[i]][0] = moviesCounter[posX[i]][0] - 1
                else:
                    moviesCounter[posX[i]][1] = moviesCounter[posX[i]][0] + 1        
            else: 
                if userYrates[posY[i]] > 3:
                    moviesCounter[posX[i]][0] = moviesCounter[posX[i]][0] + 0.5
                else:
                    moviesCounter[posX[i]][1] = moviesCounter[posX[i]][0] - 0.5      
    return moviesCounter


In [32]:
def variant3Mechanism(rate,massesTrain,userMovies,MovieId):
    """  Clasifies the data applying the third variant: The mass that corresponds to each vector is either 
    increased or decreased depending on whether the respecive movie is proposed by collaborative filtering
        Args:
            rate:
            massesTrain: 
            userMovies:
            MovieId:
        Returns:
            massesTrain:
    """
    pos = 0
    for i in range(0,len(massesTrain)):
        if MovieId[i] in userMovies:
            if rate[pos] > 0.5:
                massesTrain[i][0] = massesTrain[i][0] + 0.1
            else:
                massesTrain[i][1] = massesTrain[i][1] - 0.1 # probar tambien con -0.2
            pos = pos + 1
    return massesTrain

## K-means Algorithm

In [99]:
variantList = [0,1,3]
for aux in range(0,3):
    userNonRec = 0
    accuracy = 0
    numFilms = 0
    usersNoRecom = 0
    variant = variantList[aux]
    numNoSuccessRate = 0
    for user in range(1,numUniqueUsers+1):

        #print 'Working with user number:' + str(user)
        K = 2 # number of clusters {like,dislike}
        # Build a model for each user

        userRates = [row[2] for row in ratesTrain if int(row[0]) == user and row[1]]
        userMovies = [int(float(row[1])) for row in ratesTrain if int(row[0]) == user and row[1]]
        userTestRates = [row[2] for row in ratesTest if int(row[0]) == user and row[1]]
        userTestMovies = [int(float(row[1])) for row in ratesTest if int(row[0]) == user and row[1]]
        userTotalRates = [int(float(row[1])) for row in ratings.collect() if int(row[0]) == user and row[1]]

        trainGenreMatrix = getMoviesGenre(genreMatrix,MovieId)


        ## INITIALIZATION
        # Convert rates to int format
        userRates = convertRatestoIntList(userRates)
        # Specify the value of the masses from the labeled user data
        masses = massInitialSpecification(userRates)
        userMoviesGenre = getUserMoviesGenre(userMovies,genreMatrix,MovieId)
        # Initialize the algorithm (M,Z) with the labeled data of the user
        Minit,Zinit = MKmeans(masses,userMoviesGenre,variant,userTotalRates)

        # Create a vector with the total number of movies in the train set
        userTrainRates = createTotalRates(userRates,MovieId,userMovies)
        # Total vector of masses
        massesTrain = massSpecification(userTrainRates)
        # Cluster all the data
        Finit = clusterData(massesTrain,Minit,Zinit,userMoviesGenre)
        # Select a cluster depending on the Force vector
        firstClusterSelection = selectCluster(Finit)
        # Bias Mechanism: Increase or decrease the value of the masses before the next iteration
        if variant == 0:
            bias = 0.3
            massesTrain = biasMechanism(massesTrain,userMovies,bias)
        if variant == 3:
            moviesCounter = numpy.zeros(shape=(len(userMovies),2))
            for userY in range(1,numUniqueUsers):
                moviesCounter = collaborativeFiltering(user,userY,ratesTrain,moviesCounter)

            rate = []
            for i in range(0,len(moviesCounter)):
                rate.append( moviesCounter[i][0] / (moviesCounter[i][0] + moviesCounter[i][1]))

            massesTrain = variant3Mechanism(rate,massesTrain,userMovies,MovieId)
        ## ITERATION
        for i in range(0,5): # 1000 iterations
            # Recalculate the gravitational masses and centers with all the train movies
            M,Z = MKmeans(massesTrain,genreMatrix,variant,userTotalRates) 

            # Recluster all the data
            Fcurrent = clusterData(massesTrain,M,Z,genreMatrix)
            # Select a cluster depending on the Force vector
            currentClusterSelection = selectCluster(Fcurrent)
            if i == 0:
                if (currentClusterSelection == firstClusterSelection).all(): # keep iterating
                    break
                else:
                    lastClusterSelection = currentClusterSelection
                    currentClusterSelection = [] 
            else:
                # Check if from the previous iteration and the next one, the classification stays the same
                if (currentClusterSelection == lastClusterSelection).all(): # keep iterating
                    break
                else:
                    lastClusterSelection = currentClusterSelection # The classification from this iteration now become the previous classification for the next iteration
                    currentClusterSelection = []       
            # Bias Mechanism: Increase or decrease the value of the masses before the next iteration
            if variant == 0:
                massesTrain = biasMechanism(massesTrain,userMovies,bias)
            if variant == 3:
                massesTrain = variant3Mechanism(rate,massesTrain,userMovies,MovieId)


        ## RECOMMENDATIONS
        userTestMoviesGenre = getUserMoviesGenre(userTestMovies,genreTestMatrix,MovieTestId)
        # Compute the cosine distance between the test movies genre and the center of the cluster -> Select the cluster
        # to which is closer
        clusterSelection,distanceCluster = distanceTestCluster(Z,genreTestMatrix)

        # Create a vector with the total number of movies in the test set
        userTestRates = convertRatestoIntList(userTestRates)

        totalUserTestRates = createTotalRates(userTestRates,MovieTestId,userTestMovies)
        recomMovies = recommendMovies(clusterSelection,userTestMovies,MovieTestId,totalUserTestRates)
        # Count the number of users with no recommendations
        if not recomMovies:
            usersNoRecom = usersNoRecom + 1

        recomRates = []
        [recomRates.append(userTestRates[k]) for k in range(0,len(recomMovies)) if recomMovies[k] in userTestMovies]

        numFilms = numFilms + len(recomMovies)
        accuracy,numNoSuccessRate = calculateAccuracy(recomRates,accuracy,numNoSuccessRate)
        # print 'Recommendations for user' + str(user) + ': ' + str(recomMovies)

    finalAccuracy = (float(accuracy) / numFilms) * 100

    print 'Accuracy for Variant ' + str(variant)+ ' is: '  + str(finalAccuracy) + '%'
    print 'number of users with no recommendations: ' + str(usersNoRecom)
    #print 'number of users with 0% success rate: ' + str(numNoSuccessRate)



Accuracy for Variant 0 is: 76.9674711438%
number of users with no recommendations: 0
Accuracy for Variant 1 is: 74.3142144638%




number of users with no recommendations: 2
Accuracy for Variant 3 is: 74.500665779%
number of users with no recommendations: 2


