# Lab 3 — dimensionality reduction

In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np

from operator import itemgetter

%matplotlib inline
plt.style.use("ggplot")

In [None]:
# Show the last two lines of the genome-tags file stored on HDFS.
!hdfs dfs -cat /ix/ml-20m/genome-tags.txt | tail -n 2

In [None]:
# Load the three datasets; every line of each file is parsed as one JSON record.
tags = sc.textFile("/ix/ml-20m/genome-tags.txt").map(json.loads)
movies = sc.textFile("/ix/ml-20m/movies.txt").map(json.loads)
scores = sc.textFile("/ix/ml-20m/genome-scores.txt").map(json.loads)

# Lookup table built from the (tagId, tag) pair of every tag record.
tag2name = dict(tags.map(itemgetter("tagId", "tag")).collect())

# Distinct ids of the movies that appear in the scores file, i.e. movies with tags.
movieId_withTags = scores.map(itemgetter('movieId')).distinct()

# Peek at the first record of each dataset to see its fields.
for rdd in (tags, movies, scores):
    print(rdd.take(1))

# Dataset sizes, reused by the following cells to shape the data matrix.
nb_tags = tags.count()
nb_movies = movies.count()
nb_movieId_withTags = movieId_withTags.count()
nb_scores = scores.count()

for label, value in (
    ('number tags: ', nb_tags),
    ('number movies: ', nb_movies),
    ('number movies with tags: ', nb_movieId_withTags),
    ('number scores :', nb_scores),
):
    print(label, value)

In [None]:
# Build the data matrix: one row per tag and one column per movie that has tags.
relevance = scores.map(itemgetter('relevance'))
# order='F' fills the matrix column by column, so each consecutive run of
# nb_tags collected values becomes one column.
# NOTE(review): this assumes the scores are stored movie by movie, with the tags
# in the same order for every movie -- confirm against the file layout.
flat_relevance = np.array(relevance.collect())
matrix = np.reshape(flat_relevance, (nb_tags, nb_movieId_withTags), order='F')

In [None]:
# Variance of every tag (row of the matrix), taken across all movies.
tagsVar = matrix.var(axis=1)

# Draw the per-tag variance on an explicit axes object.
fig, ax = plt.subplots()
ax.plot(tagsVar)
ax.set_xlabel('Tags')
ax.set_ylabel('Variance')
ax.set_title("Variance of each tag")

In [None]:
# PCA, step 1: centre every tag (row) on its mean across movies.
# keepdims=True keeps the means as a column vector that broadcasts against the
# matrix, so the number of tags no longer has to be hard-coded.
matrixCentered = matrix - matrix.mean(axis=1, keepdims=True)

# PCA, step 2: covariance matrix.
# NOTE(review): with rowvar=False, np.cov treats every column (movie) as a
# variable and every row (tag) as an observation, so covMatrix is
# movies x movies and np.cov re-centres each column itself. The following
# cells rely on eigenvectors of that length; confirm this is the intended
# orientation before changing it.
covMatrix = np.cov(matrixCentered, rowvar=False)

# PCA, step 3: eigen-decomposition of the symmetric covariance matrix.
# np.linalg.eigh returns the eigenvalues in ascending order.
eigenvalues, eigenvectors = np.linalg.eigh(covMatrix)

# Plot the spectrum (ascending, as returned by eigh).
plt.plot(eigenvalues)
plt.xlabel('Eigenvalue index (ascending order)')
plt.ylabel('Eigenvalue')
plt.title('Eigenvalues of the covariance matrix')

In [None]:
# Number of principal directions needed to capture 2/3 of the variability.
# eigh returns eigenvalues in ascending order, so flip them to accumulate the
# largest ones first.
explainedVariance = np.cumsum(eigenvalues[::-1])
varianceThreshold = 2/3 * np.sum(eigenvalues)
# argmax gives the 0-based position of the first cumulative sum that reaches
# the threshold; the number of directions is that position plus one.
nb_principal_direction = int(np.argmax(explainedVariance >= varianceThreshold)) + 1
print(nb_principal_direction)

Exercise 3.1)

We can see that the variance differs widely from one tag to another, which makes the plot look noisy. Moreover, very few directions are needed to capture a large majority of the variability in the data. Dimensionality reduction should therefore be useful here and give meaningful results.

In [None]:
from scipy import spatial

# Positions of the five largest eigenvalues, largest first.
idxMaxEval = np.flip(np.argsort(eigenvalues))[:5]
# The matching eigenvectors (one per column) are the five principal directions.
principalDirections = eigenvectors[:, idxMaxEval]

# Tag names as an array, in the order the tag records are collected.
tagsName = np.asarray(tags.map(itemgetter('tag')).collect())

def distance(direction):
    """Return the cosine distance between one principal direction and every tag.

    `direction` is the column index into `principalDirections`. The result is
    an array of `nb_tags` values, entry i being the cosine distance between
    that direction and row i of `matrix`.
    """
    axis = principalDirections[:, direction]
    return np.fromiter(
        (spatial.distance.cosine(axis, matrix[row, :]) for row in range(nb_tags)),
        dtype=float,
        count=nb_tags,
    )

def tenHighAndLow(direc, dist):
    """Print the ten tags closest to and the ten tags farthest from a direction.

    `direc` is the label printed in the header; `dist` holds one distance per
    tag. Tags with the smallest distance are reported as having the highest
    coordinates, tags with the largest distance as having the lowest ones.
    """
    ascending = np.argsort(dist)
    # Smallest distances first: the tags closest to the direction.
    closest = ascending[:10]
    # Largest distances, farthest tag first.
    farthest = ascending[-10:][::-1]
    print(direc, ' direction \n------------------')
    print('Tags with highest coordinates:\n', tagsName[closest])
    print('Tags with the lowest coordinates:\n', tagsName[farthest])
    print('\n')

# Report the extreme tags for each of the five principal directions in turn.
for rank, label in enumerate(('First', 'Second', 'Third', 'Fourth', 'Fifth')):
    tenHighAndLow(label, distance(rank))

Exercise 3.2)

We can see that, for each direction, the tags with the highest (and, respectively, the lowest) coordinates are very close to one another in meaning and tend to describe movie genres.

For example, these tags could describe drama movies: 'depressing' 'bleak' 'downbeat' 'golden palm' 'grim' 'character study' 'melancholy' 'tragedy' 'understated' 'poignant'

In [None]:
import pickle

# Coordinates of every tag along the 5 principal directions (one row per tag).
newCoordinates = matrix @ principalDirections

# Map every tag name to the tuple of its coordinates.
tagsDict = {tagsName[i]: tuple(newCoordinates[i, :]) for i in range(nb_tags)}

# Save the mapping to disk.
with open("tagsCoordinates.pickle", "wb") as out_file:
    pickle.dump(tagsDict, out_file)

In [None]:
# Read the selected movies back from disk.
# NOTE: unpickling can execute arbitrary code; only load this file if it comes
# from a trusted source.
with open("selected-movies.pickle", "rb") as movies_file:
    selectMovies = pickle.load(movies_file, encoding="utf-8")

In [None]:
# Collect, for every selected movie, the relevance of each tag.
# Sorted ids of all the movies that have tags.
# NOTE(review): the position of an id in this sorted array is used below as a
# column index into `matrix`, which assumes the matrix columns follow ascending
# movieId order -- confirm.
movieId_array = np.sort(np.array(movieId_withTags.collect()))
# One column per selected movie, one row per tag.
relevanceSelecMovies = np.zeros([nb_tags, len(selectMovies)])

for col, movie in enumerate(selectMovies):
    # movie[0] is the movie id; take the first position where it occurs.
    matches = np.flatnonzero(movieId_array == movie[0])
    relevanceSelecMovies[:, col] = matrix[:, matches[0]]

In [None]:
# Project the selected movies onto the first two principal directions.
# The first two columns of newCoordinates are used as the projection axes.
twoMainDir = newCoordinates[:, :2]
first_axis, second_axis = twoMainDir[:, 0], twoMainDir[:, 1]

# One record per selected movie:
# (field 0, field 1, field 2 scaled by 255 and truncated to int, x, y).
# The plotting cell uses these as id, title and colour index respectively.
coorSelectMovies = [
    (
        info[0],
        info[1],
        int(info[2] * 255),
        np.dot(tag_relevance, first_axis),
        np.dot(tag_relevance, second_axis),
    )
    for info, tag_relevance in zip(selectMovies, relevanceSelecMovies.T)
]

In [None]:
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import HoverTool, ResetTool, PanTool, WheelZoomTool, SaveTool
output_notebook()

In [None]:
from bokeh.palettes import Viridis256

# Split the per-movie records into the columns the plot needs.
# The "score" column holds the Viridis colour picked by the movie's colour
# index, so the dots are coloured by their Rotten Tomatoes score.
plot_columns = {"id": [], "movie": [], "score": [], "x": [], "y": []}
for movie_id, title, shade, x_coord, y_coord in coorSelectMovies:
    plot_columns["id"].append(movie_id)
    plot_columns["movie"].append(title)
    plot_columns["score"].append(Viridis256[shade])
    plot_columns["x"].append(x_coord)
    plot_columns["y"].append(y_coord)

source = ColumnDataSource(data=plot_columns)

# Show the movie title when hovering over a dot.
hover = HoverTool(tooltips=[("Movie", "@movie")])
tools = [hover, ResetTool(), PanTool(), WheelZoomTool(), SaveTool()]

p = figure(plot_width=960, plot_height=360, tools=tools, title="Mouse over the dots")
p.circle("x", "y", source=source, size=20, color="score", alpha=0.5)
show(p, notebook_handle=True)

Exercise 3.3)

We can see that the movies are broadly grouped by genre. For example, at the bottom of the plot we find horror movies (Videodrome, Ichi the Killer, Stage Fright...). At the top are the comedy-dramas (Billy Elliot, Pride, Life Is Beautiful...). Dramas are on the left, and fantasy/science-fiction movies are in the middle.

Moreover, we can see that movies are also more or less grouped by their Rotten Tomatoes scores (neighbouring dots have similar shades). This suggests that the PCA directions are correlated with the RT scores, because movies of the same genre tend to receive similar feedback from the audience.