# Lab 3 — dimensionality reduction

In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np

from operator import itemgetter

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the "ggplot" style sheet to the figures drawn in this notebook.
plt.style.use("ggplot")

In [None]:
# Shell command: print the last two lines of the tag file stored on HDFS.
!hdfs dfs -cat /ix/ml-20m/genome-tags.txt | tail -n 2

In [None]:
# Each input file holds one JSON document per line; parse every line lazily.
tags, movies, scores = (
    sc.textFile(path).map(json.loads)
    for path in (
        "/ix/ml-20m/genome-tags.txt",
        "/ix/ml-20m/movies.txt",
        "/ix/ml-20m/genome-scores.txt",
    )
)

# Lookup table built from the (tagId, tag) pairs of the tag records.
tag2name = dict(tags.map(itemgetter("tagId", "tag")).collect())

# Distinct movie ids that appear in the score records.
movieId_withTags = scores.map(itemgetter('movieId')).distinct()

# Show one sample record of each dataset.
for rdd in (tags, movies, scores):
    print(rdd.take(1))

# Dataset sizes, counted in the same order as they are reported below.
nb_tags, nb_movies, nb_movieId_withTags, nb_scores = (
    rdd.count() for rdd in (tags, movies, movieId_withTags, scores)
)

for label, total in (
    ('number tags: ', nb_tags),
    ('number movies: ', nb_movies),
    ('number movies with tags: ', nb_movieId_withTags),
    ('number scores :', nb_scores),
):
    print(label, total)

In [None]:
# Pull the relevance value out of every score record.
relevance = scores.map(itemgetter('relevance'))

# Lay the flat list out as (nb_movieId_withTags, nb_tags) and transpose, so
# the result has one row per tag and one column per movie.
# NOTE(review): assumes the score records arrive grouped movie by movie, with
# the tags in the same order inside every group — TODO confirm against the file.
flatRelevance = np.array(relevance.collect())
matrix = flatRelevance.reshape((nb_movieId_withTags, nb_tags)).T

In [None]:
# Display the relevance matrix (NumPy abbreviates large arrays when printing).
print(matrix)

In [None]:
# Variance along each row of the matrix, i.e. one value per tag.
tagsVar = matrix.var(axis=1)

# Sorted copy of the variances (ascending).
tagsVarSort = tagsVar.copy()
tagsVarSort.sort()

# Plot the variances in row order.
plt.plot(tagsVar)
plt.title("Variance of each tag")
plt.xlabel('Tags')
plt.ylabel('Variance')

In [None]:
# Centre every row of the matrix on its own mean. keepdims=True keeps the row
# means as a column vector, so the subtraction broadcasts for any number of
# rows instead of relying on a hard-coded row count.
matrixCentered = matrix - matrix.mean(axis=1, keepdims=True)

# rowvar=False makes np.cov treat each *column* as a variable, so the result
# is square in the number of columns of the matrix.
# NOTE(review): later cells pair these eigenvectors with rows of `matrix`, so
# they rely on this orientation — confirm it is the intended one.
covMatrix = np.cov(matrixCentered, rowvar=False)

# eigh targets symmetric matrices and returns the eigenvalues in ascending
# order, with the matching eigenvectors stored as columns.
eigenvalues, eigenvectors = np.linalg.eigh(covMatrix)

plt.plot(eigenvalues)
plt.title('Eigenvalues of the covariance matrix')

In [None]:
# The eigenvalues are stored in ascending order; reverse them so the running
# sum adds the largest ones first.
explainedVariance = np.cumsum(eigenvalues[::-1])
varianceTarget = 2 / 3 * np.sum(eigenvalues)

# argmax returns the 0-based position of the first running sum that reaches
# the target. Position k means k + 1 eigenvalues were summed, so add 1 to get
# the number of principal directions (the previous code reported k).
nb_principal_direction = np.argmax(explainedVariance >= varianceTarget) + 1
print(nb_principal_direction)

In [None]:
from scipy import spatial

# Indices of the five largest eigenvalues, largest first.
idxMaxEval = np.argsort(eigenvalues)[:-6:-1]

# Matching eigenvectors, one per column.
principalDirections = eigenvectors[:, idxMaxEval]

# Tag names as a NumPy array so they can be indexed with index arrays.
tagsName = np.asarray(tags.map(itemgetter('tag')).collect())

def distance(direction):
    """Return the cosine distance from one principal direction to every tag row.

    Args:
        direction: column index into ``principalDirections``.

    Returns:
        A float array of length ``nb_tags``; entry ``i`` is the cosine
        distance between the chosen direction and ``matrix[i, :]``.
    """
    axis = principalDirections[:, direction]
    per_tag = (
        spatial.distance.cosine(axis, matrix[row, :]) for row in range(nb_tags)
    )
    return np.fromiter(per_tag, dtype=float, count=nb_tags)

def tenHighAndLow(direc, dist):
    """Print the ten tags at each extreme of ``dist`` for one direction.

    Args:
        direc: label printed in the header (e.g. ``'First'``).
        dist: one distance per tag, aligned with ``tagsName``.

    The tags with the smallest distances are reported as having the highest
    coordinates, and the tags with the largest distances as the lowest.
    """
    ascending = np.argsort(dist)
    # Ten smallest distances, smallest first.
    closest = ascending[:10]
    # Ten largest distances, largest first.
    farthest = ascending[::-1][:10]

    print(direc, ' direction \n------------------')
    print('Tags with highest coordinates:\n', tagsName[closest])
    print('Tags with the lowest coordinates:\n', tagsName[farthest])
    print('\n')

tenHighAndLow('First', distance(0))
tenHighAndLow('Second', distance(1))
tenHighAndLow('Third', distance(2))
tenHighAndLow('Fourth', distance(3))
tenHighAndLow('Fifth', distance(4))

In [None]:
import pickle

# Project every row of the matrix onto the selected principal directions.
newCoordinates = matrix.dot(principalDirections)

# Map each tag name to its tuple of projected coordinates.
tagsDict = {
    name: tuple(coords) for name, coords in zip(tagsName, newCoordinates)
}

# Persist the mapping to a pickle file.
with open("tagsCoordinates.pickle", "wb") as f:
    pickle.dump(tagsDict, f)

In [None]:
# Load the list of selected movies from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
with open("selected-movies.pickle", "rb") as f:
    selectMovies = pickle.load(f, encoding="utf-8")
# Peek at the first three entries to check the record layout.
print(selectMovies[:3])

In [None]:
# Sorted ids of the movies that have scores.
# NOTE(review): the lookup below assumes column j of the matrix belongs to the
# j-th smallest movie id — TODO confirm.
movieId_array = np.array(movieId_withTags.collect())
movieId_array.sort()

# One column of centred relevance values per selected movie.
relevanceSelecMovies = np.zeros((nb_tags, len(selectMovies)))

for col, entry in enumerate(selectMovies):
    # entry[0] is the movie id; find which matrix column holds it.
    position = np.flatnonzero(movieId_array == entry[0])[0]
    relevanceSelecMovies[:, col] = matrixCentered[:, position]

print(relevanceSelecMovies.shape)

In [None]:
# Keep only the first two columns of the projected coordinates.
twoMainDir = newCoordinates[:, :2]

# One record per selected movie: (field 0, field 1, field 2 scaled by 255 and
# truncated to an int, first coordinate, second coordinate).
coorSelectMovies = [
    (
        info[0],
        info[1],
        int(info[2] * 255),
        np.dot(profile, twoMainDir[:, 0]),
        np.dot(profile, twoMainDir[:, 1]),
    )
    for info, profile in zip(selectMovies, relevanceSelecMovies.T)
]

In [None]:
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import HoverTool, ResetTool, PanTool, WheelZoomTool, SaveTool
# Configure Bokeh to display its plots inline in the notebook.
output_notebook()

In [None]:
from bokeh.palettes import Viridis256

# Split the per-movie records into one list per plot column.
columnNames = ("id", "movie", "score", "x", "y")
plotData = {
    name: [record[pos] for record in coorSelectMovies]
    for pos, name in enumerate(columnNames)
}
# Turn the integer colour index of each movie into a Viridis colour.
plotData["score"] = [Viridis256[level] for level in plotData["score"]]
source = ColumnDataSource(data=plotData)

# Tooltip showing the movie column when the pointer is over a dot.
hover = HoverTool(tooltips=[("Movie", "@movie")])
tools = [hover, ResetTool(), PanTool(), WheelZoomTool(), SaveTool()]

p = figure(
    plot_width=960,
    plot_height=360,
    tools=tools,
    title="Mouse over the dots",
)
p.circle("x", "y", source=source, size=20, color="score", alpha=0.5)
show(p, notebook_handle=True)