In [2]:
# IMPORTS AND SETTINGS

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import euclidean_distances
from tqdm._tqdm_notebook import tqdm_notebook
%config IPCompleter.greedy=True
%config Completer.use_jedi = False
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [3]:
# to make display(...) show all rows of a dataframe
pd.set_option('display.max_rows', None)

In [4]:
pd.reset_option('display.max_rows')

In [101]:
# FUNCTION DEFS

# extract string movieId from movieclipId (without leading zeros)
def str_movie_id(movieclip_id):
    return str(movie_id(movieclip_id))

# extract int movieId from movieclipId
def movie_id(movieclip_id):
    return int(movieclip_id.split('_')[0])

# calculate precision and tag coverage for the results of a single query
def rate(qresults, ks):
    qmovie_id = qresults.movieId.head(1)    
    rating = pd.DataFrame(index=[qmovie_id])
    for k in ks:
        rmovie_ids = [movie_id(y) for y in qresults.movieclipId.head(k)]
        # rate relevance    
        qgenres = genres.loc[qmovie_id]
        rgenres = genres.loc[rmovie_ids]
        rjscore = rgenres.apply(lambda row: jaccard_score(qgenres.values[0], row), axis=1)
        prec = len(rjscore[rjscore > 0.5]) / len(rjscore)
        rating['P@' + str(k)]= prec
        # rate diversity
        qtags = tags.loc[qmovie_id].iloc[0]
        qtags = qtags[qtags > 0].index
        rtags = tag_occs.loc[rmovie_ids].agg('sum')
        rtags = rtags[rtags > 0]
        urtags = rtags.index
        rtags = np.repeat(rtags.index, rtags.values)
        abs_tag_cov = len(urtags)
        if (len(qtags)!= 0): #sometimes len(qtags) = 0 and the function fails
            #TODO review if (I don't if it's correct)
            rel_tag_cov = len(urtags) / len(qtags) 
        else:
            rel_tag_cov = 0
        rating['AbsTagCov@' + str(k)] = abs_tag_cov
        rating['RelTagCov@' + str(k)] = rel_tag_cov
        entropy = entropy_label_distribution(rtags) # TODO rtags should not be unique!
        rating['Entropy@' + str(k)] = entropy
    return rating

# Compute entropy of label distribution
def entropy_label_distribution(labels):
    n_labels = len(labels)
    
    if n_labels <= 1:
        return 0

    value, counts = np.unique(labels, return_counts=True)
    probs = counts / np.float32(n_labels)
    n_classes = np.count_nonzero(probs)

    if n_classes <= 1:
        return 0.0

    # Compute entropy
    ent = 0.0

    for p in probs:
        ent -= p * np.log(p)

    return ent

#computes jaccard similarity between genders
def jaccard_similarity(list1, list2):
    s1 = set(list1)
    s2 = set(list2)
    return len(s1.intersection(s2)) / len(s1.union(s2))

# returns the genre of the element in i position on the dataframe r1
def get_genre_of_filmsclip (i, r1):
    candidate = r1.iloc[i] # get the element
    aux = r1[r1['dist']==candidate.dist].index.values.astype(str) #obtain an array with index of the element
    movieIdaux = aux[0] #get the first item of the array
    movieIdaux = movieIdaux[0:9] #get the part we are interested in (the movie id)
    int(movieIdaux) # convert form str to integer
    genre_movie_aux = genres.loc[int(movieIdaux)] #get gender
    return genre_movie_aux

In [58]:
# RESULTS FILE EVALUATION 

results = pd.read_csv('results.csv')    
results_per_query = results.groupby('movieId')
tqdm_notebook.pandas(desc='rating each query result...')
# TODO this is super slow...
ratings = results_per_query.progress_apply(lambda x: rate(x, [5, 10, 30, 50]))
ratings.index = ratings.index.get_level_values(0)
agg_ratings = ratings.agg(['mean', 'median', 'min', 'max'])
display(agg_ratings)

HBox(children=(FloatProgress(value=0.0, description='rating each query result...', max=637.0, style=ProgressSt…




Unnamed: 0,P@5,AbsTagCov@5,RelTagCov@5,Entropy@5,P@10,AbsTagCov@10,RelTagCov@10,Entropy@10,P@30,AbsTagCov@30,RelTagCov@30,Entropy@30,P@50,AbsTagCov@50,RelTagCov@50,Entropy@50
mean,0.454003,113.514914,8.284021,4.472338,0.389325,205.651491,16.362307,5.077554,0.244532,530.133438,46.280857,5.995674,0.195542,776.098901,70.022916,6.32779
median,0.4,106.0,3.769231,4.611382,0.4,201.0,7.357143,5.160461,0.233333,524.0,20.545455,6.023773,0.18,776.0,29.826087,6.353841
min,0.0,1.0,0.0,0.0,0.0,20.0,0.0,2.697653,0.0,197.0,0.0,5.086608,0.02,398.0,0.0,5.678893
max,1.0,355.0,152.0,5.814003,0.9,490.0,320.0,6.067454,0.666667,905.0,667.0,6.554224,0.48,1205.0,912.0,6.720955


In [82]:
# DATA PARSING AND PREPARATION

# parse data
# We need to use the testset, not the devset of data
movies = pd.read_csv('MMSR_dataset_2019/testset_movies.csv', index_col='movieId')
clips = pd.read_csv('MMSR_dataset_2019/testset_ids.csv', index_col='movieId')
tags = pd.read_csv('MMSR_dataset_2019/features/Metadata/devset_TagFeatures.csv', index_col='movieId')
genres = pd.read_csv('MMSR_dataset_2019/features/Metadata/devset_GenreFeatures.csv', index_col='movieId')
# TODO use all BLFs instead of just SPECTRAL (I don't have enough RAM for that...) -> DONE, I don't have any 
# problem with the RAM

#blfs = pd.read_csv('MMSR_dataset_2019/features/Audio/Block level features/Component6/BLF_SPECTRAL_fullId.csv', header=None, index_col=0)
blfs = pd.read_csv('MMSR_dataset_2019/features/Audio/Block level features/All/BLF_all_fullId.csv', header=None, index_col=0)

alexnet = pd.read_csv('MMSR_dataset_2019/features/Visual/Deep AlexNetFc7/Avg/AlexNetFeatures - AVG - fc7.csv', header=None, index_col=0)

# fix missing or superfluous movies or clips in features
tags = movies.join(tags).drop('title', axis=1).fillna(0)
genres = movies.join(genres).drop('title', axis=1).fillna(0)
blfs = clips.join(blfs, on='movieclipId').set_index('movieclipId').fillna(0)
alexnet = clips.join(alexnet, on='movieclipId').set_index('movieclipId').fillna(0)

# normalize and reduce dimensionality...
scaler = MinMaxScaler()
pca = PCA(n_components=0.9)
red_tags = pd.DataFrame(pca.fit_transform(scaler.fit_transform(tags)), index=tags.index)
blfs = pd.DataFrame(pca.fit_transform(scaler.fit_transform(blfs)), index=blfs.index)
alexnet = pd.DataFrame(pca.fit_transform(scaler.fit_transform(alexnet)), index=alexnet.index)

# precompute tag occurrences for calculating tag coverage
tag_occs = tags > 0

  explained_variance_ratio_ = explained_variance_ / total_var


In [104]:
# DECENT? SOLUTION USING BLFS AND ALEXNET 

results = pd.DataFrame(columns=['movieId', 'movieclipId', 'sim', 'rank'])
results.set_index(['movieId', 'movieclipId'])

# 1) compute single feature vector for metadata, audio and visual features for (all clips of) a movie
#qtags = tags.groupby(lambda x: movie_id(x)).mean()
qblfs = blfs.groupby(lambda x: movie_id(x)).mean()
qalexnet = alexnet.groupby(lambda x: movie_id(x)).mean()

for movieId in tqdm_notebook(movies.index):
    # 2) rank clips by computing distance to aggregated feature vectors of this movie
    ranked_clips_audio = pd.DataFrame(euclidean_distances(blfs.values, qblfs.loc[movieId].values.reshape(1, -1)), 
                                        index=clips.movieclipId, 
                                       columns=['dist']).sort_values('dist')
    ranked_clips_visual = pd.DataFrame(euclidean_distances(alexnet.values, qalexnet.loc[movieId].values.reshape(1, -1)),
                                        index=clips.movieclipId, 
                                        columns=['dist']).sort_values('dist')
    
    genre_movie_query= genres.loc[movieId] # genre of the query movie
    
    # 3) take most similar clips (alternate between audio and visual features) but at most 3 of the same movie
    qresult = pd.DataFrame(columns=['movieclipId', 'sim']).set_index('movieclipId')
    movie_counter = pd.DataFrame(np.zeros(len(movies)), index=movies.index, columns=['counter'])
    while len(qresult) < 100:
        source = ranked_clips_audio if len(qresult) % 2 == 0 else ranked_clips_visual
        for i in range(0, len(source)):
            
            rcandidate = source.iloc[i]
          
        #JACCARD COEFFICIENT (doesn't work because the genres of testset are missing)
            l1=get_genre_of_filmsclip(i, source)
            if  jaccard_similarity(l1,genre_movie_query) >= 0.5:
                is_jaccard = True
            else:
                is_jaccard = False
            
            is_new_clip = not rcandidate.name in qresult.index
            is_movie_allowed = movie_counter.loc[movie_id(rcandidate.name)].counter < 3
            if is_new_clip & is_movie_allowed & is_jaccard:
                # didn't work because it was written movie_allowed instead of is_movie_allowed
                if len(qresult) % 2 == 0:
                    ranked_clips_audio = source.iloc[i:] # deletes the element we have just worked with
                else:
                    ranked_clips_visual = source.iloc[i:] #deletes the element we have just worked with
                qresult.loc[rcandidate.name] = rcandidate.dist
                break
    
    # fixup ranks and movieId
    
    
   
    qresult['rank'] = np.arange(100)
    qresult['movieId'] = np.full(100, movieId)
    # transform euclidean distance into similarity measure 
    # instead of using the cosine, we use 1/(1+x) to have measures between 0 and 1
    qresult['sim'] = qresult['sim'].map(lambda x: 1 / (1 + x))
    results = pd.concat([results, qresult], sort=False)
    
results.reset_index(inplace=True)



results.columns = ['movieclipId', 'movieId','','sim', 'rank'] 
# added one more column to results because it needs to have five columns, not four (don't exactly know why we
# need it)
results = results.astype({'movieId':int})
results = results.astype({'movieclipId':str})


results.to_csv('results2.csv', columns=['movieId', 'movieclipId', 'sim', 'rank'], index=False)
# result with 15900 lines (+ header) that we are asked for


HBox(children=(FloatProgress(value=0.0, max=159.0), HTML(value='')))




In [14]:
# BAD SOLUTION USING ONLY METADATA

# compute cosine similarities between tag tf_idf vectors
tags_cos_sim = pd.DataFrame(cosine_similarity(tags.values),
                            columns=tags.index.values, 
                            index=tags.index)

# generate results file
results = pd.DataFrame(columns=['movieId', 'movieclipId', 'sim', 'rank'])

for movieId in tqdm_notebook(movies.index):
    sim = tags_cos_sim[[movieId]]
    sim = sim.reset_index()
    sim = sim.rename(columns={movieId : 'sim', 'movieId': 'otherMovieId'})
    sim = sim.join(clips, on='otherMovieId', how='outer')
    sim = sim.reset_index(drop=True)
    sim = sim.drop('otherMovieId', axis=1)
    sim = sim.sort_values('sim', ascending=False)
    sim = sim.head(100)
    sim['rank'] = np.arange(100)
    sim['movieId'] = np.full(100, movieId)
    results = pd.concat([results, sim], ignore_index=True)
results.to_csv('results.csv', columns=['movieId', 'movieclipId', 'sim', 'rank'], index=False)



HBox(children=(FloatProgress(value=0.0, max=159.0), HTML(value='')))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.







AttributeError: 'DataFrame' object has no attribute 'csv'