In [1]:
# IMPORTS AND SETTINGS

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import euclidean_distances
from tqdm._tqdm_notebook import tqdm_notebook
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
%config IPCompleter.greedy=True
%config Completer.use_jedi = False
%load_ext line_profiler

In [None]:
# Make display(...) show all rows of a dataframe by lifting the
# 'display.max_rows' limit; the following cell resets the option again.
pd.set_option('display.max_rows', None)

In [None]:
# Restore the pandas default for 'display.max_rows' (undoes the cell above).
pd.reset_option('display.max_rows')

In [27]:
# FUNCTION DEFS

def str_movie_id(movieclip_id):
    """Return the movieId part of a movieclipId as a string without leading zeros."""
    # Going through the integer form is what strips the leading zeros.
    numeric_id = movie_id(movieclip_id)
    return str(numeric_id)

def movie_id(movieclip_id):
    """Return the movieId part of a movieclipId as an int.

    The movieId is everything in front of the first underscore of the
    clip id (the whole string when there is no underscore).
    """
    prefix, _, _ = movieclip_id.partition('_')
    return int(prefix)

def rate(qresults, ks):
    """Calculate precision, tag coverage and tag entropy for the results of a single query.

    Reads the module-level frames ``genres``, ``tags`` and ``tag_occs``
    (all indexed by movieId), so those must be defined before calling.

    Parameters
    ----------
    qresults : pandas.DataFrame
        Result rows of one query; must provide the columns ``movieId`` and
        ``movieclipId``. The first ``k`` rows are treated as the top-k
        results (assumes the rows are ordered by rank -- TODO confirm).
    ks : iterable of int
        Cut-offs at which the metrics are evaluated.

    Returns
    -------
    pandas.DataFrame
        A single row (indexed by the query movieId) holding, for every ``k``:
        ``P@k``         share of retrieved clips whose movie has a genre
                        Jaccard score > 0.5 with the query movie,
        ``AbsTagCov@k`` number of distinct tags occurring in the retrieved movies,
        ``RelTagCov@k`` ``AbsTagCov@k`` divided by the number of query tags
                        (NaN when the query movie has no tags),
        ``Entropy@k``   entropy of the tag occurrences over the retrieved movies.
    """
    qmovie_id = qresults.movieId.head(1)
    rating = pd.DataFrame(index=[qmovie_id])

    # Everything about the query movie is independent of k, so look it up once
    # instead of once per cut-off.
    qgenre_vector = genres.loc[qmovie_id].values[0]
    qtag_weights = tags.loc[qmovie_id].iloc[0]
    qtags = qtag_weights[qtag_weights > 0].index

    for k in ks:
        rmovie_ids = [movie_id(y) for y in qresults.movieclipId.head(k)]

        # rate relevance: genre overlap between the query and each retrieved movie
        rgenres = genres.loc[rmovie_ids]
        rjscore = rgenres.apply(lambda row: jaccard_score(qgenre_vector, row), axis=1)
        prec = len(rjscore[rjscore > 0.5]) / len(rjscore)
        rating['P@' + str(k)] = prec

        # rate diversity: how many (distinct) tags the retrieved movies cover
        rtags = tag_occs.loc[rmovie_ids].agg('sum')
        rtags = rtags[rtags > 0]
        urtags = rtags.index
        # one entry per tag occurrence, i.e. the label sample the entropy is taken over
        rtags = np.repeat(rtags.index, rtags.values)
        abs_tag_cov = len(urtags)
        rel_tag_cov = (len(urtags) / len(qtags)) if len(qtags) > 0 else np.nan
        rating['AbsTagCov@' + str(k)] = abs_tag_cov
        rating['RelTagCov@' + str(k)] = rel_tag_cov
        rating['Entropy@' + str(k)] = entropy_label_distribution(rtags)
    return rating

def entropy_label_distribution(labels):
    """Compute the entropy (natural logarithm) of a label distribution.

    Parameters
    ----------
    labels : sequence
        The observed labels, one entry per occurrence.

    Returns
    -------
    float
        ``-sum(p * log(p))`` over the relative frequencies ``p`` of the
        distinct labels. ``0.0`` when there is at most one label or only a
        single distinct label.
    """
    n_labels = len(labels)
    if n_labels <= 1:
        return 0.0

    _, counts = np.unique(labels, return_counts=True)
    # A single distinct label carries no uncertainty.
    if counts.size <= 1:
        return 0.0

    # Every count is >= 1, hence every probability is > 0 and log() is finite.
    probs = counts / n_labels
    return float(-np.sum(probs * np.log(probs)))

In [28]:
# RESULTS FILE EVALUATION 

# NOTE: rate() reads the module-level `genres`, `tags` and `tag_occs` frames that
# are created in the DATA PARSING AND PREPARATION cell, so that cell has to be
# executed before this one.
results = pd.read_csv('results.csv')
results_per_query = results.groupby('movieId')
tqdm_notebook.pandas(desc='rating each query result...')
ratings = results_per_query.progress_apply(lambda x: rate(x, [5, 10, 30, 50]))
# apply() stacks the group key on top of the index rate() returns;
# keep only the group key (the query movieId).
ratings.index = ratings.index.get_level_values(0)
agg_ratings = ratings.agg(['mean', 'median', 'min', 'max'])
# sort_index() returns a new frame, so the result has to be assigned for the
# column ordering to take effect.
agg_ratings = agg_ratings.sort_index(axis=1)
display(agg_ratings)

HBox(children=(IntProgress(value=0, description='rating each query result...', max=637, style=ProgressStyle(de…




Unnamed: 0,P@5,AbsTagCov@5,RelTagCov@5,Entropy@5,P@10,AbsTagCov@10,RelTagCov@10,Entropy@10,P@30,AbsTagCov@30,RelTagCov@30,Entropy@30,P@50,AbsTagCov@50,RelTagCov@50,Entropy@50
mean,0.454003,113.514914,8.456605,4.472338,0.389325,205.651491,16.703188,5.077554,0.244532,530.133438,47.245041,5.995674,0.195542,776.098901,71.481727,6.32779
median,0.4,106.0,3.83631,4.611382,0.4,201.0,7.602302,5.160461,0.233333,524.0,20.909341,6.023773,0.18,776.0,30.869355,6.353841
min,0.0,1.0,0.275362,0.0,0.0,20.0,0.731884,2.697653,0.0,197.0,3.173913,5.086608,0.02,398.0,4.25,5.678893
max,1.0,355.0,152.0,5.814003,0.9,490.0,320.0,6.067454,0.666667,905.0,667.0,6.554224,0.48,1205.0,912.0,6.720955


In [19]:
# DATA PARSING AND PREPARATION

# --- load the raw tables -----------------------------------------------------
# TODO use testset when finally generating results file to hand in...
movies = pd.read_csv(
    'MMSR_dataset_2019/devset_movies.csv',
    index_col='movieId',
)
clips = pd.read_csv(
    'MMSR_dataset_2019/devset_ids.csv',
    index_col='movieId',
)
tags = pd.read_csv(
    'MMSR_dataset_2019/features/Metadata/devset_TagFeatures.csv',
    index_col='movieId',
)
genres = pd.read_csv(
    'MMSR_dataset_2019/features/Metadata/devset_GenreFeatures.csv',
    index_col='movieId',
)
# TODO use all BLFs instead of just SPECTRAL (I don't have enough RAM for that...)
blfs = pd.read_csv(
    'MMSR_dataset_2019/features/Audio/Block level features/Component6/BLF_SPECTRAL_fullId.csv',
    header=None,
    index_col=0,
)
alexnet = pd.read_csv(
    'MMSR_dataset_2019/features/Visual/Deep AlexNetFc7/Avg/AlexNetFeatures - AVG - fc7.csv',
    header=None,
    index_col=0,
)

# --- align the feature tables with the movie / clip lists ---------------------
# Joining onto `movies` / `clips` drops feature rows of unknown movies or clips
# and adds rows for those without features, which are then zero-filled.
tags = movies.join(tags).drop(columns='title').fillna(0)
genres = movies.join(genres).drop(columns='title').fillna(0)
blfs = clips.join(blfs, on='movieclipId').set_index('movieclipId').fillna(0)
alexnet = clips.join(alexnet, on='movieclipId').set_index('movieclipId').fillna(0)

# --- normalize and reduce dimensionality --------------------------------------
scaler = MinMaxScaler()
pca = PCA(n_components=0.9)


def _normalize_and_reduce(frame):
    """Min-max scale `frame`, project it with PCA and keep the original index.

    The shared `scaler` and `pca` objects are re-fitted on every call.
    """
    scaled = scaler.fit_transform(frame)
    reduced = pca.fit_transform(scaled)
    return pd.DataFrame(reduced, index=frame.index)


red_tags = _normalize_and_reduce(tags)
blfs = _normalize_and_reduce(blfs)
alexnet = _normalize_and_reduce(alexnet)

# --- boolean tag occurrence matrix, used for calculating tag coverage ---------
tag_occs = tags > 0

In [None]:
# SOLUTION USING BLFS AND ALEXNET 

N_RESULTS = 100          # number of clips returned per query movie
MAX_CLIPS_PER_MOVIE = 3  # at most this many clips of one movie per result list


def select_diverse_clips(ranked_audio, ranked_visual,
                         n_results=N_RESULTS, max_per_movie=MAX_CLIPS_PER_MOVIE):
    """Merge two distance rankings into one result list with a per-movie cap.

    Clips are picked alternately from the audio ranking (even positions) and the
    visual ranking (odd positions). A clip is skipped when it was already picked
    or when `max_per_movie` clips of its movie are in the list already.

    Parameters
    ----------
    ranked_audio, ranked_visual : pandas.Series
        Distances indexed by movieclipId, sorted ascending.
    n_results : int
        Desired length of the result list.
    max_per_movie : int
        Maximum number of clips of the same movie in the result list.

    Returns
    -------
    dict
        movieclipId -> distance in pick order. Shorter than `n_results` only
        when both rankings run out of eligible clips.
    """
    # Each ranking is walked exactly once: a clip that was skipped can never
    # become eligible again (picked clips stay picked, counters only grow).
    cursors = [iter(ranked_audio.items()), iter(ranked_visual.items())]
    exhausted = [False, False]
    selected = {}
    clips_per_movie = {}

    while len(selected) < n_results and not all(exhausted):
        src = len(selected) % 2
        if exhausted[src]:
            # fall back to the other ranking instead of spinning forever
            src = 1 - src
        for clip_id, dist in cursors[src]:
            owner = movie_id(clip_id)
            if clip_id not in selected and clips_per_movie.get(owner, 0) < max_per_movie:
                selected[clip_id] = dist
                clips_per_movie[owner] = clips_per_movie.get(owner, 0) + 1
                break
        else:
            exhausted[src] = True
    return selected


# 1) compute average feature vector of audio and visual features for (all clips of) a movie
qblfs = blfs.groupby(lambda x: movie_id(x)).mean()
qalexnet = alexnet.groupby(lambda x: movie_id(x)).mean()

clip_ids = clips.movieclipId.values
per_query_results = []

for movieId in tqdm_notebook(movies.index):
    # 2) rank clips by computing distance to aggregated feature vectors of this movie
    audio_dist = euclidean_distances(blfs.values, qblfs.loc[movieId].values.reshape(1, -1)).ravel()
    visual_dist = euclidean_distances(alexnet.values, qalexnet.loc[movieId].values.reshape(1, -1)).ravel()
    ranked_clips_audio = pd.Series(audio_dist, index=clip_ids).sort_values()
    ranked_clips_visual = pd.Series(visual_dist, index=clip_ids).sort_values()

    # 3) take most similar clips (alternate between audio and visual features)
    #    but at most MAX_CLIPS_PER_MOVIE of the same movie
    picked = select_diverse_clips(ranked_clips_audio, ranked_clips_visual)

    per_query_results.append(pd.DataFrame({
        'movieclipId': list(picked.keys()),
        # transform euclidean distance into similarity measure (i.e. invert it)
        'sim': [1 / (1 + dist) for dist in picked.values()],
        'rank': np.arange(len(picked)),
        'movieId': movieId,
    }))

# concatenate once at the end instead of growing the frame inside the loop
results = pd.concat(per_query_results, ignore_index=True)
results = results.astype({'movieId': int})
results.to_csv('results.csv', columns=['movieId', 'movieclipId', 'sim', 'rank'], index=False)

In [None]:
# TODO!
# find a way to predict movie genres with decent accuracy...
# then integrate the predicted genres somehow in the algorithm that generates the results file...
# (e.g. use them to calculate the jaccard index of currently retrieved movies in order to ensure a certain precision?)
# idea: maybe use weaker dimensionality reduction (PCA parameter currently at 0.9) and maybe prediction results will improve...?


def genre_cv_scores(features, clf, cv=5):
    """Cross-validate `clf` on predicting the genre vectors from `features`.

    `features` is a DataFrame indexed by movieId. Its rows are selected in the
    order of `genres.index`, so that sample i and label i always belong to the
    same movie (the grouped feature frames are sorted by movieId, `genres` is in
    the order of the movies file). A movie without a feature row raises a
    KeyError instead of silently shifting the pairing.

    Returns the array of per-fold scores from cross_val_score.
    """
    aligned = features.loc[genres.index]
    return cross_val_score(clf, aligned.values, genres.values, cv=cv)


y = genres.values

# predict movie genre from audio features
blfs_clf = OneVsRestClassifier(SVC(gamma='scale'))
print(genre_cv_scores(qblfs, blfs_clf))

# predict movie genre from video features
alexnet_clf = OneVsRestClassifier(SVC(gamma='scale'))
print(genre_cv_scores(qalexnet, alexnet_clf))

# predict movie genre from metadata
tags_clf = OneVsRestClassifier(SVC(gamma='scale'))
print(genre_cv_scores(red_tags, tags_clf))

# predict movie genre from averaged audio/visual/metadata features
# NOTE(review): this averages the i-th PCA component of the three feature sets
# per movie; if a side-by-side combination is wanted, concatenate with axis=1.
X = pd.concat([qblfs, qalexnet, red_tags]).groupby(lambda x: x).mean()
all_clf = OneVsRestClassifier(SVC(gamma='scale'))
print(genre_cv_scores(X, all_clf))