In [1]:
# IMPORTS AND SETTINGS

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import cosine_distances
from tqdm._tqdm_notebook import tqdm_notebook
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from itertools import product
import csv

%config IPCompleter.greedy=True
%config Completer.use_jedi = False
%load_ext line_profiler

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  if sys.path[0] == '':


In [2]:
# to make display(...) show all rows of a dataframe
pd.set_option('display.max_rows', None)

In [3]:
pd.reset_option('display.max_rows')

In [4]:
# FUNCTION DEFS

# extract string movieId from movieclipId (without leading zeros)
def str_movie_id(movieclip_id):
    return str(movie_id(movieclip_id))

# extract int movieId from movieclipId
def movie_id(movieclip_id):
    return int(movieclip_id.split('_')[0])

# calculate precision and tag coverage for the results of a single query
def rate(qresults, ks):
    qmovie_id = qresults.movieId.head(1) 
    rating = pd.DataFrame(index=[qmovie_id])
    for k in ks:
        rmovie_ids = [movie_id(y) for y in qresults.movieclipId.head(k)]
        # rate relevance    
        qgenres = genres.loc[qmovie_id]
        rgenres = genres.loc[rmovie_ids]
        rjscore = rgenres.apply(lambda row: jaccard_score(qgenres.values[0], row), axis=1)
        prec = len(rjscore[rjscore > 0.5]) / len(rjscore)
        rating['P@' + str(k)] = prec
        # rate diversity
        qtags = tags.loc[qmovie_id].iloc[0]
        qtags = qtags[qtags > 0].index
        rtags = tag_occs.loc[rmovie_ids].agg('sum')
        rtags = rtags[rtags > 0]
        urtags = rtags.index
        rtags = np.repeat(rtags.index, rtags.values)
        abs_tag_cov = len(urtags)
        rel_tag_cov = (len(urtags) / len(qtags)) if len(qtags) > 0 else np.nan
        rating['AbsTagCov@' + str(k)] = abs_tag_cov
        rating['RelTagCov@' + str(k)] = rel_tag_cov
        entropy = entropy_label_distribution(rtags)
        rating['Entropy@' + str(k)] = entropy
    return rating

# Compute entropy of label distribution
def entropy_label_distribution(labels):
    n_labels = len(labels)
    
    if n_labels <= 1:
        return 0

    value, counts = np.unique(labels, return_counts=True)
    probs = counts / np.float32(n_labels)
    n_classes = np.count_nonzero(probs)

    if n_classes <= 1:
        return 0.0

    # Compute entropy
    ent = 0.0

    for p in probs:
        ent -= p * np.log(p)

    return ent

In [5]:
# RESULTS FILE EVALUATION 

def eval(filename):
    results = pd.read_csv(filename)  
    results_per_query = results.groupby('movieId')
    tqdm_notebook.pandas(desc='rating each query result of %s...' % filename)
    ks = [5, 10, 30, 50]
    ratings = results_per_query.progress_apply(lambda x: rate(x, ks))
    ratings.index = ratings.index.get_level_values(0)
    agg_ratings = ratings.agg('mean')
    agg_ratings = agg_ratings[[label + str(k) for label, k in product(['P@', 'AbsTagCov@', 'RelTagCov@', 'Entropy@'], ks)]]
    return agg_ratings
    
our_algorithm = eval('results.csv')
random_baseline = eval('random_results.csv')
df = pd.DataFrame([our_algorithm, random_baseline], index=['Our algorithm (AVGd)', 'Random baseline (AVGd)'])
df.to_csv('evaluation.csv', index=False)
df

HBox(children=(FloatProgress(value=0.0, description='rating each query result of results.csv...', max=637.0, s…

AttributeError: 'DataFrame' object has no attribute 'movieId'

In [8]:
# DATA PARSING AND PREPARATION

# parse data
# TODO use testset when finally generating results file to hand in...
movies = pd.read_csv('MMSR_dataset_2019/devset_movies.csv', index_col='movieId')
clips = pd.read_csv('MMSR_dataset_2019/devset_ids.csv', index_col='movieId')
#movies = pd.read_csv('MMSR_dataset_2019/testset_movies.csv', index_col='movieId')
#clips = pd.read_csv('MMSR_dataset_2019/testset_ids.csv', index_col='movieId')

tags = pd.read_csv('MMSR_dataset_2019/features/Metadata/devset_TagFeatures.csv', index_col='movieId')
genres = pd.read_csv('MMSR_dataset_2019/features/Metadata/devset_GenreFeatures.csv', index_col='movieId')
# TODO use all BLFs instead of just SPECTRAL (I don't have enough RAM for that...)
#blfs = pd.read_csv('MMSR_dataset_2019/features/Audio/Block level features/Component6/BLF_SPECTRAL_fullId.csv', header=None, index_col=0)
blfs = pd.read_csv('MMSR_dataset_2019/features/Audio/Block level features/All/BLF_all_fullId.csv', header=None, index_col=0)
alexnet = pd.read_csv('MMSR_dataset_2019/features/Visual/Deep AlexNetFc7/Avg/AlexNetFeatures - AVG - fc7.csv', header=None, index_col=0)

# fix missing or superfluous movies or clips in features
tags = movies.join(tags).drop('title', axis=1).fillna(0)
genres = movies.join(genres).drop('title', axis=1).fillna(0)
blfs = clips.join(blfs, on='movieclipId').set_index('movieclipId').fillna(0)
alexnet = clips.join(alexnet, on='movieclipId').set_index('movieclipId').fillna(0)

# normalize and reduce dimensionality...
scaler = MinMaxScaler()
pca = PCA(n_components=0.9)
red_tags = pd.DataFrame(pca.fit_transform(scaler.fit_transform(tags)), index=tags.index)
blfs = pd.DataFrame(pca.fit_transform(scaler.fit_transform(blfs)), index=blfs.index)
alexnet = pd.DataFrame(pca.fit_transform(scaler.fit_transform(alexnet)), index=alexnet.index)

# precompute tag occurrences for calculating tag coverage
tag_occs = tags > 0

In [9]:
# SOLUTION USING BLFS, ALEXNET AND TAGS
max_clips_per_movie = 5
results = pd.DataFrame(columns=['movieId', 'movieclipId', 'sim', 'rank']).set_index(['movieId', 'movieclipId'])

# compute average feature vector of audio and visual features for (all clips of) a movie
qblfs = blfs.groupby(lambda x: movie_id(x)).mean()
qalexnet = alexnet.groupby(lambda x: movie_id(x)).mean()
# compute movie tag distances
movie_similarity_meta = pd.DataFrame(cosine_distances(tags), index=movies.index, columns=movies.index.values)

for qmovie_id in tqdm_notebook(movies.index):
    # create multiple clip rankings...
    # rank clips by audio distance using manhattan distance from average BLF vector of query
    ranked_clips_audio = pd.DataFrame(manhattan_distances(blfs.values, qblfs.loc[qmovie_id].values.reshape(1, -1)), 
                                        index=clips['movieclipId'], 
                                       columns=['dist']).sort_values('dist')
    # rank clips by visual distance using manhattan distance from average AlexNet vector of query
    ranked_clips_visual = pd.DataFrame(manhattan_distances(alexnet.values, qalexnet.loc[qmovie_id].values.reshape(1, -1)),
                                        index=clips['movieclipId'], 
                                        columns=['dist']).sort_values('dist')
    # rank clips by metadata distance (take one random clip per movie)
    ranked_clips_meta = movie_similarity_meta[qmovie_id].reset_index().rename({qmovie_id:'dist'}, axis=1)
    ranked_clips_meta['movieclipId'] = ranked_clips_meta['movieId'].apply(lambda mid: clips.loc[mid].sample(1).iloc[0]['movieclipId'] if len(clips.loc[mid]) > 1 else clips.loc[mid].sample(1)['movieclipId'])
    ranked_clips_meta = ranked_clips_meta.set_index('movieclipId')[['dist']].sort_values('dist')

    # retrieve most similar clips (alternate between audio, visual and metadata features)
    # never take more than 5 clips from the same movie! (to keep it interesting)
    qresult = pd.DataFrame(columns=['movieId', 'movieclipId', 'sim', 'rank'])

    while len(qresult) < 100:
        if len(qresult) % 3 == 0:
            source = ranked_clips_visual
        elif len(qresult) % 3 == 1:
            source = ranked_clips_audio
        else:
            source = ranked_clips_meta
        for i in range(0, len(source)):
            rcandidate = source.iloc[i]
            is_new_clip = not rcandidate.name in qresult['movieclipId'].values
            rmovie_id = movie_id(rcandidate.name)
            is_movie_allowed = len(qresult.loc[qresult['movieId'] == rmovie_id]) < max_clips_per_movie
            if is_new_clip & is_movie_allowed:
                if len(qresult) % 3 == 0:
                    ranked_clips_audio = source.iloc[i:]
                elif len(qresult) % 3 == 1:
                    ranked_clips_visual = source.iloc[i:]
                else:
                    ranked_clips_meta = source.iloc[i:]
                qresult.loc[len(qresult)] = [qmovie_id, rcandidate.name, rcandidate.dist, len(qresult)]
                break

    # transform distances into similarity measure (i.e. invert it)
    qresult['sim'] = qresult['sim'].map(lambda x: 1 / (1 + float(x)))
    results = pd.concat([results, qresult], sort=False)

results.to_csv('results.csv', columns=['movieId', 'movieclipId', 'sim', 'rank'], index=False)

HBox(children=(FloatProgress(value=0.0, max=637.0), HTML(value='')))




In [8]:
# SOLUTION PICKING CLIPS RANDOMLY (AS BASELINE)

qmovies = np.repeat(movies.index, 100)
rclips = np.array([clips['movieclipId'].sample(100) for i in tqdm_notebook(range(len(movies.index)))]).flatten()
random_results = pd.DataFrame([qmovies, rclips]).transpose()
random_results.columns = ['movieId', 'movieclipId']
random_results['sim'] = np.repeat(0.0, len(random_results))
random_results['rank'] = np.array([np.arange(100) for i in range(len(movies.index))]).flatten()
random_results.to_csv('random_results.csv', columns=['movieId', 'movieclipId', 'sim', 'rank'], index=False)

HBox(children=(FloatProgress(value=0.0, max=637.0), HTML(value='')))




In [24]:
# TODO!
# find a way to predict movie genres with decent accuracy...
# then integrate the predicted genres somehow in the algorithm that generates the results file... 
# (e.g. use them to calculate the jaccard index of currently retrieved movies in order to ensure a certain precision?)
# idea: maybe use weaker dimensionality reduction (PCA parameter currently at 0.9) and maybe prediction results will improve...?

y = genres.values

# predict movie genre from audio features
X = qblfs.values
blfs_clf = OneVsRestClassifier(SVC(gamma='scale'))
blfs_score = cross_val_score(blfs_clf, X, y, cv=5, scoring='precision_samples')
blfs_score = blfs_score.tolist()
blfs_score.insert(0, "BLFS")
print(blfs_score)

# predict movie genre from video features
X = qalexnet.values
alexnet_clf = OneVsRestClassifier(SVC(gamma='scale'))
alexnet_score = cross_val_score(alexnet_clf, X, y, cv=5, scoring='precision_samples')
alexnet_score = alexnet_score.tolist()
alexnet_score.insert(0, "ALEXNET")
print(alexnet_score)

# predict movie genre from metadata
X = red_tags.values
tags_clf = OneVsRestClassifier(SVC(gamma='scale'))
tags_score = cross_val_score(tags_clf, X, y, cv=5, scoring='precision_samples')
tags_score = tags_score.tolist()
tags_score.insert(0, "TAGS")
print(tags_score)

# predict movie genre from averaged audio/visual/metadata features
X = pd.concat([qblfs, qalexnet, red_tags]).groupby(lambda x: x).mean()
all_clf = OneVsRestClassifier(SVC(gamma='scale'))
all_score = cross_val_score(all_clf, X, y, cv=5, scoring='precision_samples')
all_score = all_score.tolist()
all_score.insert(0, "COMBINED MEAN")
print(all_score)

with open('svm_scores.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Type", "1", "2", "3", "4", "5"])
    writer.writerow(blfs_score)
    writer.writerow(alexnet_score)
    writer.writerow(tags_score)
    writer.writerow(all_score)




ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.