In [5]:
import pandas as pd
import scipy.sparse as sp
import pickle

In [6]:
# Load the pairwise comparisons, reading only the columns used downstream.
# Passing `columns=` to read_feather avoids loading the unused columns at all,
# instead of reading the whole file and dropping them afterwards.
df = pd.read_feather(
    './comparisons.feather',
    columns=['video_a', 'video_b', 'criteria', 'score'],
)

In [7]:
# One-off preprocessing step, kept for reference: read the raw CSV into `df` first,
# then dump it as feather (less memory) using the line below.
# df.to_feather('GNNRank/data/tournesol/comparisons.feather')

In [8]:
# Minimum number of comparisons (counting both the video_a and the video_b
# position) a video must appear in, within one criteria, for its comparisons
# to be kept. A value <= 0 disables the filter.
more_than = 2


def _orient_pairs(comparisons):
    """Return a copy of `comparisons` in which every row has video_a <= video_b.

    The score is directional: the adjacency matrix built below stores `score`
    at [video_a, video_b] and `-score` at [video_b, video_a]. Whenever the two
    videos of a row are swapped, the score is therefore negated too, so that
    the row still describes the same comparison.
    """
    swap = comparisons['video_a'] > comparisons['video_b']
    return comparisons.assign(
        video_a=comparisons['video_a'].mask(swap, comparisons['video_b']),
        video_b=comparisons['video_b'].mask(swap, comparisons['video_a']),
        score=comparisons['score'].mask(swap, -comparisons['score']),
    )


def _keep_frequent_videos(comparisons, min_count):
    """Keep the rows whose two videos both appear in at least `min_count` rows.

    Appearances are counted over video_a and video_b together. The filter is
    applied in a single pass, so a surviving video may end up with fewer than
    `min_count` rows once the other rows have been removed.
    """
    appearances = pd.concat([comparisons['video_a'], comparisons['video_b']]).value_counts()
    frequent_a = comparisons['video_a'].map(appearances) >= min_count
    frequent_b = comparisons['video_b'].map(appearances) >= min_count
    return comparisons[frequent_a & frequent_b]


for criteria in df['criteria'].unique():  # one set of output files per criteria
    # comparisons for this criteria only, restricted to the columns we need
    df_criteria = df.loc[df['criteria'] == criteria, ['video_a', 'video_b', 'score']]

    # canonical orientation (video_a <= video_b, alphabetical), flipping the
    # sign of the score for every swapped pair
    df_criteria = _orient_pairs(df_criteria)

    if more_than > 0:
        # drop comparisons involving a video seen fewer than `more_than` times
        df_criteria = _keep_frequent_videos(df_criteria, more_than)

    # map every remaining video to a row/column index (alphabetical order)
    videos = sorted(set(df_criteria['video_a']) | set(df_criteria['video_b']))
    videos = {video: i for i, video in enumerate(videos)}
    # save the video -> index dictionary using pickle
    with open(f'./video_ids_{criteria}_{more_than}.pickle', 'wb') as handle:
        pickle.dump(videos, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # aggregate repeated comparisons of the same (video_a, video_b) pair
    df_criteria = df_criteria.assign(is_positive=df_criteria['score'] > 0)
    grouped = df_criteria.groupby(['video_a', 'video_b'])

    # 'mean': average score of the pair
    mean_scores = grouped['score'].mean().reset_index()
    # 'count': number of comparisons of the pair with a strictly positive score
    count_scores = grouped['is_positive'].sum().reset_index()

    for agg_method, scores in zip(['mean', 'count'], [mean_scores, count_scores]):
        # empty sparse adjacency matrix, one row/column per video
        adj = sp.lil_matrix((len(videos), len(videos)))
        for video_a, video_b, score in scores.itertuples(index=False, name=None):
            # get index of video_a and video_b
            video_a_idx = videos[video_a]
            video_b_idx = videos[video_b]
            # antisymmetric matrix: the reversed pair gets the opposite value
            adj[video_a_idx, video_b_idx] = score
            adj[video_b_idx, video_a_idx] = -score

        # save adjacency matrix in CSR format
        adj = adj.tocsr()
        sp.save_npz(f'./adj_{criteria}_{agg_method}_mt{more_than}.npz', adj)

In [9]:
# df_criteria[(df_criteria['video_a'] == '-3Dn7coSFQc') & (df_criteria['video_b'] == 'KyeJTbFCSv0')]