In [1]:
import pandas as pd
import pathlib


models=[
    'gmf_ipa_mean',
    'gmf_gpa_mean',
    'gmf_gpa_expert',
    'gmf_gpa_softmax',
    'mlp_ipa_mean',
    'mlp_gpa_mean',
    'mlp_gpa_expert',
    'mlp_gpa_softmax',
]

"""
'groups-2-gmf_k8_dsft_seed1234_gpa_mean.csv'
'groups-2-gmf_k8_dsft_seed1234_ipa_mean.csv'
"""
def name_to_file(name, ds, k='8', seed='1234'):
    model_variation = name.split('_')
    modelname = model_variation[0] # gmf
    variation = '_'.join(model_variation[1:]) # ipa_mean
    return f'{modelname}_k{k}_ds{ds}_seed{seed}_{variation}'


"""
@param ds: Dataset
@return: means per groupsize, results as pandas DF with g column, restuls_dict dict with results
"""
def get_error(ds):
    datapath=f'../data/grupos/{ds}/'
    restulspath=f'../results/{ds}/'
    

    # Load data
    groups = {}

    fromngroups=2
    tongroups=10

    # Group mean as 'y'
    for g in range(fromngroups,tongroups+1):
        data_group = pd.read_csv(f"{datapath}groups-{g}.csv")
        groups[g] = data_group #pd.DataFrame(data_group.filter(regex="rating").mean(axis=1), columns=['y'])
        groups[g]['g'] = g

    # Load model predictions
    for g in range(fromngroups,tongroups+1):
        for model in models:
            mfile = name_to_file(model, ds)
            groups[g][model] = pd.read_csv(
                f"{restulspath}groups-{g}-{mfile}.csv",
                header=0,
                names=["data"]
            )['data']

    # Compute error
    for g in range(fromngroups,tongroups+1):    # Each group
        for m in models:                        # Each model
            for gi in range(1,g+1):             # Each rating in group
                groups[g][m+'-error-'+str(gi)] = abs(groups[g]['rating-'+str(gi)] - groups[g][m])
                groups[g][m+'-error2-'+str(gi)] = (groups[g]['rating-'+str(gi)] - groups[g][m])**2

    for g in range(fromngroups,tongroups+1):    # Each group
        for m in models:                        # Each model
            groups[g][m+'-MAE'] = groups[g].filter(regex='^'+m+'-error-',axis=1).mean(axis=1, skipna=True)
            groups[g][m+'-MAX'] = groups[g].filter(regex='^'+m+'-error-',axis=1).max(axis=1, skipna=True)
            groups[g][m+'-MSE'] = groups[g].filter(regex='^'+m+'-error2-',axis=1).mean(axis=1, skipna=True)

    # Join dict of groups
    results = pd.concat(groups, join='inner', ignore_index=True)
    # Calculate means
    means = results.groupby('g').mean()

    return means, results, groups

#DS='ml1m'
#DS='ft'
#DS='anime'
#DS='netflix'

In [2]:
from IPython.display import display

def draw_table(what, data):
    focus_on_data = [
        'gmf_ipa_mean-'+what,
        'gmf_gpa_mean-'+what,
        'gmf_gpa_expert-'+what,
        'gmf_gpa_softmax-'+what,
        'mlp_ipa_mean-'+what,
        'mlp_gpa_mean-'+what,
        'mlp_gpa_expert-'+what,
        'mlp_gpa_softmax-'+what,
    ]
    print(what)
    pd.set_option("display.precision", 4)
    display(data[focus_on_data].groupby("g", as_index=False).mean().filter(regex=what+"$").T\
                    .rename(columns = {0:2, 1:3, 2:4, 3:5, 4:6, 5:7, 6:8, 7:9, 8:10})\
                    .style.highlight_min(color = 'lightgreen', axis = 0))

def highlight_max(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_max = s == s.max()
    return ['background-color: yellow' if v else '' for v in is_max]

def draw_with_highlight_ndcg(data):
    pd.set_option("display.precision", 4)
    display(data.style.apply(highlight_max))
    #display(data.style.highlight_max(color = 'lightgreen', axis = 0))

In [11]:
from sklearn.metrics import ndcg_score
NTIEMS=5 #number of item rated per group
NGROUPS=10000

"""
@param ds: Dataset
@return: means per groupsize, results as pandas DF with g column, restuls_dict dict with results
"""
def get_ndcg_score(data_dict):
    fromngroups=2
    tongroups=10
    
    ndcg_score_data = {}
    
    subfix='-NDCG'
    models_ndcg = [m+subfix for m in models]
    
    for g in range(fromngroups,tongroups+1):
        ndcg_score_data[g] = pd.DataFrame(index=range(0,NGROUPS), columns=['g'] + models_ndcg, dtype='float64')
        ndcg_score_data[g]['g'] = g
        
        # get real mean.
        data_dict[g]['rating-mean'] = data_dict[g].filter(regex='^rating-',axis=1).mean(axis=1, skipna=True)
        # Generate index per item in group 1, 2, 3, 4, 5
        item_per_group = data_dict[g].groupby(['group']).cumcount().add(1)
        data_dict[g]['item-index'] = item_per_group
        
        groudtruth = data_dict[g].pivot(index='group', columns='item-index', values='rating-mean').to_numpy()
        # id_group, rating-mean-1, rating-mean-2, etc... (No with this names)
        for m in models:
            model_prediction = data_dict[g].pivot(index='group', columns='item-index', values=m).to_numpy()
            
            for idx, data_tuple in enumerate(zip(groudtruth, model_prediction)):
                ndcg_score_data[g].loc[idx,m+subfix] = ndcg_score([data_tuple[0]], [data_tuple[1]])
    
    # Join dict of groups
    results = pd.concat(ndcg_score_data, join='inner', ignore_index=True)
    # Calculate means
    means = results.groupby('g').mean()

    return means, results, ndcg_score_data

#result_ndcg[2][['group','rating-mean']].head(10)#.pivot('group','rating-mean')
#groups[2].pivot(index='group', columns='item-index', values='gmf_ipa_mean').head(10)
#ndcg_results = get_ndcg_score(groups)
#ndcg_means, ndcg_results, ndcg_results_dict = get_ndcg_score(groups)

In [4]:
datasets = {}
datasets['ml1m']='MovieLens1M'
datasets['ft']='FilmTrust'
datasets['anime']='MyAnimeList'

In [5]:
all_results = {}

for ds in datasets.keys():
    means, results, result_dict = get_error(ds)
    all_results[ds] = {}
    all_results[ds]['means'] = means
    all_results[ds]['results'] = results
    all_results[ds]['result_dict'] = result_dict

In [12]:
for ds in datasets.keys():
    ndcg_means, ndcg_results, ndcg_results_dict = get_ndcg_score(all_results[ds]['result_dict'])
    all_results[ds]['ndcg_means'] = ndcg_means
    all_results[ds]['ndcg_results'] = ndcg_results
    all_results[ds]['ndcg_results_dict'] = ndcg_results_dict

# Draw results

In [None]:
means, results, group_result = get_error('ml1m')
draw_table('MAE', means)
draw_table('MSE', means)
draw_table('MAX', means)
ndcg_score_data = get_ndcg_score(group_result)
draw_with_highlight_ndcg(ndcg_score_data)

In [None]:
means, results, group_result = get_error('ft')
draw_table('MAE', means)
draw_table('MSE', means)
draw_table('MAX', means)
ndcg_score_data = get_ndcg_score(group_result)
draw_with_highlight_ndcg(ndcg_score_data)

In [None]:
means, results, group_result = get_error('anime')
draw_table('MAE', means)
draw_table('MSE', means)
draw_table('MAX', means)
ndcg_score_data = get_ndcg_score(group_result)
draw_with_highlight_ndcg(ndcg_score_data)

# Latex table generation

In [None]:
print(models)

In [28]:
PRECISION=5
STD_PRECISION=3
pd.set_option("display.precision", PRECISION)

def table_describe(data, what):
    what = '-' + what
    focus_on_data = [
        'g',
        'gmf_ipa_mean'+what,
        #'gmf_gpa_mean'+what,
        'gmf_gpa_expert'+what,
        'gmf_gpa_softmax'+what,
        'mlp_ipa_mean'+what,
        'mlp_gpa_mean'+what,
        'mlp_gpa_expert'+what,
        'mlp_gpa_softmax'+what,
    ]
    names = [
        'g',
        #'GMF IPA',
        #'GMF Avg',
        '\\makecell[l]{GMF IPA \\\\ GMF Avg}',
        #'GMF Most expert',
        'GMF Expertise',
        'GMF Softmax',
        'MLP IPA',
        '\\makecell[l]{MLP Avg \\\\ DeepGroup}',
        #'MLP Most expert',
        'MLP Expertise',
        'MLP Softmax',
    ]
    d = data.copy()[focus_on_data]
    new_names_map = {d.columns[i]:names[i] for i in range(len(names))}
    d.rename(new_names_map, axis=1, inplace=True)
    std = d.groupby("g", as_index=True).std().T
    mean = d.groupby("g", as_index=True).mean().T
    return (mean, std)

def generate_latext_table(mean, std, highlight_min=True):
    content = ""
    
    content += "\\hline\n"
    content += "Model \\textbackslash Group Size & 2      & 3      & 4      & 5      & 6      & 7      & 8      & 9      & 10     \\\\ \\hline\n"
    
    if highlight_min:
        highlight_values = mean.idxmin()
    else:
        highlight_values = mean.idxmax()
    
    for rix in range(len(mean)):
        content += mean.index[rix]
        for cix in range(len(mean.iloc[rix])):
            
            if mean.index[rix] == highlight_values[cix+2]:
                cell = " & \\cellcolor[gray]{0.8}\\makecell{\\textbf{"
                cell += str(round(mean.iloc[rix, cix], PRECISION))
                cell += "}\\\\\\textbf{(" + str(round(std.iloc[rix, cix], STD_PRECISION)) + ")}"
                cell += "} "                
            else:
                cell = " & \\makecell{"
                cell += str(round(mean.iloc[rix, cix], PRECISION))
                cell += "\\\\(" + str(round(std.iloc[rix, cix], STD_PRECISION)) + ")"
                cell += "} "
            
            content += cell
        content += "\\\\ \\hline \n"
    return content

def generate_highlight_table(data):
    #return mean.index.get_loc(mean[3].idxmin())
    index_color = ""
    
    
    for i, r_label in enumerate(highlight_values):
        rix = data.index.get_loc(r_label)
        index_color += "cell{"+str(rix+3)+"}{"+str(i+2)+"} = {gray9,font=\\bfseries},\n"
    return index_color


"""
@param name: Table name
@param tag:  Tag for latex
@param dataset_names: Names of subtables
@param dataset_tags: Tags of subtables
@param what: MAE, MSE, MAX, NDCG
@param data: all data
@param section: section name in dictionary data
@param highlight_min: min value the best
@return: str latex table
"""
def generate_table(name, tag, dataset_names, dataset_tags, what, data, section='results', highlight_min=True):
    content = """
    \\begin{table}[ht]
        \\caption{"""+name+"""}
        \\label{tab:"""+tag+"""}
    """

    for subname, subtag in zip(dataset_names, dataset_tags): # ml1m, ft, anime,... 
        mean, std = table_describe(data[subtag][section], what)

        content += """
        \\begin{subtable}[h]{1\\textwidth}
        \\resizebox{\\textwidth}{!}{%
            \\renewcommand{\\arraystretch}{1.9}
            \\begin{tabular}{|l|c|c|c|c|c|c|c|c|c|}
        """
        content += generate_latext_table(mean, std, highlight_min)
        content += """
            \\end{tabular}%
        }
        \\caption{"""+subname+"""}
        \\label{tab:"""+tag+""":"""+subtag+"""}
        \\end{subtable}"""

    content += """
    \\end{table}
    """
    return content


In [29]:
print(
    generate_table(
        'Mean Absolute Error',
        'mae',
        datasets.values(),
        datasets.keys(),
        'MAE',
        all_results
    )
)

print(
    generate_table(
        'Mean Squared Error',
        'mse',
        datasets.values(),
        datasets.keys(),
        'MSE',
        all_results
    )
)

print(
    generate_table(
        'Mean Max Error',
        'max',
        datasets.values(),
        datasets.keys(),
        'MAX',
        all_results
    )
)

print(
    generate_table(
        'Discounted cumulative gain',
        'ndcg',
        datasets.values(),
        datasets.keys(),
        'NDCG',
        all_results,
        'ndcg_results',
        False
    )
)


    \begin{table}[ht]
        \caption{Mean Absolute Error}
        \label{tab:mae}
    
        \begin{subtable}[h]{1\textwidth}
        \resizebox{\textwidth}{!}{%
            \renewcommand{\arraystretch}{1.9}
            \begin{tabular}{|l|c|c|c|c|c|c|c|c|c|}
        \hline
Model \textbackslash Group Size & 2      & 3      & 4      & 5      & 6      & 7      & 8      & 9      & 10     \\ \hline
\makecell[l]{GMF IPA \\ GMF Avg} & \makecell{0.74205\\(0.409)}  & \makecell{0.76075\\(0.341)}  & \makecell{0.76893\\(0.299)}  & \makecell{0.77009\\(0.271)}  & \makecell{0.77745\\(0.249)}  & \makecell{0.77659\\(0.234)}  & \makecell{0.77681\\(0.221)}  & \makecell{0.77599\\(0.212)}  & \makecell{0.77558\\(0.201)} \\ \hline 
GMF Expertise & \makecell{0.74393\\(0.41)}  & \makecell{0.76207\\(0.341)}  & \makecell{0.77018\\(0.299)}  & \makecell{0.77155\\(0.27)}  & \makecell{0.779\\(0.249)}  & \makecell{0.77782\\(0.234)}  & \makecell{0.77834\\(0.221)}  & \makecell{0.77729\\(0.211)}  & \makecell{0.7768


    \begin{table}[ht]
        \caption{Mean Squared Error}
        \label{tab:mse}
    
        \begin{subtable}[h]{1\textwidth}
        \resizebox{\textwidth}{!}{%
            \renewcommand{\arraystretch}{1.9}
            \begin{tabular}{|l|c|c|c|c|c|c|c|c|c|}
        \hline
Model \textbackslash Group Size & 2      & 3      & 4      & 5      & 6      & 7      & 8      & 9      & 10     \\ \hline
\makecell[l]{GMF IPA \\ GMF Avg} & \cellcolor[gray]{0.8}\makecell{\textbf{0.87504}\\\textbf{(0.914)}}  & \cellcolor[gray]{0.8}\makecell{\textbf{0.90947}\\\textbf{(0.767)}}  & \makecell{0.92437\\(0.676)}  & \makecell{0.92648\\(0.609)}  & \makecell{0.94191\\(0.567)}  & \makecell{0.93896\\(0.53)}  & \makecell{0.9376\\(0.5)}  & \makecell{0.93832\\(0.48)}  & \makecell{0.93571\\(0.454)} \\ \hline 
GMF Expertise & \makecell{0.88044\\(0.918)}  & \makecell{0.91329\\(0.77)}  & \makecell{0.92754\\(0.678)}  & \makecell{0.92979\\(0.61)}  & \makecell{0.94504\\(0.568)}  & \makecell{0.94135\\(0.531)}  & \mak


    \begin{table}[ht]
        \caption{Mean Max Error}
        \label{tab:max}
    
        \begin{subtable}[h]{1\textwidth}
        \resizebox{\textwidth}{!}{%
            \renewcommand{\arraystretch}{1.9}
            \begin{tabular}{|l|c|c|c|c|c|c|c|c|c|}
        \hline
Model \textbackslash Group Size & 2      & 3      & 4      & 5      & 6      & 7      & 8      & 9      & 10     \\ \hline
\makecell[l]{GMF IPA \\ GMF Avg} & \cellcolor[gray]{0.8}\makecell{\textbf{1.02112}\\\textbf{(0.6)}}  & \cellcolor[gray]{0.8}\makecell{\textbf{1.2213}\\\textbf{(0.598)}}  & \makecell{1.35658\\(0.588)}  & \cellcolor[gray]{0.8}\makecell{\textbf{1.45195}\\\textbf{(0.583)}}  & \cellcolor[gray]{0.8}\makecell{\textbf{1.54115}\\\textbf{(0.578)}}  & \makecell{1.59908\\(0.577)}  & \makecell{1.64979\\(0.573)}  & \makecell{1.69807\\(0.576)}  & \makecell{1.73598\\(0.571)} \\ \hline 
GMF Expertise & \makecell{1.02474\\(0.602)}  & \makecell{1.22441\\(0.599)}  & \makecell{1.35928\\(0.59)}  & \makecell{1.4551\\(0