In [1]:
import os
import git
import numpy as np
import pandas as pd

In [2]:
git_root = git.Repo(search_parent_directories=True).git.rev_parse("--show-toplevel")

In [3]:
files = [f for f in os.listdir(os.path.join(git_root,'data','external')) if f.endswith('.txt')]

In [4]:
def read_ranking(file,
                 input_data_dir=os.path.join(git_root,'data','external')):

    with open(os.path.join(input_data_dir,file), 'r') as f:
        ranking = [line.strip() for line in f.readlines()]

    return ranking

In [5]:
raw_data = {os.path.splitext(file)[0]: {'ranking': read_ranking(file)} for file in files}

In [6]:
people = list(raw_data.keys())
chocs = raw_data[next(iter(raw_data))]['ranking']

In [7]:
from sklearn.preprocessing import LabelEncoder
choc_le = LabelEncoder()
people_le = LabelEncoder()

In [8]:
people_le.fit(people)
choc_le.fit(next(iter(raw_data.values()))['ranking'])

In [11]:
ranking_df = pd.DataFrame.from_dict(raw_data).melt(var_name='person',value_name='choc').explode('choc')

In [12]:
ranking_df['person_idx'] = people_le.transform(ranking_df['person'])
ranking_df['choc_idx'] = choc_le.transform(ranking_df['choc'])

In [13]:
ranking_df['rank'] = ranking_df.groupby('person').cumcount()

In [14]:
ranking_df

Unnamed: 0,person,choc,person_idx,choc_idx,rank
0,jimmy,crunchie,5,2,0
0,jimmy,snickers,5,13,1
0,jimmy,fudge,5,7,2
0,jimmy,milky_way,5,12,3
0,jimmy,creme_egg,5,1,4
...,...,...,...,...,...
9,telf,fudge,8,7,12
9,telf,double_decker,8,5,13
9,telf,twix,8,15,14
9,telf,snickers,8,13,15


In [15]:
import plotly.express as px

In [16]:
fig = px.bar(ranking_df.groupby('choc')[['rank']].mean().sort_values('rank'))

fig.update_layout(
{
'plot_bgcolor': 'rgba(0, 0, 0, 0)',
'paper_bgcolor': 'rgba(0, 0, 0, 0)'
},
showlegend=False)

fig.update_xaxes(showline=True, linewidth=1, linecolor='black')
fig.update_yaxes(showline=True, linewidth=1, linecolor='black',title='mean rank')

fig.show()

In [17]:
ranking_df['top_5'] = ranking_df['rank'] <= 4

In [18]:
ranking_df['bottom_5'] = ranking_df['rank'] >= 12

In [19]:
fig = px.bar(ranking_df[['choc', 'top_5', 'bottom_5']].melt(id_vars='choc').groupby(['choc','variable'])['value'].sum().reset_index(),
       y='choc',
       x='value',
       facet_col='variable')

fig.update_layout(
{
'plot_bgcolor': 'rgba(0, 0, 0, 0)',
'paper_bgcolor': 'rgba(0, 0, 0, 0)'
},
showlegend=False)

fig.update_xaxes(showline=True, linewidth=1, linecolor='black', title='frequency')
fig.update_yaxes(showline=True, linewidth=1, linecolor='black',title='choc')

fig.show()