# Nobel Prize Laureates

In the notebook we want to visualize the dynamics of increase of Nobel Prize laureates by country in the period from 1901 to 2019. For this we will use the animated bar chart.

Data comes from [Kaggle](https://www.kaggle.com/bahramjannesarr/nobel-prize-from-1901-till-2020).

In [1]:
import pandas as pd
import colorcet as cc

from lets_plot import *
LetsPlot.setup_html()

## Preparation

In [2]:
def cramers_corrected_stat(x, y):
    import numpy as np
    import scipy.stats as ss
    
    confusion_matrix = pd.crosstab(x, y).to_numpy()
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    return np.sqrt(chi2 / (n * (min(confusion_matrix.shape) - 1)))

def cramers_corr(df):
    from itertools import product
    
    index = pd.MultiIndex.from_tuples(product(*([df.select_dtypes(include='category').columns] * 2)), \
                                      names=['col1', 'col2'])
    corr_df = pd.Series(data=index.map(lambda p: cramers_corrected_stat(df[p[0]], df[p[1]])), \
                        index=index, name='corr').to_frame()
    return corr_df.reset_index().pivot(index='col1', columns='col2', values='corr')

In [3]:
def plot_corr_matrix(corr_df):
    cols_enum = {name: index for index, name in enumerate(corr_df.columns)}
    stacked_corr_df = corr_df.stack().reset_index()
    stacked_corr_df.columns = ['col1', 'col2', 'corr']
    stacked_corr_df['x'] = stacked_corr_df.col1.apply(lambda name: cols_enum[name])
    stacked_corr_df['y'] = stacked_corr_df.col2.apply(lambda name: cols_enum[name])

    return ggplot() + \
        geom_point(aes(x='x', y='y', fill='corr'), \
                   data=stacked_corr_df, color='#bd0026', shape=22, size=18, \
                   tooltips=layer_tooltips().line('(@col1, @col2)')\
                                            .format('@corr', '.2f').line('correlation value|@corr')) + \
        scale_fill_gradient(name='correlation', low='#ffeda0', high='#f03b20') + \
        scale_x_discrete(labels=list(cols_enum.keys())) + \
        scale_y_discrete(limits=list(cols_enum.values())[::-1], labels=list(cols_enum.keys())) + \
        ggtitle('Cramer\'s V Correlation for Categorical Features') + \
        ggsize(600, 500) + \
        theme(axis_title='blank', axis_line='blank')

In [4]:
def player_widget(plots, *, fps=1):
    from ipywidgets import widgets as wg
    
    interval = max(1, int(1000 / fps))
    player = wg.Play(min=0, max=len(plots) - 1, step=1, value=0, interval=interval)
    slider = wg.IntSlider(min=0, max=len(plots) - 1, step=1, value=0)
    wg.jslink((player, 'value'), (slider, 'value'))
    widget = wg.HBox([player, slider])
    iout = wg.interactive_output(lambda n, m: display(plots[n]), {'n': slider, 'm': player})
    return display(widget, iout)

## Data Exploration

In [5]:
df = pd.read_csv('https://raw.githubusercontent.com/HIL-HK/lets-plot-examples/master/data/nobel.csv')

In [6]:
df.head()

Unnamed: 0,firstname,surname,born_country_code,died_country_code,gender,year,category,share,name_of_university,city_of_university,country_of_university,born_month,age,age_get_prize
0,Wilhelm Conrad,Röntgen,DE,DE,male,1901,physics,1,Munich University,Munich,Germany,Mar,78,56
1,Hendrik A.,Lorentz,NL,NL,male,1902,physics,2,Leiden University,Leiden,the Netherlands,Jul,75,49
2,Pieter,Zeeman,NL,NL,male,1902,physics,2,Amsterdam University,Amsterdam,the Netherlands,May,78,37
3,Henri,Becquerel,FR,FR,male,1903,physics,2,École Polytechnique,Paris,France,Dec,56,51
4,Pierre,Curie,FR,FR,male,1903,physics,4,École municipale de physique et de chimie indu...,Paris,France,May,47,44


In [7]:
df.count() - df.isna().sum()

firstname                923
surname                  919
born_country_code        923
died_country_code        311
gender                   923
year                     923
category                 923
share                    923
name_of_university       471
city_of_university       461
country_of_university    463
born_month               923
age                      923
age_get_prize            923
dtype: int64

In [8]:
cat_cols = ['born_country_code', 'died_country_code', 'gender', \
            'category', 'share', 'country_of_university', 'born_month']
for column in cat_cols:
    df[column] = df[column].astype('category')

In [9]:
corr_df = cramers_corr(df)
plot_corr_matrix(corr_df)

Only the `born_country_code` column is fully filled, so we forced to use this column for construction of animated bar chart. According to correlation plot, two other columns with countries are moderate correlated with chosen one, and we has no obvious way to transfer our results for `born_country_code` to them.

In [10]:
def modulo_sort(array, m):
    pairs = [(e, e % m) for e in array]
    pairs = sorted(pairs, key=lambda e: (e[1], e[0]))
    return list(list(zip(*pairs))[0])

In [11]:
def get_sorted_countries(df, *, year=None, ascending=False):
    year = year or df.year.max()
    local_df = df[df.year == year]
    return local_df.groupby(['country']).cum.sum().sort_values(ascending=ascending).index.to_list()

## Animated Bar Chart

In [12]:
TOP_SIZE = 10
PALETTE = cc.palette['glasbey_hv']

In [13]:
iso_df = pd.read_csv('https://raw.githubusercontent.com/HIL-HK/lets-plot-examples/master/data/iso_3166.csv')

In [14]:
cum_df = df.groupby(['year', 'born_country_code', 'category']).size()\
           .groupby(['born_country_code', 'category']).cumsum().to_frame(name='cum').reset_index()
cum_df = cum_df.merge(iso_df[['iso_a2', 'name_1']], left_on='born_country_code', right_on='iso_a2', how='left')\
               .rename(columns={'name_1': 'country'})[['year', 'country', 'category', 'cum']]
cum_df.country = cum_df.country.astype('category')

In [15]:
country_colors = {country: PALETTE[i] for i, country in enumerate(get_sorted_countries(cum_df))}

In [16]:
plots = []
for year in cum_df.year.unique():
    countries = get_sorted_countries(cum_df, year=year)
    top_size = min(TOP_SIZE, len(countries))
    countries = countries[:top_size]
    local_df = cum_df[cum_df.year == year]
    local_df = local_df[local_df.country.isin(countries)]
    local_df.country.cat.set_categories(countries, inplace=True)
    local_df = local_df.sort_values(['country', 'cum', 'category'])
    local_df['color'] = local_df.country.apply(lambda country_code: country_colors[country_code])
    plots.append(
        ggplot() + \
        geom_bar(aes(x='country', y='cum', group='category', fill='color'), \
                 data=local_df, stat='identity', color='white', \
                 sampling=sampling_pick(local_df.shape[0]), \
                 tooltips=layer_tooltips().line('@country')\
                                          .line('category|@category')\
                                          .line('laureates number|@cum')) + \
        scale_fill_identity() + \
        scale_x_discrete(name='born country') + \
        scale_y_continuous(name='total laureates number') + \
        ggtitle('Nobel Laureates up to {0}'.format(year)) + \
        ggsize(600, 450) + \
        theme(legend_position='none')
    )

In [17]:
player_widget(plots)

HBox(children=(Play(value=0, interval=1000, max=115), IntSlider(value=0, max=115)))

Output()