# Tutorial : https://towardsdatascience.com/bar-chart-race-in-python-with-matplotlib-8e687a5c8a41

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.animation as animation
from IPython.display import HTML

In [None]:
# Load the tutorial's city-population dataset, keeping only the four columns
# used by the charts below.
df = pd.read_csv(
    'https://gist.githubusercontent.com/johnburnmurdoch/'
    '4199dbe55095c3e13de8d5b2e5e5307a/raw/'
    'fa018b25c24b7b5f47fd0568937ff6c04e384786/city_populations',
    usecols=['name', 'group', 'year', 'value'],
)
df.head(3)

In [None]:
# Select the ten most populous cities for the chosen year.
# The sort must be descending before `head(10)`: with an ascending sort,
# `head(10)` would return the ten *smallest* cities instead.
current_year = 2018
dff = (df[df['year'].eq(current_year)]
       .sort_values(by='value', ascending=False)
       .head(10))
dff

In [None]:
# First rough horizontal bar chart of the selected rows (one bar per city).
fig, ax = plt.subplots(figsize=(15, 8))
ax.barh(y=dff['name'], width=dff['value'])

In [None]:
# Bar colour for each label of the `group` column.
colors = {
    'India': '#adb0ff',
    'Europe': '#ffb3ff',
    'Asia': '#90d595',
    'Latin America': '#e48381',
    'Middle East': '#aafbff',
    'North America': '#f7bb5f',
    'Africa': '#eafb50',
}
# Lookup from city name to its group; on duplicate names the last row wins.
group_lk = dict(zip(df['name'], df['group']))

In [None]:
fig, ax = plt.subplots(figsize=(15, 8))
# Reverse the row order so the rows are drawn from the other end.
dff = dff.iloc[::-1]
# One colour per bar, looked up through the city's group.
bar_colors = [colors[group_lk[city]] for city in dff['name']]
ax.barh(dff['name'], dff['value'], color=bar_colors)
# Annotate every bar with its label, group and value (Tokyo, Asia, 38194.2).
for row, (name, value) in enumerate(zip(dff['name'], dff['value'])):
    ax.text(value, row, name, ha='right')                    # Tokyo: name
    ax.text(value, row - .25, group_lk[name], ha='right')    # Asia: group name
    ax.text(value, row, value, ha='left')                    # 38194.2: value
# Print the year in the right-middle portion of the canvas.
ax.text(1, 0.4, current_year, transform=ax.transAxes, size=46, ha='right')

In [None]:
fig, ax = plt.subplots(figsize=(15, 8))


def draw_barchart(year):
    """Redraw the global `ax` with the ten largest `value` rows of `df` for `year`.

    Bars are coloured through `group_lk` and `colors`; each bar is annotated
    with its name, its group and its formatted value.
    """
    top10 = (df[df['year'].eq(year)]
             .sort_values(by='value', ascending=True)
             .tail(10))
    names, values = top10['name'], top10['value']

    ax.clear()
    ax.barh(names, values, color=[colors[group_lk[city]] for city in names])

    # Small horizontal gap between a bar's end and its labels.
    pad = values.max() / 200
    for row, (amount, city) in enumerate(zip(values, names)):
        ax.text(amount - pad, row, city, size=14, weight=600, ha='right', va='bottom')
        ax.text(amount - pad, row - .25, group_lk[city], size=10, color='#444444',
                ha='right', va='baseline')
        ax.text(amount + pad, row, f'{amount:,.0f}', size=14, ha='left', va='center')

    # Titles, axis formatting and credits.
    ax.text(1, 0.4, year, transform=ax.transAxes, color='#777777', size=46,
            ha='right', weight=800)
    ax.text(0, 1.06, 'Population (thousands)', transform=ax.transAxes, size=12,
            color='#777777')
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
    ax.xaxis.set_ticks_position('top')
    ax.tick_params(axis='x', colors='#777777', labelsize=12)
    ax.set_yticks([])
    ax.margins(0, 0.01)
    ax.grid(which='major', axis='x', linestyle='-')
    ax.set_axisbelow(True)
    ax.text(0, 1.12, 'The most populous cities in the world from 1500 to 2018',
            transform=ax.transAxes, size=24, weight=600, ha='left')
    ax.text(1, 0, 'by @pratapvardhan; credit @jburnmurdoch', transform=ax.transAxes,
            ha='right', color='#777777',
            bbox=dict(facecolor='white', alpha=0.8, edgecolor='white'))
    plt.box(False)


draw_barchart(2018)

In [None]:
import matplotlib.animation as animation
from IPython.display import HTML
# Fresh figure: `draw_barchart` draws on the global `ax` rebound here.
fig, ax = plt.subplots(figsize=(15, 8))
# One frame per year from 1968 to 2018, rendered as an inline JS/HTML player.
animator = animation.FuncAnimation(fig=fig, func=draw_barchart,
                                   frames=range(1968, 2019))
HTML(animator.to_jshtml())

In [None]:
import matplotlib.animation as animation
from IPython.display import HTML
# Fresh figure: `draw_barchart` draws on the global `ax` rebound here.
fig, ax = plt.subplots(figsize=(15, 8))
# One frame per year from 1500 to 2018.
animator = animation.FuncAnimation(fig, draw_barchart, frames=range(1500, 2019))
# The "avconv" writer is no longer shipped with Matplotlib; "ffmpeg" is the
# supported movie writer and accepts the same fps/codec options.
animator.save('image.mp4', fps=10, writer='ffmpeg', codec='libx264')

# Validator data

Ideas to improve data quality:
- Filter out some tasks (e.g. those validated in less than 5 min?)
- Merge duplicate user accounts (e.g. Joean-Yves Longchamp and JYL45)

In [None]:
# Execute the project's helper scripts with IPython's `%run` magic.
# NOTE(review): later cells call `get_data_dir()` and use `os`, `np` and
# `datetime` without importing them in this notebook — presumably those names
# come from these scripts; confirm.
%run ../tasking_manager_stats/data_management
%run ../tasking_manager_stats/map_tools

In [None]:
# Load the merged raw statistics of several projects from the data directory.
df = pd.read_csv(os.path.join(get_data_dir(), 'merged_stats.csv'))
print(f'{df.shape[0]} lines')
df.head()

In [None]:
# Keep only validation rows from the first semester (months 1-6) of 2019.
df2 = df[
    (df['Type'] == 'VALIDATION')
    & (df['Year'] == 2019)
    & (df['Month'] <= 6)
]
print(f'{df2.shape[0]} lines')
df2.head()

In [None]:
# Collapse repeated validations: one row per project, task, date and author
# (the remaining columns keep their per-group maximum).
df3 = (
    df2
    .groupby(['Project', 'Task', 'Year', 'Month', 'Day', 'Rel. Day', 'Author'])
    .max()
    .reset_index()
)
print(f'{df3.shape[0]} lines')
df3.head()

In [None]:
# Number of validated tasks per author and calendar day.
df4 = (
    df3
    .groupby(['Author', 'Year', 'Month', 'Day'])
    .count()['Task']
    .reset_index()
)
print(f'{df4.shape[0]} lines')
df4.head()

In [None]:
# Add a column for the day of the year.
# `pd.datetime` has been removed from pandas and a row-by-row loop is not
# needed: `pd.to_datetime` assembles dates from year/month/day columns in one
# call (the columns are lower-cased to match the names it expects).
df4['Day_of_year'] = pd.to_datetime(
    df4[['Year', 'Month', 'Day']].rename(columns=str.lower)
).dt.dayofyear
df4 = df4[['Author', 'Day_of_year', 'Task']]
print(f'{len(df4)} lines')
df4.head()

In [None]:
# Add a zero-task row for every (author, day) pair, from day 1 to the last
# observed day, so that each author ends up with a line for every day.
# The filler rows are built once: concatenating inside the nested loop copies
# the whole frame on every iteration.
last_day = max(df4['Day_of_year'], default=0)
zero_rows = pd.DataFrame(
    [(author, day, 0)
     for author in df4['Author'].unique()
     for day in range(1, last_day + 1)],
    columns=['Author', 'Day_of_year', 'Task'],
)
df4 = pd.concat([df4, zero_rows], axis=0, ignore_index=True)
print(f'{len(df4)} lines')
df4.tail()

In [None]:
# Merge the zero filler rows with the real ones: exactly one row per author
# and day, holding the summed task count.
df4 = (
    df4
    .groupby(['Author', 'Day_of_year'])
    .sum()
    .reset_index()
)
print(f'{df4.shape[0]} lines')
df4.head()

In [None]:
# Sanity check: number of authors times number of days, to compare with the
# line count printed above.
df4['Author'].unique().size * max(df4['Day_of_year'])

In [None]:
# Add a `value` column holding each author's running total of tasks.
# A grouped cumulative sum replaces the manual per-author loop and yields a
# numeric column instead of an object column initialised with None.
# Rows are accumulated in their current order within each author.
df5 = df4.copy()
df5['value'] = df5.groupby('Author')['Task'].cumsum()
print(f'{len(df5)} lines')
df5[df5['Author'] == 'Anaximandre'].head(10)

In [None]:
# Rename `Author` to `name` (the column the chart code expects) and keep
# only the columns needed for plotting.
df6 = df5.rename(index=int, columns={'Author': 'name'})[
    ['name', 'Day_of_year', 'value']
]
print(f'{df6.shape[0]} lines')
df6[df6['name'] == 'Anaximandre'].head()

In [None]:
fig, ax = plt.subplots(figsize=(15, 8))


def draw_barchart(day_of_year):
    """Redraw the global `ax` with the ten highest `value` rows of `df6`
    for `day_of_year`, each bar annotated with its name and value."""
    top10 = (df6[df6['Day_of_year'].eq(day_of_year)]
             .sort_values(by='value', ascending=True)
             .tail(10))
    names, totals = top10['name'], top10['value']

    ax.clear()
    ax.barh(names, totals) # TODO color

    # Small horizontal gap between a bar's end and its labels.
    pad = totals.max() / 200
    for row, (total, name) in enumerate(zip(totals, names)):
        ax.text(total - pad, row, name, size=14, weight=600, ha='right', va='bottom')
        ax.text(total + pad, row, f'{total:,.0f}', size=14, ha='left', va='center')

    # Titles, axis formatting and credits.
    ax.text(1, 0.4, day_of_year, transform=ax.transAxes, color='#777777', size=46,
            ha='right', weight=800)
    ax.text(0, 1.06, 'Validated tasks', transform=ax.transAxes, size=12, color='#777777')
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
    ax.xaxis.set_ticks_position('top')
    ax.tick_params(axis='x', colors='#777777', labelsize=12)
    ax.set_yticks([])
    ax.margins(0, 0.01)
    ax.grid(which='major', axis='x', linestyle='-')
    ax.set_axisbelow(True)
    ax.text(0, 1.12, 'Number of validated tasks on CartONG project in S1 2019',
            transform=ax.transAxes, size=24, weight=600, ha='left')
    ax.text(1, 0, 'by CartONG credit @pratapvardhan, @jburnmurdoch',
            transform=ax.transAxes, ha='right', color='#777777',
            bbox=dict(facecolor='white', alpha=0.8, edgecolor='white'))
    plt.box(False)


draw_barchart(150)

In [None]:
# Seed Python's global random generator so the random colours are reproducible.
# Only `seed` is imported, so it must be called directly: `random.seed(...)`
# would raise NameError because the `random` module itself is not bound here.
from random import seed
seed(14)

In [None]:
# Use color from https://medium.com/@6berardi/how-to-create-a-smooth-bar-chart-race-with-python-ad2daf6510dc
from random import randint
import matplotlib.colors as mc
import colorsys
def transform_color(color, amount = 0.5):
    """Return `color` with rescaled lightness, as an (r, g, b) tuple.

    `color` is a matplotlib colour name or any spec accepted by
    `matplotlib.colors.to_rgb` (e.g. a '#RRGGBB' string). The colour is
    converted to HLS, its lightness `l` is replaced by
    `1 - amount * (1 - l)` clamped to [0, 1], and converted back to RGB.
    `amount` below 1 lightens, above 1 darkens, and 1 leaves it unchanged.
    """
    # Resolve named colours to their hex code; other specs pass through.
    try:
        spec = mc.cnames[color]
    except (KeyError, TypeError):
        spec = color
    # The conversion must run for named colours too, not only in the fallback.
    hue, lightness, saturation = colorsys.rgb_to_hls(*mc.to_rgb(spec))
    # Clamp: with amount > 1 a dark colour would otherwise get a negative
    # lightness and produce RGB components outside the 0-1 range.
    lightness = min(1.0, max(0.0, 1 - amount * (1 - lightness)))
    return colorsys.hls_to_rgb(hue, lightness, saturation)

all_names = df6['name'].unique().tolist()
# Draw one random '#RRGGBB' colour per name.
random_hex_colors = ['#' + '%06X' % randint(0, 0xFFFFFF) for _ in all_names]

# Fill colours (with an alpha channel appended) and darker edge colours.
rgb_colors = [transform_color(hex_color, 1) for hex_color in random_hex_colors]
rgb_colors_opacity = [rgb + (0.825,) for rgb in rgb_colors]
rgb_colors_dark = [transform_color(hex_color, 1.12) for hex_color in random_hex_colors]
normal_colors = dict(zip(all_names, rgb_colors_opacity))
dark_colors = dict(zip(all_names, rgb_colors_dark))

In [None]:
# Add an additional column used as a secondary sort key: 1 for names whose
# summed `value` on day 20 is positive, 0 otherwise.
df6['initial_ranking'] = 0
for name in df6['name'].unique():
    is_name = df6['name'] == name
    day_20_total = df6[is_name & (df6['Day_of_year'] == 20)]['value'].sum()
    if day_20_total > 0:
        df6.loc[is_name, 'initial_ranking'] = 1
print(f'{df6.shape[0]} lines')
df6[df6['name'] == 'Anaximandre'].head()

In [None]:
fig, ax = plt.subplots(figsize=(15, 8))
def draw_barchart(day_of_year):
    """Redraw the global `ax` with the ten highest `value` rows of `df6`
    for `day_of_year`.

    Ties on `value` are ordered by `initial_ranking`. Bars use the per-name
    fill colours of `normal_colors` and edge colours of `dark_colors`, and
    the day is displayed as a 2019 calendar date.
    """
    # Imported locally: the notebook never imports `datetime` explicitly.
    import datetime

    dff = df6[df6['Day_of_year'].eq(day_of_year)].sort_values(by=['value', 'initial_ranking'], ascending=True).tail(10)
    ax.clear()
    # `linewidth` is given as a number (it is a float property), not a string.
    ax.barh(dff['name'], dff['value'], color=[normal_colors[x] for x in dff['name']], height=0.8,
            edgecolor=[dark_colors[x] for x in dff['name']], linewidth=3)
    # Small horizontal gap between a bar's end and its labels.
    dx = dff['value'].max() / 200
    for i, (value, name) in enumerate(zip(dff['value'], dff['name'])):
        ax.text(value-dx, i,     name,           size=14, weight=600, ha='right', va='bottom')
        ax.text(value+dx, i,     value,  size=14, ha='left',  va='center')
    # ... polished styles
    # Day 1 maps to January 1st, 2019.
    date = (datetime.datetime(2019, 1, 1) + datetime.timedelta(day_of_year - 1)).strftime('%B %d')
    ax.text(1, 0.4, date, transform=ax.transAxes, color='#777777', size=46, ha='right', weight=800)
    ax.text(0, 1.06, 'Validated tasks', transform=ax.transAxes, size=12, color='#777777')
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:.0f}'))
    ax.xaxis.set_ticks_position('top')
    ax.tick_params(axis='x', colors='#777777', labelsize=12)
    ax.set_yticks([])
    ax.margins(0, 0.01)
    ax.grid(which='major', axis='x', linestyle='-')
    ax.set_axisbelow(True)
    ax.text(0, 1.12, 'Number of validated tasks on CartONG project in S1 2019',
            transform=ax.transAxes, size=24, weight=600, ha='left')
    ax.text(1, 0, 'by CartONG credit @pratapvardhan, @jburnmurdoch', transform=ax.transAxes, ha='right',
            color='#777777', bbox=dict(facecolor='white', alpha=0.8, edgecolor='white'))
    plt.box(False)

draw_barchart(150)

In [None]:
# Same chart with a logarithmic x axis
fig, ax = plt.subplots(figsize=(15, 8))
def draw_barchart(day_of_year):
    """Redraw the global `ax` with the ten highest `value` rows of `df6`
    for `day_of_year`, using a logarithmic x axis.

    Ties on `value` are ordered by `initial_ranking`. Labels are offset
    multiplicatively (x0.95 / x1.05) rather than by a fixed gap, so the
    offset looks the same everywhere on the log scale.
    """
    # Imported locally: the notebook never imports `datetime` explicitly.
    import datetime

    dff = df6[df6['Day_of_year'].eq(day_of_year)].sort_values(by=['value', 'initial_ranking'], ascending=True).tail(10)
    ax.clear()
    # `linewidth` is given as a number (it is a float property), not a string.
    ax.barh(dff['name'], dff['value'], color=[normal_colors[x] for x in dff['name']], height=0.8,
            edgecolor=[dark_colors[x] for x in dff['name']], linewidth=3, log=True)
    # Add ('', 0) in ax.barh ?
    # The former additive gap `dx` was never used here; it depended on `np`
    # (not imported in this notebook) and evaluated log(0) when all ten
    # values were equal, so it has been dropped.
    for i, (value, name) in enumerate(zip(dff['value'], dff['name'])):
        ax.text(value*.95, i,     name,           size=14, weight=600, ha='right', va='center')
        ax.text(value*1.05, i,     value,  size=14, ha='left',  va='center')
    # ... polished styles
    # Day 1 maps to January 1st, 2019.
    date = (datetime.datetime(2019, 1, 1) + datetime.timedelta(day_of_year - 1)).strftime('%B %d')
    ax.text(1, 0.4, date, transform=ax.transAxes, color='#777777', size=46, ha='right', weight=800)
    ax.text(0, 1.06, 'Validated tasks', transform=ax.transAxes, size=12, color='#777777')
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:.0f}'))
    ax.xaxis.set_ticks_position('top')
    ax.tick_params(axis='x', colors='#777777', labelsize=12)
    ax.set_yticks([])
    ax.grid(which='major', axis='x', linestyle='-')
    ax.set_axisbelow(True)
    ax.text(0, 1.12, 'Number of validated tasks on CartONG project in S1 2019',
            transform=ax.transAxes, size=24, weight=600, ha='left')
    ax.text(1, 0, 'by CartONG credit @pratapvardhan, @jburnmurdoch', transform=ax.transAxes, ha='right',
            color='#777777', bbox=dict(facecolor='white', alpha=0.8, edgecolor='white'))
    plt.box(False)

draw_barchart(150)

In [None]:
import matplotlib.animation as animation
from IPython.display import HTML
# Fresh figure: `draw_barchart` draws on the global `ax` rebound here.
fig, ax = plt.subplots(figsize=(15, 8))
# One frame per day. `range` excludes its stop value, so 1 is added to also
# render the last day present in the data.
animator = animation.FuncAnimation(fig, draw_barchart, frames=range(1, max(df6['Day_of_year']) + 1))
HTML(animator.to_jshtml())

In [None]:
import matplotlib.animation as animation
from IPython.display import HTML
# Fresh figure: `draw_barchart` draws on the global `ax` rebound here.
fig, ax = plt.subplots(figsize=(15, 8))
# One frame per day. `range` excludes its stop value, so 1 is added to also
# render the last day present in the data.
animator = animation.FuncAnimation(fig, draw_barchart, frames=range(1, max(df6['Day_of_year']) + 1))
animator.save('validator_s1_2019.mp4', fps=3)

# Time spent by project

In [None]:
# Back to the full raw data for the per-project analysis.
print(f'{df.shape[0]} lines')
df.head()

In [None]:
# Total time spent per project and calendar day.
# Only `Duration` is aggregated: summing every column first would also try to
# add up the non-numeric ones (e.g. `Type`, `Author`), which is wasteful and
# can fail on columns that cannot be summed.
df2 = (
    df.groupby(['Project', 'Year', 'Month', 'Day'])['Duration']
    .sum()
    .reset_index()
)
print(f'{len(df2)} lines')
df2.head()

In [None]:
# Add a column for the day of the year; everything before 2019 is folded
# into day 0.
# `pd.datetime` has been removed from pandas and a row-by-row loop is not
# needed: `pd.to_datetime` assembles dates from year/month/day columns in one
# call. Only the rows from 2019 onwards are converted, as before.
from_2019 = df2['Year'] >= 2019
df2['Day_of_year'] = 0
df2.loc[from_2019, 'Day_of_year'] = pd.to_datetime(
    df2.loc[from_2019, ['Year', 'Month', 'Day']].rename(columns=str.lower)
).dt.dayofyear
df2 = df2[['Project', 'Day_of_year', 'Duration']]
print(f'{len(df2)} lines')
df2.head()

In [None]:
# Keep only the rows whose day of year is at most MAX_DAY.
MAX_DAY = 188
df2 = df2.loc[MAX_DAY >= df2['Day_of_year']]

In [None]:
# Add a zero-duration row for every (project, day) pair, for days 0 to
# MAX_DAY - 1, so that each project has a line on each of those days.
# The filler rows are built once: concatenating inside the nested loop copies
# the whole frame on every iteration.
zero_rows = pd.DataFrame(
    [(project, day, 0)
     for project in df2['Project'].unique()
     for day in range(0, MAX_DAY)],
    columns=['Project', 'Day_of_year', 'Duration'],
)
df2 = pd.concat([df2, zero_rows], axis=0, ignore_index=True)
print(f'{len(df2)} lines')
df2.tail()

In [None]:
# Merge the zero filler rows with the real ones: exactly one row per project
# and day, holding the summed duration.
df3 = (
    df2
    .groupby(['Project', 'Day_of_year'])
    .sum()
    .reset_index()
)
print(f'{df3.shape[0]} lines')
df3.head()

In [None]:
# Sanity check: number of projects times MAX_DAY, to compare with the line
# count printed above.
df3['Project'].unique().size * MAX_DAY

In [None]:
# Add a `value` column holding each project's running total of duration.
# A grouped cumulative sum replaces the manual per-project loop and yields a
# numeric column instead of an object column initialised with None.
# Rows are accumulated in their current order within each project.
df4 = df3.copy()
df4['value'] = df4.groupby('Project')['Duration'].cumsum()
print(f'{len(df4)} lines')
df4.head(10)

In [None]:
# Load the project -> country table, strip spaces from country names and
# drop duplicated rows.
df_countries = pd.read_csv(os.path.join(get_data_dir(), 'Mapathons_countries.csv'), encoding='ISO-8859-1')
df_countries['Country'] = df_countries['Country'].apply(lambda s: s.replace(' ', ''))
df_countries = df_countries.drop_duplicates()
# Subgroup Africa :  http://www.actualite-ouest-africaine.org/content/fr/les-six-r%C3%A9gions-de-l%E2%80%99union-africaine
# The label 'Central Africa' must match the key of the `colors` dictionary
# used by the chart; the former spelling 'Center Africa' had no colour entry.
african_regions = {
    'North Africa': ['Algérie'],
    'West Africa': ['BurkinaFaso', 'Niger', 'Sénégal'],
    'South Africa': ['Angola'],
    'Est Africa': ['Kenya', 'Madagascar', 'Tanzanie', 'Ethiopie', 'Ouganda', 'Soudan'],
    'Central Africa': ['RDC', 'Cameroun', 'Tchad', 'Kalémie', 'Congo'],
}
for region, countries in african_regions.items():
    df_countries.loc[df_countries['Country'].isin(countries), 'Group'] = region
df_countries.head()

In [None]:
# Rows whose group is still the generic 'Africa' label.
df_countries.loc[df_countries['Group'] == 'Africa']

In [None]:
# Attach country and group to every project row (default inner join on
# `Project`).
df5 = pd.merge(left=df4, right=df_countries, on='Project')
df5.head()

In [None]:
# Rename `Project` to `name` (the column the chart code expects) and keep
# only the columns needed for plotting.
df6 = df5.rename(index=int, columns={'Project': 'name'})[
    ['name', 'Day_of_year', 'value', 'Country', 'Group']
]
# Project identifiers are shown as text labels; values are divided by 3600
# to be displayed in hours.
df6['name'] = df6['name'].apply(str)
df6['value'] = df6['value'] / 3600
print(f'{df6.shape[0]} lines')
df6.head()

In [None]:
# List the distinct group labels present in the merged data; the chart looks
# up one colour per group label.
df6['Group'].unique()

In [None]:
# Bar colour for each project group.
# 'South Africa' is included because the country grouping assigns that label
# (Angola) and the chart looks colours up by group; without an entry the
# lookup raises KeyError.
colors = {
    'Central America': '#adb0ff',
    'South America': '#ffb3ff',
    'Asia': '#90d595',
    'Est Africa': '#e48381',
    'West Africa': '#aafbff',
    'Central Africa': '#f7bb5f',
    'North Africa': '#eafb50',
    'South Africa': '#c9a0dc',
}

In [None]:
fig, ax = plt.subplots(figsize=(15, 8))
def draw_barchart(day_of_year):
    """Redraw the global `ax` with the ten highest `value` rows (hours) of
    `df6` for `day_of_year`.

    Bars are coloured by the project's group; each bar is annotated with the
    project name, the hours spent and "country ( group )". The day is
    displayed as a 2019 calendar date.
    """
    # Imported locally: the notebook never imports `datetime` explicitly.
    import datetime

    dff = df6[df6['Day_of_year'].eq(day_of_year)].sort_values(by='value', ascending=True).tail(10)
    ax.clear()
    # Groups without an entry in `colors` fall back to a neutral grey instead
    # of aborting the whole frame with a KeyError.
    ax.barh(dff['name'], dff['value'], color=[colors.get(x, '#cccccc') for x in dff['Group']])
    # Small horizontal gap between a bar's end and its labels.
    dx = dff['value'].max() / 200
    for i, (value, name, country, group) in enumerate(zip(dff['value'], dff['name'], dff['Country'], dff['Group'])):
        ax.text(value-dx, i,     name,           size=14, weight=600, ha='right', va='bottom')
        ax.text(value+dx, i,      f'{value:.1f}',  size=14, ha='left',  va='center')
        ax.text(value-dx, i-.25, country + ' ( ' + group + ' )', size=10, color='#444444', ha='right', va='baseline')
    # ... polished styles
    # Day 1 maps to January 1st, 2019.
    date = (datetime.datetime(2019, 1, 1) + datetime.timedelta(day_of_year - 1)).strftime('%B %d')
    ax.text(1, 0.4, date, transform=ax.transAxes, color='#777777', size=46, ha='right', weight=800)
    # Axis caption (typo 'Tme spend' fixed).
    ax.text(0, 1.06, 'Time spent (hours)', transform=ax.transAxes, size=12, color='#777777')
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:.0f}'))
    ax.xaxis.set_ticks_position('top')
    ax.tick_params(axis='x', colors='#777777', labelsize=12)
    ax.set_yticks([])
    ax.margins(0, 0.01)
    ax.grid(which='major', axis='x', linestyle='-')
    ax.set_axisbelow(True)
    ax.text(0, 1.12, 'Hours spent on CartONG projects finished in S1 2019',
            transform=ax.transAxes, size=24, weight=600, ha='left')
    ax.text(1, 0, 'by CartONG credit @pratapvardhan, @jburnmurdoch', transform=ax.transAxes, ha='right',
            color='#777777', bbox=dict(facecolor='white', alpha=0.8, edgecolor='white'))
    plt.box(False)

draw_barchart(150)

In [None]:
import matplotlib.animation as animation
from IPython.display import HTML
# Fresh figure: `draw_barchart` draws on the global `ax` rebound here.
fig, ax = plt.subplots(figsize=(15, 8))
# One frame per day, from day 1 up to (but excluding) the largest day in df6,
# rendered as an inline JS/HTML player.
animator = animation.FuncAnimation(fig=fig, func=draw_barchart,
                                   frames=range(1, max(df6['Day_of_year'])))
HTML(animator.to_jshtml())

In [None]:
import matplotlib.animation as animation
from IPython.display import HTML
# Fresh figure: `draw_barchart` draws on the global `ax` rebound here.
fig, ax = plt.subplots(figsize=(15, 8))
# One frame per day, from day 1 up to (but excluding) the largest day in df6,
# written to a video file at 3 frames per second.
animator = animation.FuncAnimation(fig=fig, func=draw_barchart,
                                   frames=range(1, max(df6['Day_of_year'])))
animator.save('time_spent_s1_2019.mp4', fps=3)