# Preprocessing data

In [1]:
import json
import numpy as np
import csv 
import sys
import locale
locale.setlocale(locale.LC_ALL, 'en_US')

import pandas as pd

from bokeh.io import show, curdoc
from bokeh.layouts import layout
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.plotting import figure
from bokeh.sampledata.degrees import data
from bokeh.themes import Theme

dictCountries={
    "Alemania":"Germany", 
    "Austria":"Austria",
    "Bélgica":"Belgium",
    "Bulgaria":"Bulgaria",
    "Chipre":"Cyprus",
    "Croacia":"Croatia",
    "Dinamarca":"Denmark",
    "Eslovenia":"Slovenia",
    "Estonia":"Estonia",
    "Finlandia":"Finland",
    "Francia":"France",
    "Grecia":"Greece",
    "Holanda":"Holland",
    "Hungría":"Hungary",
    "Irlanda":"Ireland",
    "Italia":"Italy",
    "Letonia":"Latvia",
    "Lituania":"Lithuania",
    "Luxemburgo":"Luxembourg",
    "Malta":"Malta",
    "Polonia":"Poland",
    "Portugal":"Portugal",
    "Reino Unido":"United Kingdom",
    "República Checa":"Czech Rep.",
    "República Eslovaca":"Slovakia",
    "Rusia":"Russia",
    "Rumanía":"Romania",
    "Suecia":"Sweden",
    "Federación de Rusia":"Russia",
    "Noruega":"Norway",
    "Serbia":"Serbia",
    "Suiza":"Switzerland",
    "Ucrania":"Ukraine"}

invdictCountries = {v: k for k, v in dictCountries.items()}

#Data from: Instuto Nacional de Estadística www.ine.es
f = open("./sources/viajeros.txt", "r")
reader = csv.reader(f)

max_value = -1

names = []
viaj2000_list = []
viaj2016_list = []

for row in reader:
    tokens=row[0].split(";")
    name=tokens[0]
    if name in ["Eslovaquia", "Hungría", "Islandia"]:
        continue
    names.append(name)
    tokens[1]=tokens[1].split(".")[0]
    tokens[2]=tokens[2].split(".")[0]
    
    viaj2000=int(tokens[1])
    if(max_value < viaj2000):
        max_value = viaj2000
    viaj2000_list.append(viaj2000)
    viaj2016=int(tokens[2])
    if(max_value < viaj2016):
        max_value = viaj2016
    
    viaj2016_list.append(viaj2016)


    
f.close()

print(names[1:])
print(viaj2000_list[1:])
print(viaj2016_list[1:])

source = ColumnDataSource(
    data=dict(
        namesAtt=names[1:],
        v2000Att=viaj2000_list[1:],
        v2016Att=viaj2016_list[1:],
    ),
    column_names=["Name", "Travelers_2000", "Travelers_2016"]
    
)

print(source.column_names)


['Alemania', 'Austria', 'Bélgica', 'Dinamarca', 'Finlandia', 'Francia', 'Grecia', 'Irlanda', 'Italia', 'Luxemburgo', 'Países Bajos', 'Portugal', 'Polonia', 'Reino Unido', 'República Checa', 'Suecia', 'Noruega', 'Rusia', 'Suiza']
[6352796, 210910, 953410, 209614, 2486487, 2486487, 78825, 198034, 1690482, 65586, 1032814, 939173, 180289, 5625361, 169135, 442104, 213573, 195199, 478528]
[7933484, 474167, 1346973, 605733, 357970, 5646970, 111152, 794803, 2836854, 88948, 1892615, 1347136, 785146, 10208186, 234094, 1053996, 589194, 803503, 954249]
['namesAtt', 'v2000Att', 'v2016Att']


In [3]:
import numpy
from bokeh.palettes import PuBu
from bokeh.io import show, output_notebook
from bokeh.models import ColumnDataSource, ranges, LabelSet
from bokeh.plotting import figure

output_notebook()

rang_list = list(range(0, len(names[1:])))

rang1 = [x + 0.3 for x in rang_list]
rang2 = [x + 0.8 for x in rang_list]

total_2000 = np.sum(viaj2000_list)
total_2016 = np.sum(viaj2016_list)



log_viaj2000_list = [x * 1.0/total_2000  if x != 0 else 0 for x in viaj2000_list]
log_viaj2016_list = [x * 1.0/total_2016   if x != 0 else 0 for x in viaj2016_list]

print(log_viaj2000_list)

x_label = "Países"
y_label = "Visitantes de otros países (%)"
title = "Visitantes por país de residencia (2000 vs 2016)"
plot = figure(plot_width=700, plot_height=400, tools="save",
        x_axis_label = x_label,
        y_axis_label = y_label,
        title=title,
        x_range = names[1:])

#labels = LabelSet(x='x', y='y', text='y', level='glyph', angle=45,
#        x_offset=-13.5, y_offset=0, source=source1, render_mode='canvas')

plot.vbar(x=rang1, top=log_viaj2000_list[1:], bottom=0,width=0.4,color='#AE9E59', legend='2000')
plot.vbar(x=rang2, top=log_viaj2016_list[1:], bottom=0,width=0.4,color='#4F4478', legend='2016')

plot.xaxis.major_label_orientation = 45


#plot.add_layout(labels)
show(plot)

[8.3295825917054767e-05, 0.26458069485128094, 0.0087839613220830112, 0.03970753669378959, 0.0087299856268887588, 0.10355699414850987, 0.10355699414850987, 0.0032828967389559212, 0.0082477027948290125, 0.070405047193957293, 0.0027315200192979771, 0.0430145475743485, 0.039114595356999037, 0.0075086605793799437, 0.23428454528829457, 0.0070441197582405291, 0.01841270891061679, 0.0088948697142915683, 0.0081296309615915864, 0.019929692492218192]


In [3]:
from bokeh.io import show, output_file, output_notebook
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.plotting import figure

#output_file("bars.html")
output_notebook()

fruits = names[1:]
years = ['2015', '2016']

data = {'fruits' : fruits,
        '2015'   : viaj2000_list[1:],
        '2016'   : viaj2016_list[1:]}

# this creates [ ("Apples", "2015"), ("Apples", "2016"), ("Apples", "2017"), ("Pears", "2015), ... ]
x = [ (fruit, year) for fruit in fruits for year in years ]
counts = sum(zip(data['2015'], data['2016']), ()) # like an hstack

print(counts)
source = ColumnDataSource(data=dict(x=x, counts=counts))

p = figure(x_range=FactorRange(*x), plot_height=250, title="Fruit Counts by Year",
           toolbar_location=None, tools="")

p.vbar(x='x', top='counts', width=0.9, source=source)

p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = 45
show(p)

(6352796, 7933484, 210910, 474167, 953410, 1346973, 209614, 605733, 2486487, 357970, 2486487, 5646970, 78825, 111152, 198034, 794803, 1690482, 2836854, 65586, 88948, 1032814, 1892615, 939173, 1347136, 180289, 785146, 5625361, 10208186, 169135, 234094, 442104, 1053996, 213573, 589194, 195199, 803503, 478528, 954249, 43547, 0, 45899, 0, 18384, 0)


In [11]:
import numpy as np

from bokeh.models import ColumnDataSource, DataRange1d, Plot, LinearAxis, Grid
from bokeh.models.glyphs import VBar
from bokeh.io import curdoc, show

N = 9
x = np.linspace(-20, 20, len(names[1:]))
y = x

print(viaj2000_list[1:])


source = ColumnDataSource(
    dict(
        x=names[1:],
        top=viaj2000_list[1:]))

xdr = DataRange1d()
ydr = DataRange1d()

from bokeh.models import Range1d

xdr = Range1d(start=0, end=len(names[1:]))
#ydr = Range1d(start=10, end=20)

plot = Plot(
    title=None, x_range=xdr, y_range=ydr, plot_width=300, plot_height=300,
    h_symmetry=False, v_symmetry=False, min_border=0, toolbar_location=None)

glyph = VBar(x="x", top="top", bottom=0, width=0.1, fill_color="#b3de69")
plot.add_glyph(source, glyph)

xaxis = LinearAxis()
plot.add_layout(xaxis, 'below')

yaxis = LinearAxis()
plot.add_layout(yaxis, 'left')

plot.add_layout(Grid(dimension=0, ticker=xaxis.ticker))
plot.add_layout(Grid(dimension=1, ticker=yaxis.ticker))

curdoc().add_root(plot)

show(plot)

[6352796, 210910, 953410, 209614, 2486487, 2486487, 78825, 198034, 1690482, 65586, 1032814, 939173, 180289, 5625361, 169135, 442104, 213573, 195199, 478528, 43547, 45899, 18384]


In [2]:
bar_opts = dict(width=0.3, alpha=0.8)
p = figure(title="Percentage of women graduating over time in two fields.", y_range=(0, 100), tools='')
p.vbar(bottom=0, top=100, x=names[1:], color='#4F4478', legend='Psychology', **bar_opts)
show(p)


In [1]:
import pandas as pd

from bokeh.io import show, curdoc
from bokeh.layouts import layout
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.plotting import figure
from bokeh.sampledata.degrees import data
from bokeh.themes import Theme

data = data.set_index('Year')

print(data)

categories = data.columns.tolist()
categories.reverse()

curdoc().theme = Theme(json={'attrs': {
    'Figure': {
        'toolbar_location': None,
        'outline_line_color': None,
        'min_border_right': 10,
    },
    'Axis': {
        'major_tick_in': None,
        'minor_tick_out': None,
        'minor_tick_in': None,
        'axis_line_color': '#CAC6B6',
        'major_tick_line_color': '#CAC6B6',
    },
    'Legend': {
        'background_fill_alpha': 0.8,
    }
}})


def _make_source_for_year(year):
    # Get data out of dataframe for a given year
    year_df = pd.DataFrame(data.loc[year]).reset_index()
    year_df = year_df.rename(columns={year: 'percent_female', 'index': 'category'})
    source = ColumnDataSource(year_df)
    return source


def all_for_year(year):
    source = _make_source_for_year(year)
    bar_opts = dict(y='category', height=0.5)
    p = figure(title=str(year), y_range=FactorRange(factors=categories), x_range=(0, 100), tools='')
    p.grid.grid_line_color = None
    p.hbar(left=0, right='percent_female', color='#AE9E59', legend='Female', source=source, **bar_opts)
    p.hbar(left='percent_female', right=100, color='#CAC6B6', legend='Male', source=source, **bar_opts)
    return p


def two_categories_over_time():
    bar_opts = dict(width=0.3, alpha=0.8)
    p = figure(title="Percentage of women graduating over time in two fields.", y_range=(0, 100), tools='')
    p.vbar(bottom=0, top=data['Psychology'], x=data.index - 0.2, color='#4F4478', legend='Psychology', **bar_opts)
    p.vbar(bottom=0, top=data['Engineering'], x=data.index + 0.2, color='#827F8B', legend='Engineering', **bar_opts)
    return p

print(data.index - 0.2)
l = layout([
    [all_for_year(1970), all_for_year(2010)],
    [two_categories_over_time()],
], sizing_mode='stretch_both')
show(l)

      Agriculture  Architecture  Art and Performance    Biology   Business  \
Year                                                                         
1970     4.229798     11.921005                 59.7  29.088363   9.064439   
1971     5.452797     12.003106                 59.9  29.394403   9.503187   
1972     7.420710     13.214594                 60.4  29.810221  10.558962   
1973     9.653602     14.791613                 60.2  31.147915  12.804602   
1974    14.074623     17.444688                 61.9  32.996183  16.204850   
1975    18.333162     19.134048                 60.9  34.449902  19.686249   
1976    22.252760     21.394491                 61.3  36.072871  23.430038   
1977    24.640177     23.740541                 62.0  38.331386  27.163427   
1978    27.146192     25.849240                 62.5  40.112496  30.527519   
1979    29.633365     27.770477                 63.2  42.065551  33.621634   
1980    30.759390     28.080381                 63.4  43.999257 