In [None]:
from io import StringIO

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('whitegrid')
import networkx as nx
import plotly.express as px
import plotly.graph_objs as go
from dash import Dash, html, dcc, Input, Output

from kg import KnowledgeGraph

# External CSS stylesheet URL(s) handed to the Dash app constructor.
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

In [None]:
# Raw inputs: the country reference table and the preprocessed turnover records.
countries = pd.read_csv('Data/countries of the world.csv')
turn = pd.read_csv('Data/turnover_preprocessed (1).csv')

# Use the cached company subset when it exists; otherwise build it from the
# full company file and cache it. Only a missing cache file triggers the
# rebuild -- any other read error is surfaced instead of being swallowed.
try:
    comp = pd.read_csv('Data/comp.csv')
except FileNotFoundError:
    comp = pd.read_csv('Data/Companies_pre.csv')
    comp = comp[comp.company_number.isin(turn.Company)]
    comp.to_csv('Data/comp.csv', index=False) # is 9 kB instead of 1.2 GB

# Normalise country names (trim + lower-case) before matching them to the companies.
countries['Country'] = countries.Country.str.strip().str.lower()
countries = countries[countries.Country.isin(comp.country)]

# The first column of the turnover file is the unnamed index column: give it a
# name and turn every value into a string id prefixed with 'e_'.
turn.columns = ['turn_idx', *turn.columns[1:]]
turn['turn_idx'] = 'e_' + turn['turn_idx'].astype(str)

comp = comp.drop(columns=['industry', 'linkedin url'])

kg = KnowledgeGraph(turn, comp, countries) # adds country/size range of company to turn through knowledge graph
turn = kg.turn

print('Companies:', comp.shape)
print('Countries:', countries.shape)
print('Turnover:', turn.shape)

In [None]:
def selection_summary(turn_filt):
    """Build a plain-text summary of the currently selected turnover rows.

    For each of the columns 'country', 'size range' and 'industry' the summary
    lists the most frequent value and the values with the lowest and highest
    mean 'turnover'.

    Parameters
    ----------
    turn_filt : pandas.DataFrame
        Must contain the columns 'country', 'size range', 'industry' and
        'turnover'.

    Returns
    -------
    str
        Three blocks of three lines separated by a blank line, or a short
        notice when the selection contains no rows.
    """
    # An empty selection has no "most common" value; report it instead of
    # failing with an IndexError on `.index[0]`.
    if len(turn_filt) == 0:
        return "No data matches the current selection."

    blocks = []
    for column in ('country', 'size range', 'industry'):
        counts = turn_filt[column].value_counts()
        # One group-by per column, reused for both the lowest and highest entry.
        by_turnover = turn_filt.groupby(column).agg({'turnover': 'mean'}).sort_values('turnover')

        # value_counts/groupby drop missing keys, so an all-NaN column yields nothing.
        most_common = counts.index[0] if len(counts) else 'n/a'
        lowest = by_turnover.index[0] if len(by_turnover) else 'n/a'
        highest = by_turnover.index[-1] if len(by_turnover) else 'n/a'

        blocks.append(
            "Most common {0}: {1} \nLowest turnover {0}: {2} \nHighest turnover {0}: {3}".format(
                column, most_common, lowest, highest
            )
        )
    return "\n\n".join(blocks)

In [None]:
# Dash application; the layout below is a single row with three panels:
# filters, bar/line plot with selection summary, and the knowledge graph.
app = Dash(__name__, external_stylesheets=external_stylesheets)

app.layout = html.Div([
    # filters -- every control starts with all values of its column selected
    html.Div([
        dcc.Markdown('## Filters'),
        # country
        dcc.Markdown('#### Included countries:'),
        dcc.Checklist(
            id='country_filter',
            options=turn.country.unique(),
            value=turn.country.unique(),
            style={'width':'50%'}
        ),
        # industry
        dcc.Markdown('#### Included industries:'),
        dcc.Dropdown(
            id='industry_filter',
            options=turn.industry.unique(),
            value=turn.industry.unique(),
            multi=True,
            clearable=False,
        ),
        # profession
        dcc.Markdown('#### Included professions:'),
        dcc.Dropdown(
            id='profession_filter',
            options=turn.profession.unique(),
            value=turn.profession.unique(),
            multi=True,
            clearable=False,
        ),
        # size range (options ordered by string length)
        dcc.Markdown('#### Included size ranges:'),
        dcc.Checklist(
            id='size_filter',
            options=sorted(turn['size range'].unique(), key=len),
            value=sorted(turn['size range'].unique(), key=len),
        ),
        
    ], className='six columns', style={'width':'20%'}),
    
    # client-side store for the filtered turnover data (JSON written by the
    # update_data callback and read by the two figure callbacks)
    dcc.Store('turn_filt'),
    # bar plot and x/y dropdown
    html.Div([
        dcc.Markdown('### Bar / line plot'),
        dcc.Graph(id='fig_bar_line'), # bar plot
        html.Div([
            
            html.Div([ # x variable
                dcc.Markdown('#### x-axis variable:'),
                dcc.Dropdown( 
                            id="x_var",
                            searchable=True, clearable=False, multi = False,
                            # every column except the listed id/target columns
                            options=turn.dtypes[~turn.columns.isin(['turn_idx', 'instance', 'turnover', 'layer', 'Company'])].index.tolist(),
                            value='country',
                ),
            ], className='six columns'),
            
            html.Div([ # y variable
                dcc.Markdown('#### y-axis variable:'),
                dcc.Dropdown( 
                        id="y_var",
                        searchable=True, clearable=False, multi = False,
                        # non-object (dtype != 'O') columns only, minus the listed columns
                        options=turn.dtypes[(turn.dtypes != 'O') & (~turn.columns.isin(['Unnamed: 0', 'Company', 'company_number', 'layer']))].index.tolist(),
                        value='turnover',
                ),
            ], className='six columns'),
            
        ], className='row'),
        dcc.Markdown('### Selection summary:'),
        # text is filled in by the update_data callback
        # NOTE(review): Dash style keys are normally camelCased ('whiteSpace') -- confirm
        # that the hyphenated key is applied as intended.
        dcc.Markdown(id='selection_summary',
                     style={'white-space':'pre'})
        
    ], className='six columns', style={'width':'25%'}),
    
    # knowledge graph
    html.Div([
        dcc.Markdown('### Knowledge graph'),
        dcc.Graph(id='fig_network'),
        html.Div([
            dcc.Markdown('##### Number of employees to sample for the graph visualization:'),
            # max and marks are overwritten by the create_network callback
            dcc.Slider(0, len(turn), step=5,
                       value=25, 
                       id='sample_slider', 
                       marks=None,
                       tooltip={"placement": "bottom", "always_visible": True}),
        ])
    ], className='six columns', style={'width':'40%'})

], className='row')

# update turnover dataset with filters
@app.callback(
    Output('turn_filt', 'data'),
    Output('selection_summary', 'children'),
    
    Input('country_filter', 'value'),
    Input('industry_filter', 'value'),
    Input('profession_filter', 'value'),
    Input('size_filter', 'value')
)
def update_data(filt_countries, filt_industries, filt_professions, filt_size):
    """Filter the turnover data by the four filter controls.

    Parameters
    ----------
    filt_countries, filt_industries, filt_professions, filt_size : list or None
        Selected values of the corresponding filter component. ``None`` is
        treated as "nothing selected".

    Returns
    -------
    tuple of (str, str)
        The filtered rows serialised with ``DataFrame.to_json`` (stored in
        the 'turn_filt' store) and the text for the selection summary.
    """
    # A component without a selection may report None; isin() needs a list-like.
    mask = (
        turn.country.isin(filt_countries or [])
        & turn.industry.isin(filt_industries or [])
        & turn.profession.isin(filt_professions or [])
        & turn['size range'].isin(filt_size or [])
    )
    turn_filt = turn[mask]

    # With nothing selected there is no "most common" value to summarise;
    # return a message rather than letting the summary raise.
    if turn_filt.empty:
        return turn_filt.to_json(), "No data matches the current filters."
    return turn_filt.to_json(), selection_summary(turn_filt)

# update the knowledge graph with filters    
@app.callback(
    Output('fig_network', 'figure'),
    Output('sample_slider', 'max'),
    Output('sample_slider', 'marks'),
    
    Input('turn_filt', 'data'),
    Input('sample_slider', 'value')
)
def create_network(data, slider_val):
    """Rebuild the knowledge-graph figure and the sample slider range.

    Parameters
    ----------
    data : str
        JSON of the filtered turnover rows, as written to the 'turn_filt' store.
    slider_val : int
        Requested number of rows to sample for the visualisation.

    Returns
    -------
    tuple
        ``(figure, slider_max, slider_marks)`` where ``slider_max`` is the
        number of filtered rows and ``slider_marks`` maps positions to labels.
    """
    # Wrap the string: passing literal JSON to read_json is deprecated in pandas.
    turn_filt = pd.read_json(StringIO(data))
    n_rows = len(turn_filt)

    # Nothing selected: show an empty figure instead of failing while building
    # the graph or the slider marks.
    if n_rows == 0:
        return go.Figure(), 0, {0: '0'}

    kg_filt = KnowledgeGraph(turn_filt, comp, countries)
    # The slider value can be stale and larger than the filtered data set.
    # NOTE(review): assumes sample_network takes a row count -- confirm.
    kg_filt.sample_network(min(slider_val, n_rows))
    fig_network = kg_filt.plot_network_plotly(show=False)

    # Roughly ten evenly spaced marks; the step must be at least 1, otherwise
    # range() raises for fewer than ten rows.
    step = max(1, n_rows // 10)
    slider_markers = {i: str(i) for i in range(0, n_rows, step)}
    # Swap the last regular mark for one at the exact maximum.
    if len(slider_markers) > 1:
        slider_markers.popitem()
    slider_markers[n_rows] = str(n_rows)

    return fig_network, n_rows, slider_markers

# update the bar chart with filters
@app.callback(
    Output('fig_bar_line', 'figure'),
    
    Input('turn_filt', 'data'),
    Input('x_var', 'value',),
    Input('y_var', 'value')
)
def create_bar_line(data, x_var, y_var):
    """Plot the mean of ``y_var`` per value of ``x_var`` for the filtered rows.

    Parameters
    ----------
    data : str
        JSON of the filtered turnover rows, as written to the 'turn_filt' store.
    x_var : str
        Column to group by (x-axis).
    y_var : str
        Column whose mean is shown (y-axis).

    Returns
    -------
    plotly figure
        A bar chart when ``x_var`` has object dtype or exactly two distinct
        values, otherwise a line chart.
    """
    # Wrap the string: passing literal JSON to read_json is deprecated in pandas.
    turn_filt = pd.read_json(StringIO(data))

    if x_var == y_var:
        # A column cannot be both the group key and the aggregated column;
        # the mean of a value within its own group is the value itself.
        df_grouped = turn_filt[[x_var]].dropna().drop_duplicates()
    else:
        df_grouped = turn_filt.groupby(x_var).agg({y_var:'mean'}).reset_index()

    if (turn_filt.dtypes[x_var] == 'O') or (len(turn_filt[x_var].unique()) == 2):
        # Categorical / binary x: bars ordered by height.
        fig = px.bar(df_grouped.sort_values(y_var), x=x_var, y=y_var)
    else:
        # Other x: line ordered along the x-axis.
        fig = px.line(df_grouped.sort_values(x_var), x=x_var, y=y_var)
    return fig

if __name__ == '__main__':
    # `run_server` is no longer available in Dash 3; use `run` when the
    # installed Dash provides it and fall back to `run_server` otherwise.
    run_app = getattr(app, 'run', None) or app.run_server
    run_app(debug=False)