In [1]:
import pandas as pd
import numpy as np
from math import pi
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.models import NumeralTickFormatter
output_notebook()

In [39]:
# Load the three survey CSV exports; paths are relative to the notebook's working directory.
# low_memory=False makes pandas parse each file in a single pass, which avoids
# mixed-dtype inference within a column.
# NOTE(review): the multiple-choice file appears to carry the question text as a data row
# (a later cell replaces the Q9 question string with NaN) — confirm before aggregating.
df = pd.read_csv(
    './data/multipleChoiceResponses.csv',
    low_memory=False,
)
df_text = pd.read_csv(
    './data/freeFormResponses.csv',
    low_memory=False,
)
df_schema = pd.read_csv(
    './data/SurveySchema.csv',
)

### Job titles of people who took the survey

In [40]:
# Tally respondents per job title (Q6), most frequent first, and drop the final
# (least frequent) entry.
# NOTE(review): the dropped entry is presumably the question-text row — confirm.
title_tally = df['Q6'].value_counts().iloc[:-1]
job_titles = title_tally.index.tolist()
counts = title_tally.values

# Categorical bar chart: one bar per job title, no toolbar.
p = figure(
    title="Job Title Counts",
    x_range=job_titles,
    plot_width=900,
    plot_height=350,
    tools="",
    toolbar_location=None,
)
p.vbar(x=job_titles, top=counts, width=0.9)

# Cosmetics: tilt the category labels, anchor the y axis at zero, hide vertical grid lines.
p.xaxis.major_label_orientation = pi/6
p.y_range.start = 0
p.xgrid.grid_line_color = None

show(p)

### Industries of people who took the survey

In [4]:
# Tally respondents per industry (Q7), most frequent first, and drop the final
# (least frequent) entry.
# NOTE(review): the dropped entry is presumably the question-text row — confirm.
industry_tally = df['Q7'].value_counts().iloc[:-1]
industry_titles = industry_tally.index.tolist()
counts = industry_tally.values

# Categorical bar chart: one bar per industry, no toolbar.
p = figure(
    title="Industry Counts",
    x_range=industry_titles,
    plot_width=900,
    plot_height=350,
    tools="",
    toolbar_location=None,
)
p.vbar(x=industry_titles, top=counts, width=0.9)

# Cosmetics: tilt the category labels, anchor the y axis at zero, hide vertical
# grid lines, and reserve extra left margin for the rotated labels.
p.min_border_left = 100
p.xaxis.major_label_orientation = pi/6
p.y_range.start = 0
p.xgrid.grid_line_color = None

show(p)

### Do data scientists say that their employers use ML?

In [18]:
# Restrict to respondents whose job title (Q6) is exactly 'Data Scientist'.
df_data_scientist = df.loc[df['Q6'] == 'Data Scientist']

# Tally their Q10 answers once; each answer is truncated to its first 50 characters
# for use as an axis label.
q10_tally = df_data_scientist['Q10'].value_counts()
ml_use = [answer[:50] for answer in q10_tally.index]
counts = q10_tally.tolist()

# Categorical bar chart: one bar per Q10 answer, no toolbar.
p = figure(
    title="Data scientist employer's use of ML",
    x_range=ml_use,
    plot_width=900,
    plot_height=550,
    tools="",
    toolbar_location=None,
)
p.vbar(x=ml_use, top=counts, width=0.9)

# Cosmetics: tilt the category labels, anchor the y axis at zero, hide vertical
# grid lines, and reserve extra left margin for the rotated labels.
p.min_border_left = 150
p.xaxis.major_label_orientation = pi/6
p.y_range.start = 0
p.xgrid.grid_line_color = None

show(p)

In [24]:
# Restrict to respondents who answered 'Definitely yes' to Q26.
# NOTE(review): this rebinds `df_data_scientist` (previously the Q6 == 'Data Scientist'
# subset) to a different population and reuses the previous chart's title; consider a
# distinct name and title once the meaning of Q26 is confirmed against the schema.
df_data_scientist = df.loc[df['Q26'] == 'Definitely yes']

# Tally their Q10 answers once; each answer is truncated to its first 50 characters
# for use as an axis label.
q10_tally = df_data_scientist['Q10'].value_counts()
ml_use = [answer[:50] for answer in q10_tally.index]
counts = q10_tally.tolist()

# Categorical bar chart: one bar per Q10 answer, no toolbar.
p = figure(
    title="Data scientist employer's use of ML",
    x_range=ml_use,
    plot_width=900,
    plot_height=550,
    tools="",
    toolbar_location=None,
)
p.vbar(x=ml_use, top=counts, width=0.9)

# Cosmetics: tilt the category labels, anchor the y axis at zero, hide vertical
# grid lines, and reserve extra left margin for the rotated labels.
p.min_border_left = 150
p.xaxis.major_label_orientation = pi/6
p.y_range.start = 0
p.xgrid.grid_line_color = None

show(p)

### Average compensation comparisons between job titles

In [6]:
# Numeric stand-in (USD) for each Q9 compensation bracket: the bracket midpoint,
# except for the open-ended top bracket, which uses its lower bound.
COMPENSATION_MIDPOINTS = {
    '0-10,000': 5000,
    '10-20,000': 15000,
    '20-30,000': 25000,
    '30-40,000': 35000,
    '40-50,000': 45000,
    '50-60,000': 55000,
    '60-70,000': 65000,
    '70-80,000': 75000,
    '80-90,000': 85000,
    '90-100,000': 95000,
    '100-125,000': 112500,
    '125-150,000': 137500,
    '150-200,000': 175000,
    '200-250,000': 225000,
    '250-300,000': 275000,
    '300-400,000': 350000,
    '400-500,000': 450000,
    '500,000+': 500000,
}

# Job titles (Q6) left out of the compensation comparison.
EXCLUDED_TITLES = {'Not employed', 'Student', 'Other'}

# Work on a copy so the raw responses in `df` stay untouched.
temp = df.copy(deep=True)

# Convert the bracket labels to numbers in one assignment. Any value without a
# bracket entry (the "do not wish to disclose" answer, the question-text row,
# missing answers) becomes NaN and is therefore ignored by .mean().
# Assigning the mapped column back avoids `temp['Q9'].replace(..., inplace=True)`,
# which is chained in-place assignment: it warns on recent pandas and does not
# modify `temp` at all under Copy-on-Write.
temp['Q9'] = temp['Q9'].map(COMPENSATION_MIDPOINTS)

# Build (job title, mean compensation) pairs for every title worth comparing.
averages_tuples = []
for job in job_titles:
    if job in EXCLUDED_TITLES:
        continue
    averages_tuples.append((job, temp.loc[temp['Q6'] == job, 'Q9'].mean()))

# Ascending by average. A title with no usable compensation data has a NaN mean;
# the leading flag sorts those last so the ordering stays deterministic.
averages_tuples.sort(key=lambda pair: (pd.isna(pair[1]), pair[1]))

In [7]:
# Split the sorted (title, average) pairs into parallel lists for plotting.
# Note that this rebinds `job_titles` to the filtered, compensation-sorted titles.
job_titles = [title for title, _ in averages_tuples]
averages = [average for _, average in averages_tuples]

# Categorical bar chart: one bar per job title, no toolbar.
p = figure(
    title="Average compensation per job title",
    x_range=job_titles,
    plot_width=900,
    plot_height=550,
    tools="",
    toolbar_location=None,
)
p.vbar(x=job_titles, top=averages, width=0.9)

# Cosmetics: dollar-formatted y ticks, tilted category labels, y axis anchored at
# zero, no vertical grid lines.
p.yaxis.formatter = NumeralTickFormatter(format='$0,0.00')
p.xaxis.major_label_orientation = pi/6
p.y_range.start = 0
p.xgrid.grid_line_color = None

show(p)

In [12]:
# Number of respondents per job title (Q6) in the working copy, most frequent first.
temp.loc[:, 'Q6'].value_counts()

Student                                                                                                    5253
Data Scientist                                                                                             4137
Software Engineer                                                                                          3130
Data Analyst                                                                                               1922
Other                                                                                                      1322
Research Scientist                                                                                         1189
Not employed                                                                                                842
Consultant                                                                                                  785
Business Analyst                                                                                        

In [31]:
# Country (Q3) breakdown of respondents whose job title (Q6) is 'Data Scientist'.
# A separate Q6 != 'Student' test is unnecessary: equality with 'Data Scientist'
# already excludes students. .loc selects rows and column in one step instead of
# chained indexing.
df.loc[df['Q6'] == 'Data Scientist', 'Q3'].value_counts()

United States of America                                946
India                                                   595
France                                                  181
Russia                                                  178
Germany                                                 165
United Kingdom of Great Britain and Northern Ireland    163
Other                                                   147
Brazil                                                  125
Canada                                                  107
Spain                                                   106
China                                                   106
Netherlands                                              75
Italy                                                    74
Poland                                                   73
Israel                                                   63
Turkey                                                   61
Japan                                   