In [1]:
# https://www.kaggle.com/c/kaggle-survey-2019/notebooks
# https://www.kaggle.com/shivamb/spending-for-ms-in-data-science-worth-it
# https://www.kaggle.com/fatihbilgin/data-science-trends-in-2019
# https://www.kaggle.com/ibtesama/a-guide-for-aspiring-data-scientists

In [16]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import plotly.express as px
import plotly.graph_objs as go
import pywaffle
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True) 

import warnings
warnings.filterwarnings('ignore')

In [43]:
def counts_pct(category,data):
    '''Return the value counts and percentage share for one column.

    Args:
        category: Name of the column in ``data`` to summarise.
        data: DataFrame that contains the column.

    Returns:
        A DataFrame indexed by the distinct values of the column, with two
        columns: ``<category>`` (the count of each value) and
        ``<category> %`` (that count as a percentage of the column length).
    '''
    column = data[category]
    counts = column.value_counts()
    # The denominator is the full column length, so rows that value_counts()
    # skips (missing values) still count towards the total.
    share = counts / column.shape[0] * 100
    label = str(category)
    summary = pd.concat([counts, share], axis=1)
    summary.columns = [label, label + ' %']
    return summary

def gender(x):
    '''Collapse the non-specific gender answers into a single 'Other' label.

    Returns 'Other' when ``x`` is 'Prefer not to say' or
    'Prefer to self-describe'; any other value is returned unchanged.
    '''
    collapsed = ('Prefer not to say', 'Prefer to self-describe')
    return 'Other' if x in collapsed else x

In [4]:
# Load the responses from a CSV file; the relative path is resolved against
# the kernel's current working directory.
data = pd.read_csv(r"responses.csv")

In [44]:
# Recode the gender column element-wise; gender() already takes a single
# value, so it is passed to apply() directly.
data['Q2'] = data['Q2'].apply(gender)

In [45]:
# Preview the first rows of the frame. In the displayed output, row 0 holds
# the question text rather than a respondent's answers.
data.head()

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q10,Q11,Q14,Q15,Q19,Q22,Q23
0,What is your age (# years)?,What is your gender? - Selected Choice,In which country do you currently reside?,What is the highest level of formal education ...,Select the title most similar to your current ...,What is the size of the company where you are ...,Approximately how many individuals are respons...,Does your current employer incorporate machine...,What is your current yearly compensation (appr...,Approximately how much money have you spent on...,What is the primary tool that you use at work ...,How long have you been writing code to analyze...,What programming language would you recommend ...,Have you ever used a TPU (tensor processing un...,For how many years have you used machine learn...
1,22-24,Male,France,Master’s degree,Software Engineer,"1000-9,999 employees",0,I do not know,"30,000-39,999",$0 (USD),"Basic statistical software (Microsoft Excel, G...",1-2 years,Python,Never,1-2 years
2,40-44,Male,India,Professional degree,Software Engineer,"> 10,000 employees",20+,"We have well established ML methods (i.e., mod...","5,000-7,499","> $100,000 ($USD)","Cloud-based data software & APIs (AWS, GCP, Az...",I have never written code,,,
3,55-59,Female,Germany,Professional degree,,,,,,,,,,,
4,40-44,Male,Australia,Master’s degree,Other,"> 10,000 employees",20+,I do not know,"250,000-299,999","$10,000-$99,999","Local development environments (RStudio, Jupyt...",1-2 years,Python,Once,2-3 years


In [6]:
# Count the missing values in each column.
data.isnull().sum()

Time from Start to Finish (seconds)        0
Q1                                         0
Q2                                         0
Q2_OTHER_TEXT                              0
Q3                                         0
                                       ...  
Q34_Part_9                             19238
Q34_Part_10                            19191
Q34_Part_11                            18472
Q34_Part_12                            19430
Q34_OTHER_TEXT                             0
Length: 246, dtype: int64

In [7]:
# Candidate column names 'Q1' through 'Q32'.
questions = list(map('Q{}'.format, range(1, 33)))

In [8]:
# Keep, in order, only the candidate names that exist as columns of the frame.
q = [name for name in questions if name in data.columns]

In [9]:
# Show which candidate question names were found among the frame's columns.
q

['Q1',
 'Q2',
 'Q3',
 'Q4',
 'Q5',
 'Q6',
 'Q7',
 'Q8',
 'Q10',
 'Q11',
 'Q14',
 'Q15',
 'Q19',
 'Q22',
 'Q23']

In [10]:
# Restrict the frame to the selected question columns (this rebinds `data`).
data = data[q]

In [11]:
# Preview the first rows of the frame after the column selection.
data.head()

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q10,Q11,Q14,Q15,Q19,Q22,Q23
0,What is your age (# years)?,What is your gender? - Selected Choice,In which country do you currently reside?,What is the highest level of formal education ...,Select the title most similar to your current ...,What is the size of the company where you are ...,Approximately how many individuals are respons...,Does your current employer incorporate machine...,What is your current yearly compensation (appr...,Approximately how much money have you spent on...,What is the primary tool that you use at work ...,How long have you been writing code to analyze...,What programming language would you recommend ...,Have you ever used a TPU (tensor processing un...,For how many years have you used machine learn...
1,22-24,Male,France,Master’s degree,Software Engineer,"1000-9,999 employees",0,I do not know,"30,000-39,999",$0 (USD),"Basic statistical software (Microsoft Excel, G...",1-2 years,Python,Never,1-2 years
2,40-44,Male,India,Professional degree,Software Engineer,"> 10,000 employees",20+,"We have well established ML methods (i.e., mod...","5,000-7,499","> $100,000 ($USD)","Cloud-based data software & APIs (AWS, GCP, Az...",I have never written code,,,
3,55-59,Female,Germany,Professional degree,,,,,,,,,,,
4,40-44,Male,Australia,Master’s degree,Other,"> 10,000 employees",20+,I do not know,"250,000-299,999","$10,000-$99,999","Local development environments (RStudio, Jupyt...",1-2 years,Python,Once,2-3 years


## Przeanalizujmy tylko pytania, które mają jedną część

'What is your age (# years)?'  
'What is your gender? - Selected Choice'   
'In which country do you currently reside?'  
'What is the highest level of formal education that you have attained or plan to attain within the next 2 years?'  
'Select the title most similar to your current role (or most recent title if retired): - Selected Choice'  
'What is the size of the company where you are employed?'  
'Approximately how many individuals are responsible for data science workloads at your place of business?'  
'Does your current employer incorporate machine learning methods into their business?'  
'What is your current yearly compensation (approximate $USD)?'  
'Approximately how much money have you spent on machine learning and/or cloud computing products at your work in the past 5 years?'   
'What is the primary tool that you use at work or school to analyze data? (Include text response) - Selected Choice'  
'How long have you been writing code to analyze data (at work or at school)?'  
'What programming language would you recommend an aspiring data scientist to learn first? - Selected Choice'  
'Have you ever used a TPU (tensor processing unit)?'  
'For how many years have you used machine learning methods?' 

In [12]:
# Row 0 of the frame carries the question text; print it column by column.
first_row = data.head(1)
for column in first_row:
    print(first_row[column].values)

['What is your age (# years)?']
['What is your gender? - Selected Choice']
['In which country do you currently reside?']
['What is the highest level of formal education that you have attained or plan to attain within the next 2 years?']
['Select the title most similar to your current role (or most recent title if retired): - Selected Choice']
['What is the size of the company where you are employed?']
['Approximately how many individuals are responsible for data science workloads at your place of business?']
['Does your current employer incorporate machine learning methods into their business?']
['What is your current yearly compensation (approximate $USD)?']
['Approximately how much money have you spent on machine learning and/or cloud computing products at your work in the past 5 years?']
['What is the primary tool that you use at work or school to analyze data? (Include text response) - Selected Choice']
['How long have you been writing code to analyze data (at work or at school)?']

In [13]:
# Percentage of missing values in each column, relative to the number of rows.
data.isna().sum().div(len(data)).mul(100)

Q1      0.000000
Q2      0.000000
Q3      0.000000
Q4      1.998174
Q5      3.093620
Q6     28.983670
Q7     30.905771
Q8     32.914089
Q10    36.616290
Q11    37.868952
Q14    20.422964
Q15    20.742469
Q19    27.081854
Q22    27.964297
Q23    28.070798
dtype: float64

### Wiek

In [37]:
# Count respondents per age group. Row 0 of the frame holds the question text
# instead of an answer, so it is skipped by position. The previous approach
# trimmed the last value_counts() entry, which only removes that row when it
# happens to be the least frequent value.
data['Q1'].iloc[1:].value_counts()

25-29    4458
22-24    3610
30-34    3120
18-21    2502
35-39    2087
40-44    1439
45-49     949
50-54     692
55-59     422
60-69     338
70+       100
Name: Q1, dtype: int64

In [46]:
# Count the Q1 (age group) answers within each Q2 (gender) group.
data.groupby('Q2')['Q1'].value_counts()

Q2                                      Q1                         
Female                                  25-29                           810
                                        22-24                           678
                                        30-34                           513
                                        18-21                           419
                                        35-39                           304
                                        40-44                           209
                                        45-49                           118
                                        50-54                            80
                                        55-59                            51
                                        60-69                            26
                                        70+                               4
Male                                    25-29                          3562
                    

In [59]:
def _age_bar(frame, gender_label, color, trace_name):
    """Build a vertical bar trace of age-group counts for one gender.

    Args:
        frame: DataFrame with a gender column 'Q2' and an age-group column 'Q1'.
        gender_label: Value of 'Q2' that selects the respondents to count.
        color: Marker colour handed to plotly.
        trace_name: Legend entry for the trace.

    Returns:
        A ``go.Bar`` whose x values are the age groups and whose y values are
        the matching respondent counts.
    """
    # Filtering on the gender label already leaves out the question-text row
    # (its 'Q2' cell holds the question, not an answer). Every value_counts()
    # entry is therefore a real age group, and trimming the last one would
    # silently drop the least frequent group from the chart.
    age_counts = frame.loc[frame['Q2'] == gender_label, 'Q1'].value_counts()
    return go.Bar(
        x=age_counts.index,
        y=age_counts.values,
        marker=dict(color=color),
        name=trace_name,
        orientation='v',
    )


male_age = _age_bar(data, 'Male', 'rgb(49,130,189)', 'Age of Male Participants')
female_age = _age_bar(data, 'Female', 'rgb(204,204,204)', 'Age of Female Participants')
other_age = _age_bar(data, 'Other', '#B0122C', 'Age of Other Participants')

# Fixed-size figure with a horizontal legend centred above the plot area.
layout = dict(
    autosize=False,
    width=1000,
    height=500,
    legend=dict(
        font=dict(size=10),
        yanchor='top',
        xanchor='center',
        orientation='h',
        x=0.5,
        y=1.12,
    ),
)

fig = go.Figure(data=[male_age, female_age, other_age], layout=layout)
fig.update_layout(
    title_text='Age of Participants',
    title_x=0.5,
    xaxis_title='Age group',
    yaxis_title='Number of participants',
)
fig.show()