In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pprint import pprint

In [2]:
# Lift pandas' display caps for exploration: None for max_rows / max_columns /
# max_colwidth turns truncation off, and width=None leaves the output width
# to pandas instead of a fixed character count.
pd.set_option(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', None,
)

### Datasets load

In [3]:
# Common path prefix of the per-year survey files; the year is appended to it.
dataset_base_file_name = 'stack_overflow_datasets/survey_results_'
# Survey years 2011 through 2020 (inclusive).
dataset_years = [*range(2011, 2021)]

In [4]:
# for the original dataset: encoding = "ISO-8859-1"
# Read one CSV per survey year, stamp every row with that year, and keep the
# resulting frames in a dict keyed by year.
dfs = {}
for year in dataset_years:
    name = dataset_base_file_name + str(year) + '.csv'
    df = pd.read_csv(name).assign(year=year)
    dfs[year] = df

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### Renaming column names to snake_case

In [5]:
import re
def camel_to_snake(name):
    """Convert a survey column label to lower-case snake_case.

    Spaces and colons become underscores, apostrophes are dropped, CamelCase
    word boundaries are split with underscores, runs of underscores are
    collapsed to one, and the result is lower-cased.

    Args:
        name: the original column label (a string).

    Returns:
        The snake_case version of ``name``.
    """
    # CamelCase splitting adapted from https://stackoverflow.com/a/1176023
    separators = str.maketrans({' ': '_', ':': '_', "'": None})
    cleaned = name.translate(separators)
    # Put an underscore before every capitalised word ("HomeRemote" -> "Home_Remote").
    word_split = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', cleaned)
    # The two steps above can stack underscores ("a: B" -> "a___B"); squeeze them.
    squeezed = re.sub('_+', r'_', word_split)
    # Catch remaining lower/digit -> upper transitions ("CodedJob" -> "Coded_Job").
    fully_split = re.sub('([a-z0-9])([A-Z])', r'\1_\2', squeezed)
    return fully_split.lower()

# Rewrite the column labels of every loaded survey frame to snake_case.
for df in dfs.values():
    df.columns = [camel_to_snake(name) for name in df.columns]

### Dataset columns comparison helpers

In [6]:
def print_year_dataset_cols_difference(base_year, years_against=[]):
    """Print which columns of one survey year are missing from later years.

    For every candidate year strictly greater than ``base_year`` this prints a
    header line, the set of ``base_year`` columns that the candidate frame
    does not have, and a blank separator.

    Args:
        base_year: key into the module-level ``dfs`` dict whose columns are
            used as the reference set.
        years_against: years to compare with; when empty, every year in
            ``dfs`` is considered. Years not greater than ``base_year`` are
            skipped.
    """
    if years_against:
        candidates = {year: dfs[year] for year in years_against}
    else:
        candidates = dfs

    reference_columns = set(dfs[base_year].columns.tolist())

    for other_year, other_df in candidates.items():
        # Only look forward in time from the base year.
        if other_year <= base_year:
            continue
        missing = reference_columns - set(other_df.columns.tolist())
        print(f'{base_year} columns not present in {other_year}')
        pprint(missing)
        print('\n')

In [7]:
def print_similar_col_name(columns: list, years=[]):
    """Print, per survey year, the column names containing any given fragment.

    Args:
        columns: substrings to look for inside each column name.
        years: years to inspect; when empty, every year in the module-level
            ``dfs`` dict is inspected.
    """
    selected = {year: dfs[year] for year in years} if years else dfs

    for year, frame in selected.items():
        print(year)
        # A column matches as soon as one of the fragments occurs in its name.
        matches = {
            label
            for label in frame.columns.tolist()
            if any(fragment in label for fragment in columns)
        }
        pprint(matches)
        print('\n')

### Questions persisted over years

In [8]:
# List the 2016 columns that are absent from each later survey year.
print_year_dataset_cols_difference(2016)

2016 columns not present in 2017
{'age_midpoint',
 'age_range',
 'agree_adblocker',
 'agree_alcohol',
 'agree_diversity',
 'agree_legacy',
 'agree_loveboss',
 'agree_mars',
 'agree_nightcode',
 'agree_notice',
 'agree_problemsolving',
 'agree_tech',
 'aliens',
 'big_mac_index',
 'collector',
 'commit_frequency',
 'company_size_range',
 'desktop_os',
 'dev_environment',
 'developer_challenges',
 'dogs_vs_cats',
 'education',
 'experience_midpoint',
 'experience_range',
 'hobby',
 'how_to_improve_interview_process',
 'important_buildexisting',
 'important_buildnew',
 'important_companymission',
 'important_control',
 'important_newtech',
 'important_ownoffice',
 'important_promotion',
 'important_sameend',
 'important_variety',
 'important_wfh',
 'industry',
 'interview_likelihood',
 'job_discovery',
 'job_search_annoyance',
 'new_job_value',
 'occupation',
 'occupation_group',
 'open_to_new_job',
 'programming_ability',
 'remote',
 'rep_range',
 'salary_midpoint',
 'salary_range',
 'sel

### Column names similarity
Finding similar column names across years based on some basic descriptors, such as:
* `data`, `age`, `year`, `gender`, `experience`, `time`, `want`, `lang`

In [9]:
# Search every loaded year (empty `years` list) for column names containing
# any of these remote-work related fragments.
print_similar_col_name(['remote', 'home', 'rem'], [])

2011
set()


2012
set()


2013
set()


2014
{'remote_location', 'enjoy_working_remotely', 'remote_status'}


2015
{'how_important_is_remote_when_evaluating_new_job_opportunity?',
 'remote_status'}


2016
{'remote'}


2017
{'assess_job_remote', 'home_remote', 'collaborate_remote'}


2018
set()


2019
{'work_remote'}


2020
set()




### Dataset head exploration

In [10]:
# Show the first 2017 response to see the renamed columns next to sample answers.
dfs[2017].head(1)

Unnamed: 0,respondent,professional,program_hobby,country,university,employment_status,formal_education,major_undergrad,home_remote,company_size,company_type,years_program,years_coded_job,years_coded_job_past,developer_type,web_developer_type,mobile_developer_type,non_developer_type,career_satisfaction,job_satisfaction,ex_coder_return,ex_coder_not_for_me,ex_coder_balance,ex_coder10_years,ex_coder_belonged,ex_coder_skills,ex_coder_will_not_code,ex_coder_active,pronounce_gif,problem_solving,building_things,learning_new_tech,boring_details,job_security,diversity_important,annoying_ui,friends_developers,right_wrong_way,understand_computers,serious_work,invest_time_tools,work_pay_care,kinship_developers,challenge_myself,compete_peers,change_world,job_seeking_status,hours_per_week,last_new_job,assess_job_industry,assess_job_role,assess_job_exp,assess_job_dept,assess_job_tech,assess_job_projects,assess_job_compensation,assess_job_office,assess_job_commute,assess_job_remote,assess_job_leaders,assess_job_prof_devel,assess_job_diversity,assess_job_product,assess_job_finances,important_benefits,clicky_keys,job_profile,resume_prompted,learned_hiring,important_hiring_algorithms,important_hiring_tech_exp,important_hiring_communication,important_hiring_open_source,important_hiring_pm_exp,important_hiring_companies,important_hiring_titles,important_hiring_education,important_hiring_rep,important_hiring_getting_things_done,currency,overpaid,tabs_spaces,education_important,education_types,self_taught_types,time_after_bootcamp,cousin_education,work_start,have_worked_language,want_work_language,have_worked_framework,want_work_framework,have_worked_database,want_work_database,have_worked_platform,want_work_platform,ide,auditory_environment,methodology,version_control,check_in_code,ship_it,other_peoples_code,project_management,enjoy_debugging,in_the_zone,difficult_communication,collaborate_remote,metric_assess,equipment_satisfied_monitors,equipment_satisfied_cpu,equipment_satisfied_ram,eq
uipment_satisfied_storage,equipment_satisfied_rw,influence_internet,influence_workstation,influence_hardware,influence_servers,influence_tech_stack,influence_dept_tech,influence_viz_tools,influence_database,influence_cloud,influence_consultants,influence_recruitment,influence_communication,stack_overflow_describes,stack_overflow_satisfaction,stack_overflow_devices,stack_overflow_found_answer,stack_overflow_copied_code,stack_overflow_job_listing,stack_overflow_company_page,stack_overflow_job_search,stack_overflow_new_question,stack_overflow_answer,stack_overflow_meta_chat,stack_overflow_ads_relevant,stack_overflow_ads_distracting,stack_overflow_moderation,stack_overflow_community,stack_overflow_helpful,stack_overflow_better,stack_overflow_what_do,stack_overflow_make_money,gender,highest_education_parents,race,survey_long,questions_interesting,questions_confusing,interested_answers,salary,expected_salary,year
0,1,Student,"Yes, both",United States,No,"Not employed, and not looking for work",Secondary school,,,,,2 to 3 years,,,,,,,,,,,,,,,,,"With a soft ""g,"" like ""jiff""",Strongly agree,Strongly agree,Agree,Disagree,Strongly agree,Agree,Agree,Disagree,Somewhat agree,Disagree,Strongly agree,Strongly agree,Strongly disagree,Agree,Agree,Disagree,Agree,"I'm not actively looking, but I am open to new opportunities",0.0,Not applicable/ never,Very important,Very important,Important,Very important,Very important,Very important,Important,Very important,Very important,Very important,Very important,Very important,Somewhat important,Not very important,Somewhat important,Stock options; Vacation/days off; Remote options,Yes,Other,,,Important,Important,Important,Somewhat important,Important,Not very important,Not very important,Not at all important,Somewhat important,Very important,,,Tabs,,Online course; Open source contributions,,,,6:00 AM,Swift,Swift,,,,,iOS,iOS,Atom; Xcode,Turn on some music,,,,,,,,,,,,Somewhat satisfied,Not very satisfied,Not at all satisfied,Very satisfied,Satisfied,Not very satisfied,,,,,,,,,,,,I have created a CV or Developer Story on Stack Overflow,9.0,Desktop; iOS app,At least once each week,Haven't done at all,Once or twice,Haven't done at all,Haven't done at all,Several times,Several times,Once or twice,Somewhat agree,Strongly disagree,Strongly disagree,Strongly agree,Agree,Strongly agree,Strongly agree,Strongly disagree,Male,High school,White or of European descent,Strongly disagree,Strongly agree,Disagree,Strongly agree,,,2017


### Merging and transforming common columns

* In 2018, occupation is expressed in both the student and the employment questions, so let's merge the two.

In [11]:
# Relabel the 2018 `student` answers so they remain self-explanatory once they
# are joined with `employment`. The relabelled Series is assigned back to the
# column explicitly: calling `.replace(..., inplace=True)` on
# `dfs[2018].student` is chained assignment, which pandas warns about and,
# with Copy-on-Write enabled, applies to a temporary object instead of the frame.
dfs[2018]['student'] = dfs[2018]['student'].replace({
    'Yes, part-time': 'student part-time',
    'Yes, full-time': 'student full-time',
    'No': 'Not student',
})
# Build the combined occupation as '<student answer>;<employment answer>'.
# NOTE(review): a response missing either answer ends up with a missing
# occupation, because NaN propagates through string concatenation.
dfs[2018]['occupation'] = dfs[2018]['student'] + ';' + dfs[2018]['employment']

* 2017 remote status related questions `collaborate_remote, assess_job_remote, home_remote`

In [12]:
# Show the distinct answers recorded for each 2017 remote-work question.
for remote_question in ('collaborate_remote', 'assess_job_remote', 'home_remote'):
    display(dfs[2017][remote_question].unique())

array([nan, 'Strongly disagree', 'Somewhat agree', 'Agree', 'Disagree',
       'Strongly agree'], dtype=object)

array(['Very important', nan, 'Somewhat important',
       'Not at all important', 'Important', 'Not very important'],
      dtype=object)

array([nan, 'More than half, but not all, the time',
       'Less than half the time, but at least one day each week', 'Never',
       "All or almost all the time (I'm full-time remote)",
       "It's complicated", 'A few days each month', 'About half the time'],
      dtype=object)

### Datasets columns renaming
After exploring common names here's the final mapping.

In [13]:
# Mapping from the year-specific column names to one shared name per question.
# It is built once (it does not depend on the frame being renamed) and then
# applied to every survey year; keys that a given year does not have are
# simply ignored by `DataFrame.rename`.
columns_renamed = {
    'salary': 'annual_compensation',
    # NOTE(review): this key looks misspelled; presumably it matches a
    # misspelled column in one of the source files - confirm before changing.
    'annueal_compensation': 'annual_compensation',

    'org_size': 'company_size',

    'years_code': 'years_experience',
    'years_coding': 'years_experience',
    'years_program': 'years_experience',

    'main_branch': 'occupation',
    'professional': 'occupation',

    'training_and_education': 'education',
    'education_types': 'education',
    'ed_level': 'education',

    'remote': 'remote_status',
    'work_remote': 'remote_status',
    'home_remote': 'remote_status',

    'job_sat': 'job_satisfaction',

    'op_sys': 'os',
    'operating_system': 'os',
    'desktop_os': 'os',

    'age_range': 'age',
    'company_size_range': 'company_size',
    'experience_range': 'years_experience',
    'salary_range': 'annual_compensation',
    'team_size_range': 'team_size',

    'language_worked_with': 'programming_languages',
    'have_worked_language': 'programming_languages',
    'tech_do': 'programming_languages',
    # NOTE(review): this key looks misspelled; presumably it matches a
    # misspelled column in one of the source files - confirm before changing.
    'programming_lalnguages': 'programming_languages',

    'future_language_or_tech': 'want_work_language',
    'tech_want': 'want_work_language',
    'language_desire_next_year': 'want_work_language',
    'new_tech_interests': 'want_work_language',

    'have_worked_database': 'database_worked_with',
    'want_work_database': 'database_desire_work',
    'database_desire_next_year': 'database_desire_work',

    # Target must be spelled 'undergrad_major' to line up with the feature
    # list used for the merge (it was previously written 'undergrad_mayor').
    'major_undergrad': 'undergrad_major',

    'so_visit1st': 'stack_overflow_visit1st',
    'so_visit_freq': 'stack_overflow_visit_freq',
    'so_visit_to': 'stack_overflow_visit_to',
    'so_find_answer': 'stack_overflow_find_answer',
    'so_time_saved': 'stack_overflow_time_saved',
    'so_how_much_time': 'stack_overflow_how_much_time',
    'so_account': 'stack_overflow_account',
    'so_part_freq': 'stack_overflow_part_freq',
    'so_jobs': 'stack_overflow_jobs',
    'so_comm': 'stack_overflow_comm',
    'so_new_content': 'stack_overflow_new_content',
    'so_region': 'stack_overflow_region',
    'newso_sites': 'new_stack_overflow_sites'
}

# Rename in place so the frames stored in `dfs` are the ones that change.
for df in dfs.values():
    df.rename(columns=columns_renamed, inplace=True)

### Questions over time
After exploring some common questions persisted over time, here's the final list of questions that will be used for analysis:

`P: present`  
`NP: not present`  
`AP: always present`

* `programming_languages`: AP
* `country`: AP
* `years_experience`: AP
* `occupation`: AP


* `age`: AP except 2017
* `os`: AP except 2017
* `job_satisfaction`: AP except 2014
* `company_size`: AP except 2014, 2015


* `want_work_language`: P since 2013
* `gender`: P since 2014
* `education`: P since 2015
* `database_desire_work`: P since 2017
* `database_worked_with`: P since 2017
* `undergrad_major`: P since 2018


* `remote_status`: P 2014-2017, 2019


* `industry`: NP since 2017

### Merging datasets common questions

In [14]:
# Columns kept when merging the yearly datasets, grouped by how consistently
# the question was asked (per the availability notes in this notebook).
features = [
    'year',
    # asked in every survey year
    'country',
    'programming_languages',
    'years_experience',
    'occupation',
    # asked in every year except one or two
    'age',
    'os',
    'job_satisfaction',
    'company_size',
    # introduced in later surveys
    'want_work_language',
    'gender',
    'education',
    'database_desire_work',
    'database_worked_with',
    'undergrad_major',
    # asked only in some years
    'remote_status',
    # dropped from later surveys
    'industry',
]

### Missing values per column

In [15]:
# Percentage of missing values in each column of the 2011 survey.
dfs[2011].isna().mean().round(4) * 100

country                               0.00
usa_state                            63.42
age                                   2.99
years_experience                      2.99
industry                              2.99
company_size                          5.94
occupation                            5.94
recommendation_likely_acted_upon      7.22
purchase_involvement                  5.94
purchase_involvement_type            30.61
outside_expenditures_budget          30.61
project_type                          7.64
programming_languages                 7.79
os                                    7.93
job_satisfaction                      9.24
annual_compensation                  15.93
techn_related_purchases_last_year    10.95
stackoverflow_sites_most_visited     10.49
year                                  0.00
dtype: float64

### Missing values per row

In [16]:
# Threshold expressed as a fraction (0-1) of a row's columns that are missing.
missing_values_percentage = 0.6
survey_2011 = dfs[2011]
# Share of missing cells in each 2011 response.
missing_fraction_per_row = survey_2011.isna().sum(axis=1) / survey_2011.shape[1]
# Keep only the responses whose missing share exceeds the threshold.
df_missing_gt_percentage = survey_2011[missing_fraction_per_row > missing_values_percentage]

In [17]:
# Percentage of missing columns for each row of `df_missing_gt_percentage`:
# 100 minus the share of non-null cells. `count(axis=1)` counts the non-null
# cells of every row in a single vectorised call, so no Python-level function
# is invoked per row and an empty selection still yields an empty Series.
missing_percentage_per_row = (
    100 - df_missing_gt_percentage.count(axis=1) / df_missing_gt_percentage.shape[1] * 100
)

In [18]:
# Distinct missing-value percentages found among the filtered rows.
missing_percentage_per_row.unique()

array([89.47368421, 68.42105263, 73.68421053, 84.21052632])