In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pprint import pprint

from scripts.dataset_explorer import SODataSetExplorer

### Datasets load

In [2]:
# Build the project's dataset explorer. Later cells reach the per-year survey
# frames through `df_explorer.datasets[<year>]`.
# NOTE(review): presumably the constructor loads every yearly dataset — confirm
# in scripts/dataset_explorer.py.
df_explorer = SODataSetExplorer()

### Comparing columns across years

In [6]:
# Print, for each later survey year, the 2011 columns that are absent from it.
# The trailing `;` keeps the notebook from echoing the call's return value.
df_explorer.dataset_cols_difference(2011);

2011 columns not present in 2012
{'programming_languages',
 'programming_languages_oher',
 'recommendation_likely_acted_upon',
 'stackoverflow_sites_most_visited'}


2011 columns not present in 2013
{'programming_languages_oher',
 'project_type',
 'recommendation_likely_acted_upon',
 'stackoverflow_sites_most_visited'}


2011 columns not present in 2014
{'company_size',
 'job_satisfaction',
 'programming_languages_oher',
 'project_type',
 'recommendation_likely_acted_upon',
 'stackoverflow_sites_most_visited',
 'techn_related_purchases_last_year'}


2011 columns not present in 2015
{'annual_compensation',
 'company_size',
 'outside_expenditures_budget',
 'programming_languages',
 'programming_languages_oher',
 'project_type',
 'purchase_involvement',
 'purchase_involvement_type',
 'recommendation_likely_acted_upon',
 'stackoverflow_sites_most_visited',
 'tech_products_own',
 'tech_products_own_other',
 'techn_related_purchases_last_year',
 'usa_state'}


2011 columns not present in 201

### Column names similarity
Finding similar column names across years, based on some basic descriptors such as:
* `data`, `age`, `year`, `gender`, `experience`, `time`, `want`, `lang`

In [5]:
# Print, per survey year, the set of column names related to the 'language'
# descriptor (the matching rule lives in SODataSetExplorer.similar_columns).
# The trailing `;` keeps the notebook from echoing the call's return value.
df_explorer.similar_columns(['language']);

2011
{'programming_languages', 'programming_languages_oher'}


2012
{'programming_language_other', 'programming_language'}


2013
{'programming_languages', 'programming_languages_other'}


2014
{'programming_languages', 'programming_languages_other'}


2015
{'programming_language',
 'programming_language_other',
 'want_work_language',
 'want_work_language_other'}


2016
set()


2017
{'want_work_language', 'have_worked_language'}


2018
{'language_worked_with', 'language_desire_next_year'}


2019
{'language_worked_with', 'language_desire_next_year'}


2020
{'language_worked_with', 'language_desire_next_year'}




### Dataset head exploration

In [7]:
# Peek at the first row of the 2017 survey to see its columns and value formats.
df_explorer.datasets[2017].head(1)

Unnamed: 0,respondent,professional,program_hobby,country,university,employment_status,formal_education,major_undergrad,home_remote,company_size,...,gender,highest_education_parents,race,survey_long,questions_interesting,questions_confusing,interested_answers,salary,expected_salary,year
0,1,Student,"Yes, both",United States,No,"Not employed, and not looking for work",Secondary school,,,,...,Male,High school,White or of European descent,Strongly disagree,Strongly agree,Disagree,Strongly agree,,,2017


### Merging and transforming common columns

* The 2018 occupation is expressed in both the student and the employment questions, so let's merge the two

In [8]:
# Relabel the 2018 `student` answers so they stay self-explanatory once they are
# merged with the employment answer.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `datasets[2018].student`: that form is a chained
# in-place update on a column pulled out of the frame, which pandas does not
# guarantee to write through to the parent DataFrame (it is a no-op under
# Copy-on-Write and emits a warning on recent pandas versions).
df_explorer.datasets[2018]['student'] = df_explorer.datasets[2018]['student'].replace(
    ['Yes, part-time', 'Yes, full-time', 'No'],
    ['student part-time', 'student full-time', 'Not student'],
)

In [9]:
# Combine the 2018 student and employment answers into a single ';'-separated
# `occupation` value, matching the multi-answer format of the other columns.
survey_2018 = df_explorer.datasets[2018]
survey_2018['occupation'] = survey_2018['student'] + ';' + survey_2018['employment']

* 2017 remote status related questions `collaborate_remote, assess_job_remote, home_remote`.  

  *home_remote* feature captures the time spent in remote work

In [10]:
# Show the distinct answers of each remote-related question in the 2017 survey,
# in the same order as they are listed above.
for remote_question in ('collaborate_remote', 'assess_job_remote', 'home_remote'):
    display(df_explorer.datasets[2017][remote_question].unique())

array([nan, 'Strongly disagree', 'Somewhat agree', 'Agree', 'Disagree',
       'Strongly agree'], dtype=object)

array(['Very important', nan, 'Somewhat important',
       'Not at all important', 'Important', 'Not very important'],
      dtype=object)

array([nan, 'More than half, but not all, the time',
       'Less than half the time, but at least one day each week', 'Never',
       "All or almost all the time (I'm full-time remote)",
       "It's complicated", 'A few days each month', 'About half the time'],
      dtype=object)

### Datasets columns renaming
After exploring the common names, here is the mapping that was chosen.  
*- **Note**: this mapping is applied on top of the results of the initial OpenRefine operations.*

In [11]:
# The rename mapping is kept in its own project module rather than inline.
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top so all dependencies are visible in one place.
from scripts.column_rename import column_rename_mapping

# Apply the mapping through the explorer so that later cells can refer to the
# questions by one shared set of column names.
# NOTE(review): presumably this renames the per-year frames in place — confirm
# in SODataSetExplorer.rename_columns.
df_explorer.rename_columns(column_rename_mapping)

### Questions over time
After exploring the common questions that persisted over time, here's the final list of questions that will be used for the analysis:

`P: present`  
`NP: not present`  
`AP: always present`

* `programming_languages`: AP
* `country`: AP
* `years_experience`: AP
* `occupation`: AP


* `age`: AP but 2017
* `os`: AP but 2017
* `job_satisfaction`: AP but 2014
* `company_size`: AP but 2014, 2015


* `want_work_language`: P since 2013
* `gender`: P since 2014
* `education`: P since 2015
* `database_desire_work`: P since 2017
* `database_worked_with`: P since 2017
* `undergrad_major`: P since 2018


* `remote_status`: P 2014-2017, 2019


* `industry`: NP since 2017

### Missing values per row per dataset
Feature column tracking the percentage of missing values per row in each year's dataset.

In [12]:
# Add to every yearly dataset a column holding, for each respondent (row), the
# percentage of that year's questions (columns) left unanswered, rounded to 2 decimals.
MISSING_PCT_COLUMN = 'original_missing_values_per_row_percentage'

for df in df_explorer.datasets.values():
    # Leave the tracking column itself out of the computation: otherwise re-running
    # this cell would count it as one more (always answered) question and silently
    # change every percentage.
    survey_answers = df.drop(columns=MISSING_PCT_COLUMN, errors='ignore')
    # `count(axis=1)` is the vectorised number of non-null values per row; it
    # replaces the much slower row-wise `apply(lambda x: x.count(), axis=1)`.
    presence_values_percentage = survey_answers.count(axis=1) / survey_answers.shape[1] * 100
    df[MISSING_PCT_COLUMN] = (100 - presence_values_percentage).round(2)

In [13]:
# How many 2011 respondents fall on each missing-values percentage, ordered by
# percentage so the distribution reads from complete to sparse rows.
(
    df_explorer.datasets[2011]['original_missing_values_per_row_percentage']
    .value_counts()
    .sort_index()
    .to_frame()
)

Unnamed: 0,original_missing_values_per_row_percentage
0.0,13
4.55,243
9.09,760
13.64,839
18.18,314
22.73,303
27.27,62
31.82,23
36.36,21
40.91,9


### Merging datasets common questions

In [15]:
# Columns kept in the merged dataset, grouped by how consistently the question
# appears across survey years. The order here is the column order of the export.
features = [
    # bookkeeping
    'year',
    'original_missing_values_per_row_percentage',
    # asked every year
    'programming_languages',
    'programming_languages_other',
    'country',
    'years_experience',
    'occupation',
    # asked every year except one or two
    'age',
    'os',
    'job_satisfaction',
    'company_size',
    # introduced in later surveys
    'want_work_language',
    'gender',
    'education',
    'database_desire_work',
    'database_worked_with',
    'undergrad_major',
    # asked only in some years
    'remote_status',
    # dropped in later surveys
    'industry',
]

In [16]:
# Stack all yearly surveys into one frame restricted to the common questions.
# Each yearly frame is first narrowed to the feature columns it actually has
# (`filter(items=...)` ignores names that are absent), so the concatenation never
# materialises the union of every column of every year. The final `[features]`
# selection fixes the column order and still raises a KeyError if a feature is
# missing from all years.
concat_df = pd.concat(
    [df.filter(items=features) for df in df_explorer.datasets.values()],
    ignore_index=True,
)[features]

In [17]:
# Persist the merged common-questions frame to the working directory; the
# positional row index carries no information, so it is left out of the file.
concat_df.to_csv('survey_report_concat_common_questions.csv', index=False)