In [1]:
# Constants
# Input file read by this notebook
# (alternative raw input, currently unused: "../data/raw/survey_results_public_2022.csv")
DATA_PATH = "../data/processed/0_preprocessed_df.pkl"
# File the processed frame is written to at the end of the notebook
EXPORT_PATH = "../data/processed/01_preprocessed_df.pkl"

# Per-column mapping of textual boundary answers to numeric stand-ins;
# both year columns use the same two replacements.
REPLACE_DICT = {
    column: {'Less than 1 year': 0, 'More than 50 years': 51}
    for column in ('YearsCodePro', 'YearsCode')
}


In [2]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle

___________

## Functions

In [3]:
def split_answers(data_series, delimiter=";"):
    """ 
    Split multiple answers in a single string 
    to a list of single strings each representing a single answer 

    The delimiter is always matched as literal text, never as a regular
    expression, so characters such as "|" or "." are safe to use.

    Parameters:
    * data_series (pd.Series): String series with answers 
    * delimiter (string): Separator between the answers inside one string 
                          Defaults to ";"

    Returns: (pd.Series) 
    The original series, unchanged, when no value contains the delimiter;
    otherwise a new series in which every string is replaced by the list of
    its parts and every non-string value (e.g. NA) by an empty list.
    """

    # Literal (non-regex) search: a delimiter like "|" or "." read as a regex
    # would match every string and defeat the early return below.
    # na=False keeps the mask strictly boolean when values are missing.
    has_delimiter = data_series.str.contains(delimiter, regex=False, na=False)

    # Check if multiple answers exist - if none: return original 
    if not has_delimiter.any():
        return data_series

    # Else, split each string to a list; anything that is not a string
    # (missing values) becomes an empty list
    return data_series.map(
        lambda answer: answer.split(delimiter) if isinstance(answer, str) else []
    )

___________

## Processing 

In [4]:
# Load the input frame from DATA_PATH and work on a separate copy,
# so `df` keeps the values as loaded for later comparison.
# NOTE: read_pickle can execute arbitrary code - only load files you trust.
df = pd.read_pickle(DATA_PATH)
df1 = df.copy(deep=True)

In [5]:
# Preview the first three rows of the working copy
df1.head(3)

Unnamed: 0,Employment,RemoteWork,EdLevel,YearsCode,YearsCodePro,DevType,Country,CompTotal,LanguageHaveWorkedWith,LanguageWantToWorkWith,...,ToolsTechHaveWorkedWith,ToolsTechWantToWorkWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsWantToWorkWith,OpSysProfessional use,OpSysPersonal use,VersionControlSystem,VCInteraction,Gender,WorkExp
0,,,,,,,,,,,...,,,,,,,,,,
1,"Employed, full-time",Fully remote,,,,,Canada,,JavaScript;TypeScript,Rust;TypeScript,...,,,,,macOS,Windows Subsystem for Linux (WSL),Git,,,
2,"Employed, full-time","Hybrid (some remote, some in-person)","Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",14.0,5.0,Data scientist or machine learning specialist;...,United Kingdom of Great Britain and Northern I...,32000.0,C#;C++;HTML/CSS;JavaScript;Python,C#;C++;HTML/CSS;JavaScript;TypeScript,...,,,Notepad++;Visual Studio,Notepad++;Visual Studio,Windows,Windows,Git,Code editor,Man,


In [6]:
# Replace the textual answers listed in REPLACE_DICT, one column at a time
for column in REPLACE_DICT:
    value_map = REPLACE_DICT[column]
    df1[column] = df1[column].replace(value_map)

In [7]:
# Split multi-answer strings into lists for every object-dtype column
cat_cols = df1.select_dtypes(include="object")
for column_name in cat_cols.columns:
    df1[column_name] = split_answers(df1[column_name])

In [8]:
# Preview the first rows of the working copy at this point in the notebook
df1.head()

Unnamed: 0,Employment,RemoteWork,EdLevel,YearsCode,YearsCodePro,DevType,Country,CompTotal,LanguageHaveWorkedWith,LanguageWantToWorkWith,...,ToolsTechHaveWorkedWith,ToolsTechWantToWorkWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsWantToWorkWith,OpSysProfessional use,OpSysPersonal use,VersionControlSystem,VCInteraction,Gender,WorkExp
0,[],,,,,[],,,[],[],...,[],[],[],[],[],[],[],[],[],
1,"[Employed, full-time]",Fully remote,,,,[],Canada,,"[JavaScript, TypeScript]","[Rust, TypeScript]",...,[],[],[],[],[macOS],[Windows Subsystem for Linux (WSL)],[Git],[],[],
2,"[Employed, full-time]","Hybrid (some remote, some in-person)","Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",14.0,5.0,[Data scientist or machine learning specialist...,United Kingdom of Great Britain and Northern I...,32000.0,"[C#, C++, HTML/CSS, JavaScript, Python]","[C#, C++, HTML/CSS, JavaScript, TypeScript]",...,[],[],"[Notepad++, Visual Studio]","[Notepad++, Visual Studio]",[Windows],[Windows],[Git],[Code editor],[Man],
3,"[Employed, full-time]",Fully remote,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",20.0,17.0,"[Developer, full-stack]",Israel,60000.0,"[C#, JavaScript, SQL, TypeScript]","[C#, SQL, TypeScript]",...,[],[],"[Notepad++, Visual Studio, Visual Studio Code]","[Notepad++, Visual Studio, Visual Studio Code]",[Windows],[Windows],[Git],"[Code editor, Command-line, Version control ho...",[Man],
4,"[Employed, full-time]","Hybrid (some remote, some in-person)","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",8.0,3.0,"[Developer, front-end, Developer, full-stack, ...",United States of America,,"[C#, HTML/CSS, JavaScript, SQL, Swift, TypeScr...","[C#, Elixir, F#, Go, JavaScript, Rust, TypeScr...",...,[npm],"[Docker, Kubernetes]","[Notepad++, Visual Studio, Visual Studio Code,...","[Rider, Visual Studio, Visual Studio Code]",[Windows],"[macOS, Windows]","[Git, Other (please specify):]",[Code editor],[],


___________

## Visually verify results 

In [9]:
# Compare one randomly sampled row before (df) and after (df1) processing.
# `i` is an index *label*, so it is looked up with .loc; positional .iloc
# would only hit the same row when the index is a default 0..n-1 range.
i = df.sample(1).index[0]
print(df.loc[i, 'LanguageHaveWorkedWith'])
print(df1.loc[i, 'LanguageHaveWorkedWith'])

Bash/Shell;HTML/CSS;Lua
['Bash/Shell', 'HTML/CSS', 'Lua']


In [10]:
# Compare one randomly sampled row before (df) and after (df1) processing.
# `i` is an index *label*, so it is looked up with .loc; positional .iloc
# would only hit the same row when the index is a default 0..n-1 range.
i = df.sample(1).index[0]
print(df.loc[i, 'DevType'])
print(df1.loc[i, 'DevType'])

Developer, full-stack;System administrator
['Developer, full-stack', 'System administrator']


In [11]:
# Compare one randomly sampled row before (df) and after (df1) processing.
# `i` is an index *label*, so it is looked up with .loc; positional .iloc
# would only hit the same row when the index is a default 0..n-1 range.
i = df.sample(1).index[0]
print(df.loc[i, 'EdLevel'])
print(df1.loc[i, 'EdLevel'])

Bachelor’s degree (B.A., B.S., B.Eng., etc.)
Bachelor’s degree (B.A., B.S., B.Eng., etc.)


In [12]:
# Compare one randomly sampled row before (df) and after (df1) processing.
# `i` is an index *label*, so it is looked up with .loc; positional .iloc
# would only hit the same row when the index is a default 0..n-1 range.
i = df.sample(1).index[0]
print(df.loc[i, 'YearsCodePro'])
print(df1.loc[i, 'YearsCodePro'])

1
1


## Export Data 

In [13]:
# Write the processed working copy to EXPORT_PATH as a pickle file
df1.to_pickle(EXPORT_PATH)