In [1]:
# Configuration: input/output locations and value-recoding rules
DATA_PATH = "../data/raw/survey_results_public.csv"
EXPORT_PATH = "../data/processed/1_preprocessed_data.pkl"

# Columns whose textual range answers are mapped to numbers via REPLACE_DICT
REPLACE_COLS = ["YearsCode", "YearsCodePro"]
REPLACE_DICT = {"Less than 1 year": 0, "More than 50 years": 51}

In [2]:
# packages
import pandas as pd 
import numpy as np
import logging
import pickle

# Functions

In [3]:
def Replace_nulls(pd_series,replacement=None):
    '''
        Aim:
        Replace null values with a specific replacement
        
        Parameters:
        * pd_series (pd.Series): Series that may contain null values.
          It is not modified; the replacement is done on a copy.
        * replacement (object): Value put in place of each null.
          Defaults to None, which stands for an empty list.
        
        Return: (pd.Series): Copy of the series after replacing nulls.
        Each replaced position gets its own shallow copy of `replacement`,
        so editing one replaced cell never affects another one.
    '''
    from copy import copy

    # A literal `[]` default would be one list shared by every call
    if replacement is None:
        replacement = []

    # Work on a copy so the caller's series is not changed as a side effect
    filled = pd_series.copy()
    nulls = filled.isnull()
    if nulls.any():
        # Copy per element: otherwise all null cells would reference one object
        filled.loc[nulls] = filled.loc[nulls].apply(lambda _: copy(replacement))
    return filled

In [4]:
def split_multianswers(answers,delimit=';'):
    '''
        Aim:
        Split multiple answers in a single string 
        to a list of strings each representing a single answer
        
        Parameters:
        * answers (pd.Series): String series with answers 
        * delimit (string): Separator between single answers. Defaults to ";".
          It is always matched literally, never as a regular expression.
        
        Return: (pd.Series):
        * The original series, unchanged, if no value contains the delimiter.
        * Otherwise a series where each string is split into a list of
          strings and null values are replaced with empty lists.
    
    '''
    import re

    # Check if multiple answers exist - if none: return original.
    # regex=False keeps delimiters such as "|" or "." literal;
    # na=False makes null answers count as "not splittable".
    splittable_answers=answers.str.contains(delimit,regex=False,na=False)
    if not splittable_answers.any():
        return answers
    
    # Split each multiple answer to list of single answers.
    # The delimiter is escaped because pandas treats patterns longer than
    # one character as regular expressions.
    splitted_answers=answers.str.split(re.escape(delimit))
    
    # Replace NAs with empty lists 
    modified_values=Replace_nulls(splitted_answers,[])
    
    return modified_values
    

##### ________________________________________________________________________________________________________________________

# Preprocessing

In [5]:
# Load the raw survey CSV; work on a copy so `raw_data` stays untouched
raw_data = pd.read_csv(filepath_or_buffer=DATA_PATH)
data = raw_data.copy(deep=True)

## Replace values

In [6]:
# Map the textual range answers to numbers, then parse the column as float32
for col in REPLACE_COLS:
    data[col] = (
        data[col]
        .replace(REPLACE_DICT)
        .astype(np.float32)
    )

## Split multiple values

In [7]:
# Run the multi-answer splitter over every object-typed column, one at a time
object_cols = list(data.select_dtypes(include='object').columns)
for col in object_cols:
    data[col] = split_multianswers(data[col])

##### ________________________________________________________________________________________________________________________

# Visually verify results

In [8]:
# `sample(1).index[0]` is an index *label*, so the row is looked up with
# `.loc` (label-based) rather than `.iloc` (position-based)
idx=data.sample(1).index[0]

print(raw_data['LanguageWantToWorkWith'].loc[idx])
print(data['LanguageWantToWorkWith'].loc[idx])

Dart;Go;Kotlin;Node.js
['Dart', 'Go', 'Kotlin', 'Node.js']


In [9]:
# `sample(1).index[0]` is an index *label*, so the row is looked up with
# `.loc` (label-based) rather than `.iloc` (position-based)
idx=data.sample(1).index[0]

print(raw_data['DevType'].loc[idx])
print(data['DevType'].loc[idx])

Developer, full-stack;Developer, back-end
['Developer, full-stack', 'Developer, back-end']


In [12]:
# `sample(1).index[0]` is an index *label*, so the row is looked up with
# `.loc` (label-based) rather than `.iloc` (position-based)
idx=data.sample(1).index[0]

print(raw_data['YearsCode'].loc[idx])
print(data['YearsCode'].loc[idx])

Less than 1 year
0.0


In [13]:
# Rows answered 'Less than 1 year': raw text first, then the recoded values
print(raw_data.loc[raw_data['YearsCode'].eq('Less than 1 year'), 'YearsCode'])
print('_' * 100)
print(data.loc[raw_data['YearsCode'].eq('Less than 1 year'), 'YearsCode'])


40       Less than 1 year
310      Less than 1 year
445      Less than 1 year
531      Less than 1 year
623      Less than 1 year
               ...       
83142    Less than 1 year
83172    Less than 1 year
83236    Less than 1 year
83237    Less than 1 year
83351    Less than 1 year
Name: YearsCode, Length: 1489, dtype: object
____________________________________________________________________________________________________
40       0.0
310      0.0
445      0.0
531      0.0
623      0.0
        ... 
83142    0.0
83172    0.0
83236    0.0
83237    0.0
83351    0.0
Name: YearsCode, Length: 1489, dtype: float32


##### ________________________________________________________________________________________________________________________

# Export data

In [14]:
# Persist the preprocessed frame to EXPORT_PATH
data.to_pickle(path=EXPORT_PATH)

In [15]:
# Reload the exported pickle and preview its first rows as a round-trip check
pk = pd.read_pickle(EXPORT_PATH)
pk.head(n=5)

Unnamed: 0,ResponseId,MainBranch,Employment,Country,US_State,UK_Country,EdLevel,Age1stCode,LearnCode,YearsCode,...,Age,Gender,Trans,Sexuality,Ethnicity,Accessibility,MentalHealth,SurveyLength,SurveyEase,ConvertedCompYearly
0,1,I am a developer by profession,"Independent contractor, freelancer, or self-em...",Slovakia,,,"Secondary school (e.g. American high school, G...",18 - 24 years,"[Coding Bootcamp, Other online resources (ex: ...",,...,25-34 years old,[Man],No,[Straight / Heterosexual],[White or of European descent],[None of the above],[None of the above],Appropriate in length,Easy,62268.0
1,2,I am a student who is learning to code,"Student, full-time",Netherlands,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",11 - 17 years,"[Other online resources (ex: videos, blogs, et...",7.0,...,18-24 years old,[Man],No,[Straight / Heterosexual],[White or of European descent],[None of the above],[None of the above],Appropriate in length,Easy,
2,3,"I am not primarily a developer, but I write co...","Student, full-time",Russian Federation,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",11 - 17 years,"[Other online resources (ex: videos, blogs, et...",,...,18-24 years old,[Man],No,[Prefer not to say],[Prefer not to say],[None of the above],[None of the above],Appropriate in length,Easy,
3,4,I am a developer by profession,Employed full-time,Austria,,,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",11 - 17 years,[],,...,35-44 years old,[Man],No,[Straight / Heterosexual],[White or of European descent],[I am deaf / hard of hearing],[],Appropriate in length,Neither easy nor difficult,
4,5,I am a developer by profession,"Independent contractor, freelancer, or self-em...",United Kingdom of Great Britain and Northern I...,,England,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",5 - 10 years,[Friend or family member],17.0,...,25-34 years old,[Man],No,[],[White or of European descent],[None of the above],[],Appropriate in length,Easy,
