In [56]:
import pandas as pd 

In [107]:
# Directory that holds the raw input file. It is joined to RAW_DATA by plain
# string concatenation when the data is loaded, so the trailing slash matters.
DATA_PATH = "../data/raw/"
# Directory the processed frame is written to (also joined by string concatenation).
EXPORT_PATH = "../data/processed/"
# File name of the raw survey CSV, relative to DATA_PATH.
RAW_DATA = "survey_results_public.csv"

In [58]:
# Read the raw survey export exactly as shipped.
raw_df = pd.read_csv(f"{DATA_PATH}{RAW_DATA}")
# All transformations happen on a copy so raw_df stays available for
# the before/after comparisons further down.
df = raw_df.copy()

# Convert features to numerical values

In [59]:
# Per-column replacement table in the nested form accepted by DataFrame.replace:
# {column name: {text answer: numeric stand-in}}. Both experience columns use
# the same two open-ended answers, so the inner mapping is built once per column.
CONVERTION_DICT = {
    column: {'Less than 1 year': 0, 'More than 50 years': 51}
    for column in ('YearsCode', 'YearsCodePro')
}

In [60]:

# Step 1: swap the open-ended text answers for their numeric stand-ins.
df = df.replace(CONVERTION_DICT)
# Step 2: replace() alone leaves every other answer as a string (e.g. '3'),
# which gives a mixed int/str object column. Finish the conversion so the
# columns are genuinely numeric; missing answers stay NaN. pd.to_numeric
# raises on any unexpected text instead of silently hiding it.
df = df.assign(**{
    column: pd.to_numeric(df[column])
    for column in CONVERTION_DICT
    if column in df.columns  # replace() ignores absent columns; do the same here
})


# Split columns 

In [61]:
def split_answers(df, delimiter = ';') :
    '''
    Turn delimiter-separated multi-answer columns into columns of lists.

    Every object-dtype column in which at least one string value contains
    `delimiter` is converted: each string becomes the list of its parts, and
    every other value in that column (missing answers included) becomes an
    empty list. Columns without any delimited value are left untouched.

    params :
        df : the pandas DataFrame that holds the survey answers. It is not modified.
        delimiter : the literal (non-regex) separator to split on. Default value ';'
    Return : a new DataFrame with the multi-answer columns holding lists
    Raises : ValueError if delimiter is not a non-empty string
    '''
    if not isinstance(delimiter, str) or not delimiter :
        raise ValueError("delimiter must be a non-empty string")

    def has_delimiter(value) :
        '''True when value is a string that contains the delimiter.'''
        return isinstance(value, str) and delimiter in value

    def to_list(value) :
        '''Split a string answer; anything else (e.g. NaN) becomes a fresh empty list.'''
        return value.split(delimiter) if isinstance(value, str) else []

    _df = df.copy()

    # Plain Python string operations are used instead of the .str accessor so
    # that the delimiter is always taken literally (never as a regex) and so
    # that object columns holding non-string values cannot raise.
    for column in _df.select_dtypes(include='object').columns :
        if _df[column].map(has_delimiter).any() :
            _df[column] = _df[column].map(to_list)

    return _df

    

In [62]:
# Build a new frame from df with the multi-answer columns split into lists;
# split_answers works on a copy, so df itself is left unchanged.
splitted_df = split_answers(df)

# Visual Verification 

In [67]:
# Spot-check one random respondent: raw answer string vs. the split list.
# `i` is an index *label* (it comes from .index), so it is looked up with .loc;
# .iloc would only give the same row while the index is the default 0..n-1 range.
i = df.sample(1).index[0]
print(raw_df.loc[i, 'LanguageHaveWorkedWith'])
print(splitted_df.loc[i, 'LanguageHaveWorkedWith'])

C;C++;HTML/CSS;JavaScript;Node.js;PHP
['C', 'C++', 'HTML/CSS', 'JavaScript', 'Node.js', 'PHP']


In [106]:
# Spot-check one random respondent: raw vs. processed professional experience.
# `i` is an index *label* (it comes from .index), so it is looked up with .loc;
# .iloc would only give the same row while the index is the default 0..n-1 range.
i = df.sample(1).index[0]
print(raw_df.loc[i, 'YearsCodePro'])
print(splitted_df.loc[i, 'YearsCodePro'])

3
3


# Export Data

In [109]:
# Persist the processed frame in pickle format.
# NOTE(review): the file written here is a pickle although its name ends in
# ".csv" -- it has to be read back with pd.read_pickle, not pd.read_csv.
# Consider renaming it to ".pkl" together with every reader of this file.
splitted_df.to_pickle(EXPORT_PATH + "preprocessed_survey_results_public.csv")