In [168]:
#import libraries
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

pd.options.display.max_rows = 1000

In [169]:
# input: raw survey results CSV that this notebook reads
DATA_PATH = '../data/raw/survey_results_public.csv'
# output: pickle file the processed dataset is written to at the end
EXPORT_PATH = '../data/processed/1_processed.pkl'

In [24]:
# load the raw survey CSV into a DataFrame
dataset = pd.read_csv(DATA_PATH)

In [26]:
# dimensions of the loaded data as (rows, columns)
dataset.shape

(64461, 61)

In [57]:
# look at one randomly drawn respondent; no random_state is passed, so the row differs on every run
dataset.sample(1).iloc[0]

Respondent                                                                  15880
MainBranch                      I am not primarily a developer, but I write co...
Hobbyist                                                                      Yes
Age                                                                          34.0
Age1stCode                                                                     18
CompFreq                                                                  Monthly
CompTotal                                                                  9600.0
ConvertedComp                                                             26388.0
Country                                                                    Brazil
CurrencyDesc                                                       Brazilian real
CurrencySymbol                                                                BRL
DatabaseDesireNextYear                                                        NaN
DatabaseWorkedWi

In [58]:
# per-column overview: dtype and non-null count
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64461 entries, 0 to 64460
Data columns (total 61 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Respondent                    64461 non-null  int64  
 1   MainBranch                    64162 non-null  object 
 2   Hobbyist                      64416 non-null  object 
 3   Age                           45446 non-null  float64
 4   Age1stCode                    57900 non-null  object 
 5   CompFreq                      40069 non-null  object 
 6   CompTotal                     34826 non-null  float64
 7   ConvertedComp                 34756 non-null  float64
 8   Country                       64072 non-null  object 
 9   CurrencyDesc                  45472 non-null  object 
 10  CurrencySymbol                45472 non-null  object 
 11  DatabaseDesireNextYear        44070 non-null  object 
 12  DatabaseWorkedWith            49537 non-null  object 
 13  D

In [61]:
# inspect the distinct raw values of YearsCodePro before converting it to numbers
dataset['YearsCodePro'].unique()

array(['27', '4', nan, '8', '13', '2', '7', '20', '1', '23', '3', '12',
       '17', '18', '10', '14', '29', '6', '28', '9', '15', '11', '16',
       '25', 'Less than 1 year', '5', '21', '19', '35', '24', '32', '22',
       '30', '38', '26', '40', '33', '31', 'More than 50 years', '34',
       '36', '39', '37', '41', '45', '47', '42', '46', '50', '43', '44',
       '48', '49'], dtype=object)

In [62]:
# inspect the distinct raw values of YearsCode before converting it to numbers
dataset['YearsCode'].unique()

array(['36', '7', '4', '15', '6', '17', '8', '10', '35', '5', '37', '19',
       '9', '22', '30', '23', '20', '2', 'Less than 1 year', '3', '13',
       '25', '16', '43', '11', '38', '33', nan, '24', '21', '12', '40',
       '27', '50', '46', '14', '18', '28', '32', '44', '26', '42', '31',
       '34', '29', '1', '39', '41', '45', 'More than 50 years', '47',
       '49', '48'], dtype=object)

In [63]:
# inspect the distinct raw values of Age1stCode before converting it to numbers
dataset['Age1stCode'].unique()

array(['13', '19', '15', '18', '16', '14', '12', '20', '42', '8', '25',
       '22', '30', '17', '21', '10', '46', '9', '7', '11', '6', nan, '31',
       '29', '5', 'Younger than 5 years', '28', '38', '23', '27', '41',
       '24', '53', '26', '35', '32', '40', '33', '36', '54', '48', '56',
       '45', '44', '34', 'Older than 85', '39', '51', '68', '50', '37',
       '47', '43', '52', '85', '64', '55', '58', '49', '76', '72', '73',
       '83', '63'], dtype=object)

In [66]:
# Textual answers that stand in for a numeric range, mapped to a number so the
# affected columns can be cast to float afterwards.
_YEARS_REPLACEMENTS = {'Less than 1 year': 0, 'More than 50 years': 51}
_AGE_REPLACEMENTS = {'Younger than 5 years': 4, 'Older than 85': 86}

# column name -> {text answer: numeric replacement}
REPLACE_DICT = {
    'YearsCodePro': _YEARS_REPLACEMENTS.copy(),
    'YearsCode': _YEARS_REPLACEMENTS.copy(),
    'Age1stCode': _AGE_REPLACEMENTS,
}

In [67]:
# swap the textual answers for their numeric stand-ins, then cast each column to float32
for col in REPLACE_DICT:
    replacement = REPLACE_DICT[col]
    numeric_answers = dataset[col].replace(replacement)
    dataset[col] = numeric_answers.astype('float32')

In [163]:
#split multiple answers into single answer using delimiter(';')
def split_answers(df_series, delimiter=';'):
    """
    Split multi-answer strings into lists of single answers.

    If no value in the series contains ``delimiter`` the series is returned
    unchanged. Otherwise every string is split on ``delimiter`` (a string
    without the delimiter becomes a one-element list) and every value that
    could not be split (missing or not a string) becomes an empty list.

    parameters:
     * df_series(pd.Series) series of answers, possibly several per cell
     * delimiter(string) literal separator between answers, default ';'

    returns:
     * pd.Series: ``df_series`` itself when nothing is splittable, otherwise
       a new series of lists with the same index
    """
    # regex=False: the delimiter is literal text, so separators such as '|'
    # or '.' are not interpreted as regular-expression syntax.
    # na=False: missing values count as "not splittable", which keeps the
    # mask strictly boolean.
    has_delimiter = df_series.str.contains(delimiter, regex=False, na=False)

    if not has_delimiter.any():
        return df_series

    # split on the same literal delimiter that was searched for above
    split_series = df_series.str.split(delimiter, regex=False)

    # str.split leaves NaN where the value was missing or not a string; use
    # an empty list there so that every cell of the result holds a list
    return split_series.apply(
        lambda answers: answers if isinstance(answers, list) else []
    )
    

In [165]:
# every column still stored as object dtype is a candidate for multi-answer splitting
text_columns = dataset.select_dtypes(include='object')
object_cols = list(text_columns.columns)
for col in object_cols:
    answers = dataset[col]
    dataset[col] = split_answers(answers)

In [167]:
# spot-check one randomly drawn respondent after the transformations (row differs on every run)
dataset.sample(1).iloc[0]

Respondent                                                                     65
MainBranch                                         I am a developer by profession
Hobbyist                                                                      Yes
Age                                                                          34.0
Age1stCode                                                                   13.0
CompFreq                                                                  Monthly
CompTotal                                                                 98000.0
ConvertedComp                                                           1176000.0
Country                                                             United States
CurrencyDesc                                                 United States dollar
CurrencySymbol                                                                USD
DatabaseDesireNextYear                          [Couchbase, Elasticsearch, Redis]
DatabaseWorkedWi

In [170]:
# create the output folder if it is missing so the export also works on a fresh checkout
Path(EXPORT_PATH).parent.mkdir(parents=True, exist_ok=True)
# write the processed DataFrame to the export path as a pickle file
dataset.to_pickle(EXPORT_PATH)