In [71]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Silence every warning category for the rest of the session. Note that this also
# hides pandas warnings such as SettingWithCopyWarning and deprecation notices.
warnings.filterwarnings('ignore')

In [2]:
# Read the raw survey export. low_memory=False makes pandas parse the file in one
# pass instead of in chunks, so each column gets a single inferred dtype.
data = pd.read_csv(
    'data/survey_results_public.csv',
    low_memory=False,
)

# Show (rows, columns) of the loaded frame as the cell output.
data.shape

(98855, 129)

In [3]:
# Survey columns whose cells hold several answers joined by ';'.
# Each of these is expanded into one indicator column per answer.
MULTIPLE_CHOICE = [
    'CommunicationTools',
    'EducationTypes',
    'SelfTaughtTypes',
    'HackathonReasons',
    'DatabaseWorkedWith',
    'DatabaseDesireNextYear',
    'PlatformWorkedWith',
    'PlatformDesireNextYear',
    'Methodology',
    'VersionControl',
    'AdBlockerReasons',
    'AdsActions',
    'ErgonomicDevices',
    'Gender',
    'SexualOrientation',
    'RaceEthnicity',
    'LanguageWorkedWith',
]

# DevType values that select the respondents of interest (data-science roles).
DEV_TYPES = [
    'Data or business analyst',
    'Data scientist or machine learning specialist',
]

# Columns removed before modelling: they are not of interest, and keeping them
# would make predicting Python/R too easy.
DROP_COLUMNS = [
    'IDE',
    'FrameworkWorkedWith',
    'FrameworkDesireNextYear',
    'LanguageDesireNextYear',
    'DevType',
    'CurrencySymbol',
    'Salary',
    'SalaryType',
    'Respondent',
    'Currency',
]

In [130]:
# Keep only respondents whose DevType mentions one of the DEV_TYPES roles.
# DevType is a ';'-joined list, so a regex alternation over the role names is used;
# na=False treats respondents with a missing DevType as non-matching.
is_target_dev = data['DevType'].str.contains('|'.join(DEV_TYPES), na=False)

# Drop unnecessary columns. drop() returns a new frame, so `df` is an independent
# object rather than a view/slice of `data` that later gets mutated in place.
df = data.loc[is_target_dev].drop(columns=DROP_COLUMNS)

# One-hot encode every multiple-choice feature: one `<feature>_<answer>` column per
# distinct answer. str.get_dummies splits each cell on ';' and matches whole answers,
# so 'R' is not confused with 'Ruby'/'Rust', nor 'Java' with 'JavaScript' (a plain
# substring test would set those indicators too). Missing cells yield all-zero rows
# and empty answers produce no column. Indicators are stored as float 0.0/1.0.
# Indicator columns come out sorted alphabetically by answer within each feature.
indicator_frames = [
    df[col].str.get_dummies(sep=';').add_prefix(f'{col}_').astype(float)
    for col in MULTIPLE_CHOICE
]

# Replace the raw multiple-choice columns with their indicators in a single concat
# (avoids inserting columns one at a time).
df = pd.concat([df.drop(columns=MULTIPLE_CHOICE), *indicator_frames], axis=1)

# Create dummy features for the rest of features that weren't multiple choice;
# drop_first=True omits one level per feature to avoid redundant columns.
df = pd.get_dummies(df, drop_first=True)

In [134]:
# Boolean masks for the three groups of respondents, based on the R and Python
# indicator columns of LanguageWorkedWith.
r_only_idxs = (df.LanguageWorkedWith_R == 1) & (df.LanguageWorkedWith_Python == 0)
python_only_idxs = (df.LanguageWorkedWith_R == 0) & (df.LanguageWorkedWith_Python == 1)
r_and_python_idxs = (df.LanguageWorkedWith_R == 1) & (df.LanguageWorkedWith_Python == 1)

# Build the R_or_Python column from the masks above.
# NOTE(review): all three groups currently receive the same value (1), and
# respondents who use neither language are left as NaN. If the groups are meant to
# be distinguishable classes, give each assignment its own label - confirm intent.
df.loc[r_only_idxs, 'R_or_Python'] = 1
df.loc[python_only_idxs, 'R_or_Python'] = 1
df.loc[r_and_python_idxs, 'R_or_Python'] = 1

# Remove the two indicator columns that R_or_Python was derived from.
# `columns=` is required (drop() defaults to row labels, so the bare list raised
# KeyError), and the result must be assigned back because drop() is not in-place.
df = df.drop(columns=['LanguageWorkedWith_R', 'LanguageWorkedWith_Python'])

In [136]:
from sklearn.base import clone

In [138]:
# Scratch calculation: square root of 1760.
# NOTE(review): the meaning of 1760 is not shown in this notebook section -
# presumably a row or feature count; confirm and replace with a named value
# (e.g. derived from df.shape), or delete this cell.
np.sqrt(1760)

41.95235392680606