In [1]:
import pandas as pd
import urllib.request
import zipfile
from sklearn import base, pipeline
from feature_engine import encoding,imputation
from sklearn import model_selection

In [2]:
def read_zip(zip_filename:str='kaggle-survey-2018.zip', member_name:str="multipleChoiceResponses.csv", **read_csv_kwargs)->pd.DataFrame:
    '''
    Load one CSV member of a zip archive into a DataFrame, minus its first parsed row.

    Parameters
    ----------
    zip_filename : path of the zip archive to open.
    member_name : name of the CSV file stored inside the archive.
    **read_csv_kwargs : optional keyword arguments forwarded unchanged to
        ``pd.read_csv`` (for example ``dtype=...`` or ``low_memory=False``).

    Returns
    -------
    pd.DataFrame
        Every parsed row except the first one. The original index labels are
        kept, so the result starts at label 1.
        NOTE(review): the dropped row presumably holds the question text rather
        than a response - confirm against the data.
    '''
    with zipfile.ZipFile(zip_filename) as archive:
        # Open the member in its own `with` block so its handle is closed as
        # soon as parsing finishes instead of being left for the garbage collector.
        with archive.open(member_name) as member:
            kag = pd.read_csv(member, **read_csv_kwargs)
    return kag.iloc[1:]

In [3]:
# Relative path of the survey archive and the name of the CSV member to load from it.
kaggle_zip_path = r'../assets/XGBoost/kaggle-survey-2018.zip'
kaggle_member_name = r'multipleChoiceResponses.csv'

raw = read_zip(
    zip_filename=kaggle_zip_path,
    member_name=kaggle_member_name,
)

  kag = pd.read_csv(z.open(member_name))


In [4]:
# Preview ten randomly chosen rows. The sample is seeded (same seed as the
# train/test split below) so that re-running the notebook shows the same rows.
print("# Data frame")
print(raw.sample(10, random_state=42).to_string())
print('*'*200)

# Print pandas' structural summary of the frame.
raw.info()

# Data frame
      Time from Start to Finish (seconds)      Q1 Q1_OTHER_TEXT     Q2                        Q3                   Q4                                                            Q5                  Q6 Q6_OTHER_TEXT                                      Q7 Q7_OTHER_TEXT     Q8                                                            Q9                                                                                      Q10                                                              Q11_Part_1                                                                                       Q11_Part_2                                                                                                        Q11_Part_3                                                          Q11_Part_4                                                          Q11_Part_5                                                         Q11_Part_6 Q11_Part_7 Q11_OTHER_TEXT                                                  

In [5]:
def topn(ser:pd.Series,n:int=5,default:str='other'):
    '''
    Keep the n most frequent values of a Series and overwrite every other entry with `default`.
    '''
    most_common = ser.value_counts().index[:n]
    outside_top = ~ser.isin(most_common)
    return ser.mask(outside_top,default)

In [6]:
def tweak_kag(df_:pd.DataFrame)->pd.DataFrame:
    '''
    Derive the modelling columns from the raw survey frame.

    Reads Q2, Q3, Q4, Q5, Q8, Q9 and Q16_Part_1..3 from `df_` and returns a new
    frame holding exactly these columns, in this order:
    Q3, age, education, major, years_exp, compensation, python, r, sql.
    '''
    # Number substituted for each Q4 answer (presumably years of schooling);
    # "I prefer not to answer" is mapped to None.
    education_codes = {
        "Master’s degree": 18,
        "Bachelor’s degree":16,
        "Doctoral degree":20,
        "Some college/university study without earning a bachelor’s degree":13,
        "Professional degree":19,
        "I prefer not to answer":None,
        "No formal education past high school":12
    }
    # Short labels for the majors that survive the top-3 cut.
    major_labels = {
        "Computer science (software engineering, etc.)":'cs',
        "Engineering (non-computer focused)":'eng',
        "Mathematics or statistics":'stat'
    }

    # Q2: first two characters, cast to int.
    age = df_.Q2.str.slice(start=0,stop=2).astype(dtype=int,errors='raise')

    education = df_.Q4.replace(education_codes)

    # Q5: keep the 3 most common answers (topn relabels the rest), then shorten them.
    major = topn(df_.Q5,n=3).replace(major_labels)

    # Q8: drop '+', keep the text before the first '-', cast to float.
    exp_parts = df_.Q8.str.replace('+','',regex=False).str.split('-',expand=True)
    years_exp = exp_parts.iloc[:,0].astype(float)

    # Q9: strip '+' and ',', rewrite '500000' as '500', turn the refusal answer
    # into '0', keep the text before the first '-', fill missing with 0,
    # cast to int and multiply by 1000.
    comp_text = df_.Q9.str.replace('+','',regex=False)
    comp_text = comp_text.str.replace(',','',regex=False)
    comp_text = comp_text.str.replace('500000','500',regex=False)
    comp_text = comp_text.str.replace("I do not wish to disclose my approximate yearly compensation",'0',regex=False)
    comp_low = comp_text.str.split('-',expand=True).iloc[:,0]
    compensation = comp_low.fillna(0).astype(dtype=int).mul(1_000)

    # Q16 parts: missing becomes 0, the language name becomes 1.
    python = df_.Q16_Part_1.fillna(0).replace('Python',1)
    r = df_.Q16_Part_2.fillna(0).replace('R',1)
    sql = df_.Q16_Part_3.fillna(0).replace('SQL',1)

    derived = df_.assign(
        age=age,
        education=education,
        major=major,
        years_exp=years_exp,
        compensation=compensation,
        python=python,
        r=r,
        sql=sql,
    )
    # Spaces in column names become underscores before the final selection.
    renamed = derived.rename(columns=lambda col:col.replace(' ','_') )
    return renamed.loc[:, ['Q3','age','education','major','years_exp','compensation','python','r','sql']]

In scikit-learn, a custom transformer is a class that subclasses `BaseEstimator` and `TransformerMixin`. To work as a transformer, the class must implement the `.fit` and `.transform` methods.

`.fit` learns any parameters from the training dataset and returns the class instance (`self`). Our transformer has nothing to learn, so it simply returns `self`.

`.transform` applies the transformation to the data, using any parameters learned in `.fit`. Here it simply reuses the logic of the `tweak_kag` function.

In [7]:
class TweakKagTransformer(base.BaseEstimator,base.TransformerMixin):
    '''
    Scikit-learn style transformer that applies `tweak_kag` to its input.

    Nothing is learned during fitting; the class exists so that `tweak_kag`
    can be used as a pipeline step.
    '''
    def __init__(self,ycol=None) -> None:
        # Stored as given; none of the methods defined here read it.
        self.ycol = ycol

    def fit(self,X,y=None):
        '''Learn nothing and return the instance itself.'''
        return self

    def transform(self,X):
        '''Return the result of `tweak_kag(X)`.'''
        return tweak_kag(X)

In [8]:
def get_rawX_y(df:pd.DataFrame,y_col:str,countries=("United States of America", "China", "India"),roles=("Data Scientist","Software Engineer")):
    '''
    Filter the survey rows and split them into features and target.

    Parameters
    ----------
    df : frame containing at least the columns Q3, Q6 and `y_col`.
    y_col : name of the column returned as the target.
    countries : Q3 values to keep (defaults to the three countries used in this notebook).
    roles : Q6 values to keep (defaults to the two job titles used in this notebook).

    Returns
    -------
    (X, y) : X is the filtered frame without `y_col`, y is the filtered `y_col`
        Series. Both keep the index labels of `df`.

    The row filter always uses Q3 and Q6, whatever `y_col` is. Rows whose Q3
    or Q6 is missing never match and are dropped.
    '''
    keep = df['Q3'].isin(list(countries)) & df['Q6'].isin(list(roles))
    # Named `subset` rather than `raw` so it does not shadow the notebook-level frame.
    subset = df[keep]
    return subset.drop(columns=[y_col]),subset[y_col]

In [9]:
# Preprocessing pipeline, applied in order:
#   tweak      - derive the modelling columns via tweak_kag
#   cat        - one-hot encode Q3 and major
#   num_impute - fill missing education / years_exp with the median
# NOTE(review): feature_engine documents drop_last as used only when
# top_categories is None, and the transformed frames keep a column for every
# category, so drop_last=True looks like a no-op here - confirm before relying on it.
kag_pl = pipeline.Pipeline(
    steps=[
        ('tweak',TweakKagTransformer()),
        (
            'cat',
            encoding.OneHotEncoder(
                top_categories=5,
                drop_last=True,
                variables=['Q3',"major"],
            ),
        ),
        (
            "num_impute",
            imputation.MeanMedianImputer(
                imputation_method='median',
                variables=['education','years_exp'],
            ),
        ),
    ]
)

print(kag_pl)

Pipeline(steps=[('tweak', TweakKagTransformer()),
                ('cat',
                 OneHotEncoder(drop_last=True, top_categories=5,
                               variables=['Q3', 'major'])),
                ('num_impute',
                 MeanMedianImputer(variables=['education', 'years_exp']))])


In [10]:
# Filter the raw rows and separate the target column (Q6) from the features.
kag_X , kag_y = get_rawX_y(df=raw,y_col='Q6')

# 70/30 split, stratified on the target and seeded.
(kag_X_train, kag_X_test,
 kag_y_train, kag_y_test) = model_selection.train_test_split(
    kag_X,
    kag_y,
    test_size=.3,
    random_state=42,
    stratify=kag_y,
)

# Fit the pipeline on the training rows only, then reuse the fitted pipeline on the test rows.
X_train = kag_pl.fit_transform(kag_X_train, kag_y_train)
X_test  = kag_pl.transform(kag_X_test)

In [11]:
# First ten transformed training rows, a separator line, then the matching targets.
print(
    X_train.head(10).to_string(),
    '*'*500,
    kag_y_train.head(10).to_string(),
    sep='\n',
)

       age  education  years_exp  compensation  python  r  sql  Q3_United States of America  Q3_India  Q3_China  major_cs  major_other  major_eng  major_stat
587     25       18.0        4.0             0       1  0    1                            0         1         0         0            1          0           0
3065    22       16.0        1.0         10000       1  0    0                            0         1         0         1            0          0           0
8435    22       18.0        1.0             0       1  0    0                            0         1         0         0            1          0           0
3110    40       20.0        3.0        125000       1  0    1                            1         0         0         0            1          0           0
16372   45       12.0        5.0        100000       1  0    1                            1         0         0         0            1          0           0
11453   55       18.0        4.0             0      

In [12]:
# First ten transformed test rows, a separator line, then the matching targets.
print(
    X_test.head(10).to_string(),
    '*'*500,
    kag_y_test.head(10).to_string(),
    sep='\n',
)

       age  education  years_exp  compensation  python  r  sql  Q3_United States of America  Q3_India  Q3_China  major_cs  major_other  major_eng  major_stat
7894    22       16.0        1.0             0       1  0    0                            0         1         0         1            0          0           0
10541   25       18.0        1.0         70000       1  1    0                            1         0         0         0            1          0           0
21353   35       18.0        2.0             0       0  0    0                            0         1         0         0            0          0           1
13879   25       18.0        1.0        100000       1  0    1                            1         0         0         1            0          0           0
21971   18       18.0        1.0             0       0  0    0                            0         1         0         1            0          0           0
8350    22       18.0        0.0             0      

## Exercises

In [13]:
# Summary statistics of the transformed training features, printed as plain text.
print(X_train.describe().to_string())

               age    education    years_exp   compensation       python            r          sql  Q3_United States of America     Q3_India     Q3_China     major_cs  major_other    major_eng   major_stat
count  2110.000000  2110.000000  2110.000000    2110.000000  2110.000000  2110.000000  2110.000000                  2110.000000  2110.000000  2110.000000  2110.000000  2110.000000  2110.000000  2110.000000
mean     28.520853    17.414218     3.367773   51343.601896     0.755450     0.296682     0.461611                     0.501422     0.412322     0.086256     0.449763     0.230332     0.203791     0.116114
std       8.651752     1.478785     4.867549   66557.874325     0.429922     0.456904     0.498642                     0.500117     0.492369     0.280808     0.497588     0.421145     0.402911     0.320437
min      18.000000    12.000000     0.000000       0.000000     0.000000     0.000000     0.000000                     0.000000     0.000000     0.000000     0.000000     0.000